How to enable FasterTransformer to support dynamic batch and dynamic sequence length

Faster Transformer Operator

NVIDIA's open-source FasterTransformer code provides examples of compiling the custom operator and invoking it from Python for both TensorRT and TensorFlow (see FasterTransformer.py). However, the TensorFlow custom operator is inconvenient to use because its batch size and sequence length are fixed at graph-construction time. They can be made dynamic as follows:

  1. Modify bert_transformer_op.cc to remove the batch_size, from_seq_len, and to_seq_len attributes and pass them as input tensors instead. The changes are as follows:
   .Input("output_bias: T")
   .Input("output_layernorm_beta: T")
   .Input("output_layernorm_gamma: T")
+  .Input("batch_size: int32")
+  .Input("from_seq_len: int32")
   .Output("output: T")
   .Attr("T: {float, half}")
-  .Attr("batch_size: int >= 1")
-  .Attr("from_seq_len: int >= 1")
-  .Attr("to_seq_len: int >= 1")
+  //.Attr("batch_size: int >= 1")
+  //.Attr("from_seq_len: int >= 1")
+  //.Attr("to_seq_len: int >= 1")
   .Attr("head_num: int >= 1")
   .Attr("size_per_head: int >= 1")
   .SetShapeFn([](shape_inference::InferenceContext *c) {
       int batch_size, from_seq_len, to_seq_len, head_num, size_per_head;
-      c->GetAttr("batch_size", &batch_size);
-      c->GetAttr("from_seq_len", &from_seq_len);
-      c->GetAttr("to_seq_len", &to_seq_len);
+      //c->GetAttr("batch_size", &batch_size);
+      //c->GetAttr("from_seq_len", &from_seq_len);
+      //c->GetAttr("to_seq_len", &to_seq_len);
       c->GetAttr("head_num", &head_num);
       c->GetAttr("size_per_head", &size_per_head);
-      c->set_output(0, c->MakeShape({batch_size * from_seq_len, head_num * size_per_head}));
+      //c->set_output(0, c->MakeShape({batch_size * from_seq_len, head_num * size_per_head}));
+      c->set_output(0, c->input(0));
       return Status::OK();
       });
 template <typename Device, typename T>
@@ -70,14 +71,15 @@ class BertTransformerOp : public OpKernel
   public:
     explicit BertTransformerOp(OpKernelConstruction *context) : OpKernel(context)
     {
-      OP_REQUIRES_OK(context, context->GetAttr("batch_size", &batch_size_));
-      OP_REQUIRES_OK(context, context->GetAttr("from_seq_len", &from_seq_len_));
-      OP_REQUIRES_OK(context, context->GetAttr("to_seq_len", &to_seq_len_));
+      //OP_REQUIRES_OK(context, context->GetAttr("batch_size", &batch_size_));
+      //OP_REQUIRES_OK(context, context->GetAttr("from_seq_len", &from_seq_len_));
+      //OP_REQUIRES_OK(context, context->GetAttr("to_seq_len", &to_seq_len_));
       OP_REQUIRES_OK(context, context->GetAttr("head_num", &head_num_));
       OP_REQUIRES_OK(context, context->GetAttr("size_per_head", &size_per_head_));
 
-      OP_REQUIRES(context, (from_seq_len_ == to_seq_len_),
-          errors::InvalidArgument("Only support from_seq_len == to_seq_len"));
+      //printf("++++++++ %d =%d \n", from_seq_len_, to_seq_len_)
+      //OP_REQUIRES(context, (from_seq_len_ == to_seq_len_),
+      ///    errors::InvalidArgument("Only support from_seq_len == to_seq_len"));
 
       try
       {
@@ -95,6 +97,11 @@ class BertTransformerOp : public OpKernel
       BertEncoderTransformer<EncoderTraits_> *encoder_transformer_;
       try
       {
+     
+        batch_size_ = context->input(19).flat<int32>().size()/3;
+        from_seq_len_ = context->input(20).flat<int32>().size()/3;
+        to_seq_len_ = from_seq_len_;
+        //printf("==>%d %d\n", batch_size_, from_seq_len_);
         fastertransformer::Allocator<AllocatorType::TF> allocator_(context);
         encoder_transformer_ = new BertEncoderTransformer<EncoderTraits_>(allocator_, 
           batch_size_, from_seq_len_, to_seq_len_, head_num_, size_per_head_);
@@ -104,7 +111,7 @@ class BertTransformerOp : public OpKernel
         OP_REQUIRES(context, false, errors::Internal(error.what()));
       }
       
-      OP_REQUIRES(context, context->num_inputs() == 19, errors::InvalidArgument("Less input arguments"));
+      OP_REQUIRES(context, context->num_inputs() == 21, errors::InvalidArgument("Less input arguments"));
 
       EncoderInitParam<DataType_> param; //init param here

Because the inputs live in CUDA device memory, we cannot read their values directly on the host (copying them back just to read a couple of integers would be time-consuming), but each input's shape is available on the host. So we forge two extra inputs whose shapes encode the values we need: each has 3 rows and batch_size (or seq_len) columns, so dividing its element count by 3 inside the op recovers batch_size and from_seq_len, as shown in the sketch below.
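The following sketch, assuming TensorFlow 1.x and an illustrative hidden size of 768 (both are assumptions, not part of the original code), shows the trick in isolation: the forged tensors are never read for their values, only for their element counts.

import numpy as np
import tensorflow as tf

# Input with dynamic batch and sequence dimensions.
input_tensor = tf.placeholder(tf.float32, (None, None, 768), 'layer_input')
fast_list_tensor = tf.shape(input_tensor)            # [batch, seq_len, hidden]

# Forged tensors of shape (3, batch) and (3, seq_len): inside the op,
# size()/3 recovers batch_size and from_seq_len without touching GPU data.
fake_batch = tf.tile([[1], [2], [3]], [1, fast_list_tensor[0]])
fake_seq = tf.tile([[1], [2], [3]], [1, fast_list_tensor[1]])

with tf.Session() as sess:
    feed = {input_tensor: np.zeros((4, 128, 768), np.float32)}
    print(sess.run(tf.size(fake_batch), feed) // 3)  # 4   -> batch_size
    print(sess.run(tf.size(fake_seq), feed) // 3)    # 128 -> from_seq_len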

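After patching bert_transformer_op.cc, the custom-op library has to be rebuilt and loaded again. A minimal sketch, assuming the rebuilt library ends up at ./lib/libtf_fastertransformer.so (the exact path and file name depend on your FasterTransformer build):

import tensorflow as tf

# Path is an assumption; adjust to wherever your build places the library.
transformer_op_module = tf.load_op_library('./lib/libtf_fastertransformer.so')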
  2. Amend FasterTransformer.py as follows:
    ...
    fast_list_tensor = tf.shape(input_tensor)
    ...
    layer_output = transformer_op_module.bert_transformer(
        layer_input,
        layer_input,
        trainable_vars[0], trainable_vars[2], trainable_vars[4], trainable_vars[1], trainable_vars[3], trainable_vars[5],
        attention_mask,
        trainable_vars[6], trainable_vars[7], trainable_vars[8], trainable_vars[9], trainable_vars[10], trainable_vars[11],
        trainable_vars[12], trainable_vars[13], trainable_vars[14], trainable_vars[15], tf.tile([[1],[2],[3]], [1,fast_list_tensor[0]]),
        tf.tile([[1],[2],[3]], [1,fast_list_tensor[1]]),
        #batch_size=batch_size, 
        #from_seq_len=seq_length, 
        #to_seq_len=seq_length, 
        head_num=num_attention_heads, size_per_head=attention_head_size)
  3. With the above modifications, we no longer have to hard-code the batch size and sequence length when using transformer_op_module. That means the model can be exported with fully dynamic placeholders:
input_ids = tf.placeholder(tf.int32,(None, None), 'input_ids')
input_mask = tf.placeholder(tf.float32,(None, None), 'input_mask')
input_type_ids = tf.placeholder(tf.int32,(None, None), 'input_type_ids')

This yields a TensorFlow model that supports dynamic batch size and dynamic sequence length.
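A quick way to confirm that the exported graph really is shape-agnostic is to feed it inputs of different sizes. A minimal sketch, assuming TensorFlow 1.x (the encoder built from the patched op is omitted to keep it short):

import numpy as np
import tensorflow as tf

# Placeholders with dynamic batch and sequence dimensions, as in the export step.
input_ids = tf.placeholder(tf.int32, (None, None), 'input_ids')
input_mask = tf.placeholder(tf.float32, (None, None), 'input_mask')
input_type_ids = tf.placeholder(tf.int32, (None, None), 'input_type_ids')
ids_shape = tf.shape(input_ids)

# The encoder built on top of these placeholders via the patched
# transformer_op_module.bert_transformer would go here.

with tf.Session() as sess:
    for batch, seq_len in [(1, 32), (8, 128), (4, 384)]:
        feed = {input_ids: np.zeros((batch, seq_len), np.int32),
                input_mask: np.ones((batch, seq_len), np.float32),
                input_type_ids: np.zeros((batch, seq_len), np.int32)}
        # The same graph accepts every batch/seq_len without being rebuilt.
        print(sess.run(ids_shape, feed))   # [1 32], [8 128], [4 384]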
