Faster Transformer Operator
NVIDIA's open-source FasterTransformer code provides examples of compiling the custom operator and invoking it from Python for both TensorRT and TensorFlow, as shown in FasterTransformer.py. However, the TensorFlow custom operator is inconvenient to use because its batch size and sequence length are fixed as attributes. They can be made dynamic as follows:
- Modify bert_transformer_op.cc to remove the batch_size, from_seq_len, and to_seq_len attributes and pass them in as input tensors instead. The diff is as follows:
.Input("output_bias: T") .Input("output_layernorm_beta: T") .Input("output_layernorm_gamma: T") + .Input("batch_size: int32") + .Input("from_seq_len: int32") .Output("output: T") .Attr("T: {float, half}") - .Attr("batch_size: int >= 1") - .Attr("from_seq_len: int >= 1") - .Attr("to_seq_len: int >= 1") + //.Attr("batch_size: int >= 1") + //.Attr("from_seq_len: int >= 1") + //.Attr("to_seq_len: int >= 1") .Attr("head_num: int >= 1") .Attr("size_per_head: int >= 1") .SetShapeFn([](shape_inference::InferenceContext *c) { int batch_size, from_seq_len, to_seq_len, head_num, size_per_head; - c->GetAttr("batch_size", &batch_size); - c->GetAttr("from_seq_len", &from_seq_len); - c->GetAttr("to_seq_len", &to_seq_len); + //c->GetAttr("batch_size", &batch_size); + //c->GetAttr("from_seq_len", &from_seq_len); + //c->GetAttr("to_seq_len", &to_seq_len); c->GetAttr("head_num", &head_num); c->GetAttr("size_per_head", &size_per_head); - c->set_output(0, c->MakeShape({batch_size * from_seq_len, head_num * size_per_head})); + //c->set_output(0, c->MakeShape({batch_size * from_seq_len, head_num * size_per_head})); + c->set_output(0, c->input(0)); return Status::OK(); }); template <typename Device, typename T> @@ -70,14 +71,15 @@ class BertTransformerOp : public OpKernel public: explicit BertTransformerOp(OpKernelConstruction *context) : OpKernel(context) { - OP_REQUIRES_OK(context, context->GetAttr("batch_size", &batch_size_)); - OP_REQUIRES_OK(context, context->GetAttr("from_seq_len", &from_seq_len_)); - OP_REQUIRES_OK(context, context->GetAttr("to_seq_len", &to_seq_len_)); + //OP_REQUIRES_OK(context, context->GetAttr("batch_size", &batch_size_)); + //OP_REQUIRES_OK(context, context->GetAttr("from_seq_len", &from_seq_len_)); + //OP_REQUIRES_OK(context, context->GetAttr("to_seq_len", &to_seq_len_)); OP_REQUIRES_OK(context, context->GetAttr("head_num", &head_num_)); OP_REQUIRES_OK(context, context->GetAttr("size_per_head", &size_per_head_)); - OP_REQUIRES(context, (from_seq_len_ == to_seq_len_), - errors::InvalidArgument("Only support from_seq_len == to_seq_len")); + //printf("++++++++ %d =%d \n", from_seq_len_, to_seq_len_) + //OP_REQUIRES(context, (from_seq_len_ == to_seq_len_), + /// errors::InvalidArgument("Only support from_seq_len == to_seq_len")); try { @@ -95,6 +97,11 @@ class BertTransformerOp : public OpKernel BertEncoderTransformer<EncoderTraits_> *encoder_transformer_; try { + + batch_size_ = context->input(19).flat<int32>().size()/3; + from_seq_len_ = context->input(20).flat<int32>().size()/3; + to_seq_len_ = from_seq_len_; + //printf("==>%d %d\n", batch_size_, from_seq_len_); fastertransformer::Allocator<AllocatorType::TF> allocator_(context); encoder_transformer_ = new BertEncoderTransformer<EncoderTraits_>(allocator_, batch_size_, from_seq_len_, to_seq_len_, head_num_, size_per_head_); @@ -104,7 +111,7 @@ class BertTransformerOp : public OpKernel OP_REQUIRES(context, false, errors::Internal(error.what())); } - OP_REQUIRES(context, context->num_inputs() == 19, errors::InvalidArgument("Less input arguments")); + OP_REQUIRES(context, context->num_inputs() == 21, errors::InvalidArgument("Less input arguments")); EncoderInitParam<DataType_> param; //init param here
Because the input tensors live in CUDA device memory, the kernel cannot read their values directly (copying them back to the host just to read a couple of integers would be time-consuming), but it can read the size of a tensor's shape at no cost. So we forge two dummy tensors whose sizes encode batch_size and seq_len and pass them as inputs 19 and 20 of the op: each dummy tensor is built by tiling a 3x1 constant, so its total element count is 3 * batch_size (or 3 * seq_len), which is why the kernel divides the sizes of input(19) and input(20) by 3.
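To make the divide-by-three concrete, here is a minimal sketch of the encoding on its own, assuming TensorFlow 1.x as in FasterTransformer.py; the hidden size of 768 and the variable names are illustrative only:

```python
import numpy as np
import tensorflow as tf

# Minimal sketch of the shape-encoding trick (TF 1.x), separate from the custom op.
input_tensor = tf.placeholder(tf.float32, (None, None, 768))  # [batch, seq_len, hidden]
dims = tf.shape(input_tensor)                                  # dynamic [batch, seq_len, hidden]

# Each dummy tensor is a 3x1 column tiled batch (or seq_len) times,
# so its total element count is 3 * batch (or 3 * seq_len).
batch_marker = tf.tile([[1], [2], [3]], [1, dims[0]])          # shape (3, batch)
seq_marker   = tf.tile([[1], [2], [3]], [1, dims[1]])          # shape (3, seq_len)

with tf.Session() as sess:
    b, s = sess.run([tf.size(batch_marker), tf.size(seq_marker)],
                    feed_dict={input_tensor: np.zeros((4, 128, 768), np.float32)})
    print(b // 3, s // 3)  # -> 4 128, the same division the kernel performs
```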
- FasterTransformer.py was amended as follows:
```python
...
fast_list_tensor = tf.shape(input_tensor)
...
layer_output = transformer_op_module.bert_transformer(
    layer_input,
    layer_input,
    trainable_vars[0], trainable_vars[2], trainable_vars[4],
    trainable_vars[1], trainable_vars[3], trainable_vars[5],
    attention_mask,
    trainable_vars[6], trainable_vars[7], trainable_vars[8],
    trainable_vars[9], trainable_vars[10], trainable_vars[11],
    trainable_vars[12], trainable_vars[13], trainable_vars[14],
    trainable_vars[15],
    tf.tile([[1], [2], [3]], [1, fast_list_tensor[0]]),
    tf.tile([[1], [2], [3]], [1, fast_list_tensor[1]]),
    #batch_size=batch_size,
    #from_seq_len=seq_length,
    #to_seq_len=seq_length,
    head_num=num_attention_heads,
    size_per_head=attention_head_size)
```
- With the above modifications, batch size and sequence length no longer need to be specified when calling transformer_op_module, so the model can be built with fully dynamic placeholders:
```python
input_ids = tf.placeholder(tf.int32, (None, None), 'input_ids')
input_mask = tf.placeholder(tf.float32, (None, None), 'input_mask')
input_type_ids = tf.placeholder(tf.int32, (None, None), 'input_type_ids')
```
This produces a TensorFlow model that supports both dynamic batch size and dynamic sequence length.
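As a rough usage sketch under stated assumptions: the graph above is assumed to have been frozen to model.pb, the output tensor is assumed to be named output:0, and the custom-op library path is a placeholder; the only point taken from the text is that the same graph can now serve inputs of different shapes.

```python
import numpy as np
import tensorflow as tf

# The custom op must be registered before the graph is imported.
# The .so path below is an assumption; use the library built from FasterTransformer.
transformer_op_module = tf.load_op_library("path/to/libtf_fastertransformer.so")

# Load a frozen graph (file name "model.pb" and tensor names are illustrative).
graph_def = tf.GraphDef()
with tf.gfile.GFile("model.pb", "rb") as f:
    graph_def.ParseFromString(f.read())

with tf.Graph().as_default() as graph:
    tf.import_graph_def(graph_def, name="")

with tf.Session(graph=graph) as sess:
    # Different batch sizes and sequence lengths in the same session,
    # with no fixed shape baked into the op.
    for batch, seq_len in [(1, 32), (8, 128)]:
        feed = {
            "input_ids:0": np.zeros((batch, seq_len), np.int32),
            "input_mask:0": np.ones((batch, seq_len), np.float32),
            "input_type_ids:0": np.zeros((batch, seq_len), np.int32),
        }
        out = sess.run("output:0", feed_dict=feed)
        print(out.shape)
```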