diff --git a/.dockerignore b/.dockerignore
index a7470203e3..b9f228c009 100644
--- a/.dockerignore
+++ b/.dockerignore
@@ -17,3 +17,5 @@ python/flexflow/core/legion_cffi_header.py
 /inference/tokenizer/*
 /inference/prompt/*
 /inference/output/*
+
+/tests/inference/python_test_configs/*.json
diff --git a/.gitignore b/.gitignore
index 8fcc105f01..7f6a3c4137 100644
--- a/.gitignore
+++ b/.gitignore
@@ -186,4 +186,5 @@ gpt_tokenizer
 # pip version
 python/flexflow/version.txt
 
-inference_tensors
\ No newline at end of file
+inference_tensors
+tests/inference/python_test_configs/*.json
diff --git a/include/flexflow/model.h b/include/flexflow/model.h
index 4ae62a945d..dd6dc76b4d 100644
--- a/include/flexflow/model.h
+++ b/include/flexflow/model.h
@@ -1091,7 +1091,7 @@ class FFModel {
   std::unordered_map>>
       get_bwd_edge_map() const;
 
-  // Internal funcitons
+  // Internal functions
   Legion::IndexSpace get_or_create_task_is(ParallelConfig const &pc);
   Legion::IndexSpace get_or_create_task_is(MachineView const &view);
   Legion::IndexSpace get_or_create_task_is(Legion::Domain const &domain);
@@ -1099,6 +1099,7 @@ class FFModel {
   Legion::IndexSpace get_task_is(Legion::Domain const &domain) const;
   Legion::IndexSpace get_task_is(ParallelConfig const &pc) const;
   Legion::IndexSpace get_task_is(MachineView const &view) const;
+  bool is_mlp_block(int layer_idx) const;
   void create_operators_from_layers();
   Op *create_operator_from_layer(Layer *layer,
                                  std::vector<ParallelTensor> const &inputs);
diff --git a/inference/models/opt.cc b/inference/models/opt.cc
index 34a2084253..a6edf40837 100644
--- a/inference/models/opt.cc
+++ b/inference/models/opt.cc
@@ -196,7 +196,7 @@ void OPT::create_opt_model(FFModel &ff,
     Tensor fc1 =
         ff.dense(final_norm,
                  opt_config.ffn_dim,
-                 AC_MODE_NONE,
+                 AC_MODE_RELU,
                  true,
                  DT_NONE,
                  nullptr,
@@ -205,8 +205,7 @@ void OPT::create_opt_model(FFModel &ff,
                  REG_MODE_NONE,
                  0.0f,
                  std::string("layers_" + std::to_string(i) + "_fc1").c_str());
-    Tensor activation = ff.relu(fc1, false);
-    fc2 = ff.dense(activation,
+    fc2 = ff.dense(fc1,
                    opt_config.hidden_size,
                    AC_MODE_NONE,
                    true,
diff --git a/python/flexflow/core/flexflow_cffi.py b/python/flexflow/core/flexflow_cffi.py
index d8c349911c..7465fc6115 100644
--- a/python/flexflow/core/flexflow_cffi.py
+++ b/python/flexflow/core/flexflow_cffi.py
@@ -56,7 +56,7 @@ def get_c_name(name):
     if name is None:
         return ffi.NULL
     else:
-        return ffi.new("char[]", name.encode("ascii"))
+        return ffi.new("char[]", name.encode("utf-8"))
 
 
 def get_datatype_size(datatype):
diff --git a/python/flexflow/serve/models/opt.py b/python/flexflow/serve/models/opt.py
index dfd1cde7d4..dd36fa6592 100644
--- a/python/flexflow/serve/models/opt.py
+++ b/python/flexflow/serve/models/opt.py
@@ -216,13 +216,12 @@ def build_model(self, max_tokens_per_batch):
         fc1 = ffmodel.dense(
             ff_norm,
             self.opt_config.ffn_dim,
-            ActiMode.AC_MODE_NONE,
+            ActiMode.AC_MODE_RELU,
             True,
            name=f"layers_{i}_fc1",
         )
-        activation = ffmodel.relu(fc1, False)
         fc2 = ffmodel.dense(
-            activation,
+            fc1,
             self.opt_config.hidden_size,
             ActiMode.AC_MODE_NONE,
             True,
diff --git a/src/c/flexflow_c.cc b/src/c/flexflow_c.cc
index fb014ace50..6132e8a13c 100644
--- a/src/c/flexflow_c.cc
+++ b/src/c/flexflow_c.cc
@@ -1597,7 +1597,11 @@ flexflow_generation_result_t
       handle->generate(prompts, max_seq_length);
   DEBUG_PRINT(
       "[Model] generate %p %s %i", handle, text_str.c_str(), max_seq_length);
-  assert(results[0].output_tokens.size() <= max_seq_length);
+  // If the prompt exceeds max seq len, check that we return the prompt with no
+  // additional token. Otherwise, check that the output does not exceed the max
+  // sequence length.
+  assert(results[0].output_tokens.size() <= max_seq_length ||
+         results[0].output_tokens.size() == results[0].input_tokens.size());
   output_length_and_tokens[0] = results[0].output_tokens.size();
   std::copy(results[0].output_tokens.begin(),
             results[0].output_tokens.end(),
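Aside (not part of the patch): the relaxed assertion above accepts exactly two outcomes, either the generated sequence fits within max_seq_length, or the prompt already exceeded max_seq_length and is returned with no extra tokens. A minimal standalone C++ sketch of that predicate, with hypothetical names, purely for illustration:

#include <cassert>
#include <cstddef>

// Mirrors the two-clause check: within the limit, or an over-long prompt
// echoed back unchanged (output size equals input size).
bool output_length_ok(size_t input_tokens, size_t output_tokens,
                      size_t max_seq_length) {
  return output_tokens <= max_seq_length || output_tokens == input_tokens;
}

int main() {
  assert(output_length_ok(10, 128, 128));   // generation capped at the limit
  assert(output_length_ok(300, 300, 128));  // over-long prompt returned as-is
  assert(!output_length_ok(10, 200, 128));  // anything else would be a bug
  return 0;
}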
diff --git a/src/ops/kernels/linear_kernels.cu b/src/ops/kernels/linear_kernels.cu
index 9373c2fb2f..c30c9f71c1 100644
--- a/src/ops/kernels/linear_kernels.cu
+++ b/src/ops/kernels/linear_kernels.cu
@@ -252,6 +252,18 @@ Parameter* Linear::get_parameter(int index)
 */
 namespace Internal {
 
+template <typename DT>
+__global__ void AddBiasWithReLU(DT *output_ptr,
+                                DT const *bias_ptr,
+                                int out_dim,
+                                int batch_size) {
+  CUDA_KERNEL_LOOP(i, out_dim * batch_size) {
+    int bias_idx = i % out_dim;
+    DT value = output_ptr[i] + bias_ptr[bias_idx];
+    output_ptr[i] = ((float)value > 0.0f) ? value : (DT)0.0f;
+  }
+}
+
 template <typename DT>
 void forward_kernel(LinearMeta const *m,
                     void const *input_ptr,
@@ -343,6 +355,16 @@ void forward_kernel(LinearMeta const *m,
                          CUBLAS_GEMM_DEFAULT_TENSOR_OP));
   // use_bias = True
   if (bias_ptr != NULL) {
+    // fuse bias and relu
+    if (m->activation == AC_MODE_RELU) {
+      int parallelism = out_dim * batch_size;
+      AddBiasWithReLU<<<GET_BLOCKS(parallelism), CUDA_NUM_THREADS, 0, stream>>>(
+          static_cast<DT *>(output_ptr),
+          static_cast<DT const *>(bias_ptr),
+          out_dim,
+          batch_size);
+      return;
+    }
     checkCUDA(cublasGemmEx(m->handle.blas,
                            CUBLAS_OP_T,
                            CUBLAS_OP_N,
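For context (not part of the patch): AddBiasWithReLU fuses the bias add and the ReLU into a single pass over the GEMM output, and returns before the cublasGemmEx-based bias broadcast below when the activation is ReLU. The bias index i % out_dim assumes the output buffer is laid out with out_dim as the fastest-varying dimension, so element (row, col) lives at col * out_dim + row. A CPU-only C++ sketch of the same arithmetic, checked against the unfused bias-then-ReLU order on a toy 3x2 output:

#include <algorithm>
#include <cassert>
#include <vector>

// Fused epilogue, same indexing as the CUDA kernel above.
void add_bias_relu_ref(std::vector<float> &output,
                       std::vector<float> const &bias,
                       int out_dim,
                       int batch_size) {
  for (int i = 0; i < out_dim * batch_size; i++) {
    float v = output[i] + bias[i % out_dim];
    output[i] = std::max(v, 0.0f);
  }
}

int main() {
  int out_dim = 3, batch_size = 2;
  std::vector<float> out = {1.0f, -2.0f, 0.5f, -0.1f, 4.0f, -3.0f};
  std::vector<float> bias = {0.5f, 1.0f, -1.0f};

  // Unfused reference: per-column bias add, then ReLU.
  std::vector<float> expected = out;
  for (int col = 0; col < batch_size; col++) {
    for (int row = 0; row < out_dim; row++) {
      float v = expected[col * out_dim + row] + bias[row];
      expected[col * out_dim + row] = v > 0.0f ? v : 0.0f;
    }
  }

  add_bias_relu_ref(out, bias, out_dim, batch_size);
  assert(out == expected);
  return 0;
}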
diff --git a/src/runtime/model.cc b/src/runtime/model.cc
index 66679741e9..c07c33efca 100644
--- a/src/runtime/model.cc
+++ b/src/runtime/model.cc
@@ -3236,6 +3236,27 @@ Op *FFModel::create_operator_from_layer(
   }
 }
 
+bool FFModel::is_mlp_block(int layer_idx) const {
+  auto const &l = layers[layer_idx];
+  // standard opt relu
+  if (l->op_type == OP_LINEAR && layer_idx >= 2 &&
+      layers[layer_idx - 1]->op_type == OP_RELU &&
+      layers[layer_idx - 2]->op_type == OP_LINEAR) {
+    return true;
+  }
+  // mlp layer with relu embedded in first dense layer
+  if (l->op_type == OP_LINEAR && layer_idx >= 1 &&
+      layers[layer_idx - 1]->op_type == OP_LINEAR) {
+    long long value;
+    layers[layer_idx - 1]->get_int_property("activation", value);
+    ActiMode activation = (ActiMode)value;
+    if (activation == AC_MODE_RELU) {
+      return true;
+    }
+  }
+  return false;
+}
+
 void FFModel::create_operators_from_layers() {
   std::map<const Tensor, ParallelTensor> tensors_to_parallel_tensors;
   // for (auto const &l : layers) {
@@ -3280,9 +3301,9 @@ void FFModel::create_operators_from_layers() {
         config.tensor_parallelism_degree > 1 &&
         (l->op_type == OP_INC_MULTIHEAD_SELF_ATTENTION ||
          l->op_type == OP_TREE_INC_MULTIHEAD_SELF_ATTENTION ||
-         (l->op_type == OP_LINEAR && layer_idx >= 2 &&
-          layers[layer_idx - 1]->op_type == OP_RELU &&
-          layers[layer_idx - 2]->op_type == OP_LINEAR) ||
+         // mlp layer
+         is_mlp_block(layer_idx) ||
+         // llama mlp layer
          (l->op_type == OP_LINEAR && layer_idx >= 2 &&
           layers[layer_idx - 1]->op_type == OP_GELU &&
           layers[layer_idx - 2]->op_type == OP_LINEAR) ||
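For illustration (not part of the patch): is_mlp_block() recognizes the second Linear of an MLP in two shapes, an explicit Linear -> ReLU -> Linear chain, or a Linear whose ReLU is fused into the first dense layer (its "activation" property set to AC_MODE_RELU) followed by a Linear. A toy, self-contained C++ sketch of that detection logic, using stand-in types rather than FlexFlow's Layer class:

#include <cassert>
#include <vector>

enum OpType { OP_LINEAR, OP_RELU, OP_OTHER };

struct ToyLayer {
  OpType op_type;
  bool fused_relu; // stands in for the "activation" == AC_MODE_RELU property
};

// True when layers[layer_idx] is the Linear that closes an MLP block.
bool is_mlp_block_toy(std::vector<ToyLayer> const &layers, int layer_idx) {
  if (layers[layer_idx].op_type != OP_LINEAR) {
    return false;
  }
  // Explicit ReLU between two Linear layers.
  if (layer_idx >= 2 && layers[layer_idx - 1].op_type == OP_RELU &&
      layers[layer_idx - 2].op_type == OP_LINEAR) {
    return true;
  }
  // ReLU fused into the preceding Linear layer.
  if (layer_idx >= 1 && layers[layer_idx - 1].op_type == OP_LINEAR &&
      layers[layer_idx - 1].fused_relu) {
    return true;
  }
  return false;
}

int main() {
  std::vector<ToyLayer> explicit_relu = {
      {OP_LINEAR, false}, {OP_RELU, false}, {OP_LINEAR, false}};
  std::vector<ToyLayer> fused_relu = {{OP_LINEAR, true}, {OP_LINEAR, false}};
  assert(is_mlp_block_toy(explicit_relu, 2));
  assert(is_mlp_block_toy(fused_relu, 1));
  assert(!is_mlp_block_toy(fused_relu, 0));
  return 0;
}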
(bias_ptr), + out_dim, + batch_size); + return; + } checkCUDA(cublasGemmEx(m->handle.blas, CUBLAS_OP_T, CUBLAS_OP_N, diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 66679741e9..c07c33efca 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -3236,6 +3236,27 @@ Op *FFModel::create_operator_from_layer( } } +bool FFModel::is_mlp_block(int layer_idx) const { + auto const &l = layers[layer_idx]; + // standard opt relu + if (l->op_type == OP_LINEAR && layer_idx >= 2 && + layers[layer_idx - 1]->op_type == OP_RELU && + layers[layer_idx - 2]->op_type == OP_LINEAR) { + return true; + } + // mlp layer with relu embedded in first dense layer + if (l->op_type == OP_LINEAR && layer_idx >= 1 && + layers[layer_idx - 1]->op_type == OP_LINEAR) { + long long value; + layers[layer_idx - 1]->get_int_property("activation", value); + ActiMode activation = (ActiMode)value; + if (activation == AC_MODE_RELU) { + return true; + } + } + return false; +} + void FFModel::create_operators_from_layers() { std::map tensors_to_parallel_tensors; // for (auto const &l : layers) { @@ -3280,9 +3301,9 @@ void FFModel::create_operators_from_layers() { config.tensor_parallelism_degree > 1 && (l->op_type == OP_INC_MULTIHEAD_SELF_ATTENTION || l->op_type == OP_TREE_INC_MULTIHEAD_SELF_ATTENTION || - (l->op_type == OP_LINEAR && layer_idx >= 2 && - layers[layer_idx - 1]->op_type == OP_RELU && - layers[layer_idx - 2]->op_type == OP_LINEAR) || + // mlp layer + is_mlp_block(layer_idx) || + // llama mlp layer (l->op_type == OP_LINEAR && layer_idx >= 2 && layers[layer_idx - 1]->op_type == OP_GELU && layers[layer_idx - 2]->op_type == OP_LINEAR) || diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 5ebcaaa9ce..24b58286b9 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -45,7 +45,8 @@ std::string LoadBytesFromFile(std::string const &path) { RequestManager::RequestManager() : request_manager_status(INITIALIZED), verbose(false), - next_available_guid(1000000), num_processed_requests(0) { + next_available_guid(1000000), num_processed_requests(0), + total_request_run_time(0.0f){ // The following config parameters are set // during ffmodel.compile() // Initialize them to -1 to make sure no one diff --git a/tests/inference/python_inference_tests.sh b/tests/inference/python_inference_tests.sh index 3544f58e26..a1ee281914 100755 --- a/tests/inference/python_inference_tests.sh +++ b/tests/inference/python_inference_tests.sh @@ -6,6 +6,7 @@ set -e cd "${BASH_SOURCE[0]%/*}" # Generate test configs +rm -rf python_test_configs/*.json python python_test_configs/generate_configs.py # Run all tests