Python interface for inference (part 2) #893

Merged · 35 commits · Aug 2, 2023

Commits
999d4ac  add argmax, add default args to test file (goliaro, Jul 27, 2023)
d58c28b  updates (goliaro, Jul 28, 2023)
3793ef5  comment out print (goliaro, Jul 28, 2023)
65229ff  updates (goliaro, Jul 28, 2023)
1ae93c0  added code to get configs and weights from hf (goliaro, Jul 29, 2023)
f9ce99c  added FileDataLoader to cffi (goliaro, Jul 29, 2023)
732cc5e  remove aggressive reformatting (goliaro, Jul 29, 2023)
f9a3f6b  update (goliaro, Jul 29, 2023)
7ac00b0  fix (goliaro, Jul 29, 2023)
e21de3b  add code to load weights from python (goliaro, Jul 30, 2023)
5c8f2d2  fix half precision weight loading from python (goliaro, Jul 30, 2023)
3b1a795  fixed loading weights (goliaro, Jul 30, 2023)
6537c00  fixed loading weights (goliaro, Jul 30, 2023)
225f049  checkpoint (goliaro, Jul 30, 2023)
3035099  generation from python now works (goliaro, Jul 30, 2023)
8bb5e33  make it easier to set flags needed to run native python (goliaro, Jul 30, 2023)
b6d6bfa  downloading tokenizers from hf (goliaro, Jul 30, 2023)
406b548  add support for opt (goliaro, Jul 31, 2023)
0d628a1  implement falcon (goliaro, Jul 31, 2023)
b0db33d  add support for multiple prompts and prompts from json file (goliaro, Jul 31, 2023)
4a16fb3  implement speculative inference (goliaro, Jul 31, 2023)
446bcdd  finished specinfer implementation (goliaro, Jul 31, 2023)
37c093d  Merge branch 'inference' into python_inference (goliaro, Jul 31, 2023)
8c9b0ea  updated arguments parsing (goliaro, Jul 31, 2023)
a63fa76  remove unnecessary args from compile func (goliaro, Jul 31, 2023)
840e212  . (goliaro, Jul 31, 2023)
9a6e5db  update interface examples (goliaro, Aug 1, 2023)
f3bbefa  fix ssm bug (goliaro, Aug 1, 2023)
30f5c0d  Merge branch 'inference' into python_inference (goliaro, Aug 1, 2023)
a07ff22  fix fusion-related bugs (goliaro, Aug 1, 2023)
59a3669  standardize argument parsing in python examples (goliaro, Aug 2, 2023)
135cbea  docstrings (goliaro, Aug 2, 2023)
452b117  update (goliaro, Aug 2, 2023)
2c8eb85  moved c++ inference tests (goliaro, Aug 2, 2023)
9238f11  fix (goliaro, Aug 2, 2023)
3 changes: 3 additions & 0 deletions CMakeLists.txt
@@ -321,11 +321,13 @@ list(APPEND FLEXFLOW_INCLUDE_DIRS
file(GLOB_RECURSE FLEXFLOW_HDR
LIST_DIRECTORIES False
${FLEXFLOW_ROOT}/include/*.h)
list(APPEND FLEXFLOW_HDR ${FLEXFLOW_ROOT}/inference/file_loader.h)

file(GLOB_RECURSE FLEXFLOW_SRC
LIST_DIRECTORIES False
${FLEXFLOW_ROOT}/src/*.cc)
list(REMOVE_ITEM FLEXFLOW_SRC "${FLEXFLOW_ROOT}/src/runtime/cpp_driver.cc")
list(APPEND FLEXFLOW_SRC ${FLEXFLOW_ROOT}/inference/file_loader.cc)

set(FLEXFLOW_CPP_DRV_SRC
${FLEXFLOW_ROOT}/src/runtime/cpp_driver.cc)
@@ -460,6 +462,7 @@ if (FF_USE_PYTHON)
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/deps/legion/bindings/python
)
# create flexflow_python interpreter. When building from pip, we install the FF_HOME/python/flexflow_python script instead.
# create set_python_envs.sh script to set up the environment variables to run flexflow_python
if (NOT FF_BUILD_FROM_PYPI)
add_custom_command(TARGET flexflow
PRE_BUILD
5 changes: 3 additions & 2 deletions FlexFlow.mk
@@ -59,7 +59,8 @@ GEN_SRC += $(shell find $(FF_HOME)/src/loss_functions/ -name '*.cc')\
$(shell find $(FF_HOME)/src/runtime/ -name '*.cc')\
$(shell find $(FF_HOME)/src/utils/dot/ -name '*.cc')\
$(shell find $(FF_HOME)/src/dataloader/ -name '*.cc')\
$(shell find $(FF_HOME)/src/c/ -name '*.cc')
$(shell find $(FF_HOME)/src/c/ -name '*.cc')\
$(shell find $(FF_HOME)/inference/ -name 'file_loader.cc')
GEN_SRC := $(filter-out $(FF_HOME)/src/runtime/cpp_driver.cc, $(GEN_SRC))

FF_CUDA_SRC += $(shell find $(FF_HOME)/src/loss_functions/ -name '*.cu')\
@@ -94,7 +95,7 @@ ifneq ($(strip $(FF_USE_PYTHON)), 1)
endif


INC_FLAGS += -I${FF_HOME}/include -I${FF_HOME}/deps/optional/include -I${FF_HOME}/deps/variant/include -I${FF_HOME}/deps/json/include -I${FF_HOME}/deps/tokenizers-cpp/include -I${FF_HOME}/deps/tokenizers-cpp/sentencepiece/src
INC_FLAGS += -I${FF_HOME}/include -I${FF_HOME}/inference -I${FF_HOME}/deps/optional/include -I${FF_HOME}/deps/variant/include -I${FF_HOME}/deps/json/include -I${FF_HOME}/deps/tokenizers-cpp/include -I${FF_HOME}/deps/tokenizers-cpp/sentencepiece/src
CC_FLAGS += -DMAX_TENSOR_DIM=$(MAX_DIM) -DLEGION_MAX_RETURN_SIZE=32768
NVCC_FLAGS += -DMAX_TENSOR_DIM=$(MAX_DIM) -DLEGION_MAX_RETURN_SIZE=32768
HIPCC_FLAGS += -DMAX_TENSOR_DIM=$(MAX_DIM) -DLEGION_MAX_RETURN_SIZE=32768
7 changes: 4 additions & 3 deletions INSTALL.md
@@ -85,10 +85,11 @@ export FF_HOME=/path/to/FlexFlow
### Run FlexFlow Python examples
The Python examples are in the [examples/python](https://github.com/flexflow/FlexFlow/tree/master/examples/python) folder. The native, Keras integration, and PyTorch integration examples are in the `native`, `keras`, and `pytorch` subfolders, respectively.

To run the Python examples, you have two options: you can use the `flexflow_python` interpreter, available in the `build` folder, or you can use the native Python interpreter. If you choose to use the native Python interpreter, you should either install FlexFlow, or, if you prefer to build without installing, export the following flags:
To run the Python examples, you have two options: you can use the `flexflow_python` interpreter, available in the `build` folder, or you can use the native Python interpreter. If you choose the native Python interpreter, either install FlexFlow or, if you prefer to build without installing, export the required environment variables by running the following command (edit the path if your build folder is not named `build`):

* `export PYTHONPATH="${FF_HOME}/python:${FF_HOME}/build/deps/legion/bindings/python:${PYTHONPATH}"`
* `export LD_LIBRARY_PATH="${FF_HOME}/build:${FF_HOME}/build/deps/legion/lib:${LD_LIBRARY_PATH}"`
```
source ./build/set_python_envs.sh
```

**We recommend that you run the** `mnist_mlp` **test under** `native` **using the following cmd to check if FlexFlow has been installed correctly:**

2 changes: 1 addition & 1 deletion include/flexflow/ffconst.h
@@ -180,7 +180,7 @@ enum OperatorType {
OP_INVALID,
};

enum ModelType { UNKNOWN, LLAMA, OPT, FALCON };
enum ModelType { UNKNOWN = 3001, LLAMA = 3002, OPT = 3003, FALCON = 3004 };

enum PMParameter {
PM_OP_TYPE, // AnyOp
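Assigning explicit values to `ModelType` pins down the integer contract across the C/Python boundary, so the bindings can pass model types as plain ints regardless of declaration order. A tiny illustrative guard, ours rather than part of this PR, that a binding layer could use to catch drift:

```cpp
#include "flexflow/ffconst.h"

// Hypothetical compile-time check mirroring the constants a cffi-based
// binding would hard-code on the Python side.
static_assert(LLAMA == 3002, "bindings assume LLAMA == 3002");
static_assert(OPT == 3003, "bindings assume OPT == 3003");
static_assert(FALCON == 3004, "bindings assume FALCON == 3004");
```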
114 changes: 89 additions & 25 deletions include/flexflow/flexflow_c.h
@@ -53,6 +53,8 @@ FF_NEW_OPAQUE_TYPE(flexflow_tree_verify_batch_config_t);
FF_NEW_OPAQUE_TYPE(flexflow_beam_search_batch_config_t);
FF_NEW_OPAQUE_TYPE(flexflow_inference_manager_t);
FF_NEW_OPAQUE_TYPE(flexflow_request_manager_t);
FF_NEW_OPAQUE_TYPE(flexflow_file_data_loader_t);
FF_NEW_OPAQUE_TYPE(flexflow_generation_result_t);

// -----------------------------------------------------------------------
// FFConfig
@@ -78,6 +80,21 @@ int flexflow_config_get_epochs(flexflow_config_t handle);

bool flexflow_config_get_enable_control_replication(flexflow_config_t handle);

int flexflow_config_get_data_parallelism_degree(flexflow_config_t handle_);

int flexflow_config_get_tensor_parallelism_degree(flexflow_config_t handle_);

int flexflow_config_get_pipeline_parallelism_degree(flexflow_config_t handle_);

void flexflow_config_set_data_parallelism_degree(flexflow_config_t handle_,
int value);

void flexflow_config_set_tensor_parallelism_degree(flexflow_config_t handle_,
int value);

void flexflow_config_set_pipeline_parallelism_degree(flexflow_config_t handle_,
int value);

int flexflow_config_get_python_data_loader_type(flexflow_config_t handle);

// -----------------------------------------------------------------------
@@ -390,8 +407,12 @@ flexflow_tensor_t flexflow_model_add_inc_multihead_attention(
bool bias,
bool add_bias_kv,
bool add_zero_attn,
enum DataType data_type,
flexflow_initializer_t kernel_initializer_,
bool apply_rotary_embedding,
bool scaling_query,
float scaling_factor,
bool qk_prod_scaling,
char const *name);

flexflow_tensor_t flexflow_model_add_spec_inc_multihead_attention(
@@ -405,8 +426,12 @@ flexflow_tensor_t flexflow_model_add_spec_inc_multihead_attention(
bool bias,
bool add_bias_kv,
bool add_zero_attn,
enum DataType data_type,
flexflow_initializer_t kernel_initializer_,
bool apply_rotary_embedding,
bool scaling_query,
float scaling_factor,
bool qk_prod_scaling,
char const *name);

flexflow_tensor_t flexflow_model_add_inc_multihead_self_attention_verify(
@@ -420,8 +445,27 @@ flexflow_tensor_t flexflow_model_add_inc_multihead_self_attention_verify(
bool bias,
bool add_bias_kv,
bool add_zero_attn,
enum DataType data_type,
flexflow_initializer_t kernel_initializer_,
bool apply_rotary_embedding,
bool scaling_query,
float scaling_factor,
bool qk_prod_scaling,
char const *name);

flexflow_tensor_t flexflow_model_add_inc_multiquery_self_attention(
flexflow_model_t handle_,
const flexflow_tensor_t input_,
int embed_dim,
int num_heads,
int kdim,
int vdim,
float dropout,
bool bias,
bool add_bias_kv,
bool add_zero_attn,
enum DataType data_type,
flexflow_initializer_t kernel_initializer_,
char const *name);

flexflow_tensor_t flexflow_model_add_rms_norm(flexflow_model_t handle_,
@@ -447,6 +491,11 @@ flexflow_tensor_t flexflow_model_add_sampling(flexflow_model_t handle_,
float top_p,
char const *name);

flexflow_tensor_t flexflow_model_add_argmax(flexflow_model_t handle_,
const flexflow_tensor_t input_,
bool beam_search,
char const *name);

void flexflow_model_set_sgd_optimizer(flexflow_model_t handle,
flexflow_sgd_optimizer_t optimizer);

@@ -468,6 +517,10 @@ flexflow_perf_metrics_t

void flexflow_model_set_transformer_layer_id(flexflow_model_t handle, int id);

flexflow_generation_result_t flexflow_model_generate(flexflow_model_t handle_,
char const *text,
int max_seq_length);

// -----------------------------------------------------------------------
// Tensor
// -----------------------------------------------------------------------
@@ -809,44 +862,55 @@ void flexflow_beam_search_batch_config_destroy(
// RequestManager
// -----------------------------------------------------------------------

flexflow_request_manager_t flexflow_request_manager_create(void);
flexflow_request_manager_t flexflow_request_manager_get_request_manager(void);

// void flexflow_request_manager_destroy(flexflow_request_manager_t handle_);

void flexflow_request_manager_destroy(flexflow_request_manager_t handle);
void flexflow_request_manager_register_tokenizer(
flexflow_request_manager_t handle_,
enum ModelType model_type,
char const *tokenizer_filepath);

long unsigned int flexflow_request_manager_register_new_request(
flexflow_request_manager_t handle,
char const *prompt,
int max_sequence_length);
void flexflow_request_manager_register_output_filepath(
flexflow_request_manager_t handle_, char const *output_filepath);

int flexflow_request_manager_register_ssm_model(
flexflow_request_manager_t handle_, flexflow_model_t model_handle_);

// -----------------------------------------------------------------------
// InferenceManager
// -----------------------------------------------------------------------

flexflow_inference_manager_t
flexflow_inference_manager_create(flexflow_config_t config_handle,
int max_num_tokens_per_batch);
flexflow_inference_manager_get_inference_manager(void);

void flexflow_inference_manager_destroy(flexflow_inference_manager_t handle);
// void flexflow_inference_manager_destroy(flexflow_inference_manager_t
// handle_);

void flexflow_inference_manager_compile_model_and_allocate_buffer(
flexflow_inference_manager_t handle, flexflow_model_t model_handle);
flexflow_inference_manager_t handle_, flexflow_model_t model_handle);

void flexflow_inference_manager_init_operators_inference(
flexflow_inference_manager_t handle, flexflow_model_t model_handle);

void flexflow_inference_manager_incr_decoding_loop(
flexflow_inference_manager_t handle,
flexflow_model_t model_handle,
flexflow_request_manager_t rm_handle,
int total_num_requests);

void flexflow_inference_manager_spec_inference_loop(
flexflow_inference_manager_t handle,
flexflow_model_t model_handle,
flexflow_request_manager_t rm_handle,
int total_num_requests,
int num_ssms,
int *ssm_model_ids);
flexflow_inference_manager_t handle_, flexflow_model_t model_handle);

// -----------------------------------------------------------------------
// FileDataLoader
// -----------------------------------------------------------------------

flexflow_file_data_loader_t
flexflow_file_data_loader_create(char const *weight_file_path,
int num_heads,
int hidden_dim,
int qkv_inner_dim);

void flexflow_file_data_loader_destroy(flexflow_file_data_loader_t handle_);

void flexflow_file_data_loader_load_weights(flexflow_file_data_loader_t handle_,
flexflow_model_t model_handle_,
int num_layers,
char const **layer_names,
flexflow_op_t *layers,
bool use_full_precision);

#ifdef __cplusplus
}
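Put together, the new entry points are enough to drive inference end to end from C or C++. The sketch below is assembled from the signatures above and is only illustrative: the weight path, head/hidden dimensions, output path, and prompt are made-up placeholders, and error handling is omitted.

```cpp
#include "flexflow/flexflow_c.h"

// Minimal hypothetical driver over the new C API.
void drive_inference(flexflow_model_t model, char const *tokenizer_path) {
  // The request manager is now a process-wide singleton rather than
  // user-constructed.
  flexflow_request_manager_t rm =
      flexflow_request_manager_get_request_manager();
  flexflow_request_manager_register_tokenizer(rm, LLAMA, tokenizer_path);
  flexflow_request_manager_register_output_filepath(rm, "/tmp/output.txt");

  flexflow_inference_manager_t im =
      flexflow_inference_manager_get_inference_manager();
  flexflow_inference_manager_compile_model_and_allocate_buffer(im, model);
  flexflow_inference_manager_init_operators_inference(im, model);

  // Dimensions below are placeholders; they must match the checkpoint.
  flexflow_file_data_loader_t loader = flexflow_file_data_loader_create(
      "/path/to/weights", /*num_heads=*/32, /*hidden_dim=*/4096,
      /*qkv_inner_dim=*/128);
  // ... flexflow_file_data_loader_load_weights(loader, model, ...) with the
  // model's layer names and handles ...

  flexflow_generation_result_t result = flexflow_model_generate(
      model, "Hello world", /*max_seq_length=*/128);
  (void)result;
  flexflow_file_data_loader_destroy(loader);
}
```

The Python interface added by this PR wraps these same calls through cffi.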
1 change: 0 additions & 1 deletion inference/file_loader.cc
@@ -709,7 +709,6 @@ void FileDataLoader::load_weights(
if (weight == NULL) {
continue;
}

switch (weight->data_type) {
case DT_HALF:
load_single_weight_tensor<half>(ff, weight, i, v.first);
43 changes: 0 additions & 43 deletions inference/flexflow_inference.py

This file was deleted.

5 changes: 0 additions & 5 deletions inference/incr_decoding/incr_decoding.cc
@@ -151,11 +151,6 @@ void FlexFlow::top_level_task(Task const *task,
RequestManager *rm = RequestManager::get_request_manager();
rm->register_tokenizer(model_type, file_paths.tokenizer_file_path);
rm->register_output_filepath(file_paths.output_file_path);
// InferenceManager im(ffconfig, BatchConfig::MAX_NUM_TOKENS);
// RequestManager rm(model_type,
// file_paths.tokenizer_file_path,
// /*verbose*/ verbose,
// file_paths.output_file_path);

FFModel model(ffconfig, ffconfig.cpu_offload);
if (model_type == ModelType::LLAMA) {
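The commented-out block removed above is the old constructor-based setup; under the new API both managers are singletons. A condensed sketch of the flow that remains in `top_level_task` (the final `generate` call is our abbreviation of the rest of the file, based on the C API above, not a verbatim excerpt):

```cpp
// Obtain the process-wide request manager and wire up tokenizer and output.
RequestManager *rm = RequestManager::get_request_manager();
rm->register_tokenizer(model_type, file_paths.tokenizer_file_path);
rm->register_output_filepath(file_paths.output_file_path);

// Build the model graph, then generate; incremental decoding runs inside.
FFModel model(ffconfig, ffconfig.cpu_offload);
// ... create the LLAMA / OPT / FALCON graph ...
// GenerationResult result = model.generate(prompt, /*max_seq_length=*/128);
```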
6 changes: 1 addition & 5 deletions inference/models/falcon.cc
@@ -62,11 +62,7 @@ void FALCON::create_falcon_model(FFModel &ff,
Layer *embedding = ff.layers.back();
weights_layers.emplace("tok_embeddings_weight", embedding);

int num_transformer_layers = falcon_config.n_layers;
int num_transformer_layers_per_stage =
(num_transformer_layers + num_pipeline_stages - 1) / num_pipeline_stages;

for (int i = 0; i < num_transformer_layers; i++) {
for (int i = 0; i < falcon_config.n_layers; i++) {
// set transformer layer id
ff.set_transformer_layer_id(i);
// step 1: attention
6 changes: 3 additions & 3 deletions inference/models/opt.cc
@@ -119,7 +119,7 @@ void OPT::create_opt_model(FFModel &ff,
NULL,
false,
/*scaling query*/ true,
/*sacling factor*/
/*scaling factor*/
pow((opt_config.hidden_size / opt_config.num_attention_heads),
-0.5),
/*qk_prod_scaling*/ false);
@@ -140,7 +140,7 @@ void OPT::create_opt_model(FFModel &ff,
NULL,
false,
/*scaling query*/ true,
/*sacling factor*/
/*scaling factor*/
pow((opt_config.hidden_size / opt_config.num_attention_heads),
-0.5),
/*qk_prod_scaling*/ false);
@@ -161,7 +161,7 @@ void OPT::create_opt_model(FFModel &ff,
NULL,
false,
/*scaling query*/ true,
/*sacling factor*/
/*scaling factor*/
pow((opt_config.hidden_size / opt_config.num_attention_heads),
-0.5),
/*qk_prod_scaling*/ false);
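On the corrected comment: OPT pre-scales the query by 1/sqrt(head_dim) (`scaling_query = true`) and disables `qk_prod_scaling` so the same factor is not applied a second time inside the QK^T product. A small sketch of the arithmetic (the helper function is ours, not the PR's):

```cpp
#include <cmath>

// Query pre-scaling factor used above: 1/sqrt(head_dim). With
// scaling_query=true the factor is folded into Q up front, so
// qk_prod_scaling=false avoids scaling QK^T twice.
float opt_scaling_factor(int hidden_size, int num_attention_heads) {
  int head_dim = hidden_size / num_attention_heads;      // e.g. 768 / 12 = 64
  return std::pow(static_cast<float>(head_dim), -0.5f);  // 1/sqrt(64) = 0.125
}
```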