
Commit

compiled
jiazhihao committed Jul 23, 2023
1 parent bff1b54 commit 781fe21
Showing 14 changed files with 252 additions and 123 deletions.
2 changes: 1 addition & 1 deletion config/config.linux
@@ -75,7 +75,7 @@ FF_USE_AVX2=${FF_USE_AVX2:-OFF}
FF_MAX_DIM=${FF_MAX_DIM:-5}

# set LEGION_MAX_RETURN_SIZE
LEGION_MAX_RETURN_SIZE=${LEGION_MAX_RETURN_SIZE:-131072}
LEGION_MAX_RETURN_SIZE=${LEGION_MAX_RETURN_SIZE:-262144}

# set ROCM path
ROCM_PATH=${ROCM_PATH:-"/opt/rocm"}
52 changes: 39 additions & 13 deletions include/flexflow/inference.h
@@ -17,6 +17,7 @@

#include "flexflow/batch_config.h"
#include "flexflow/model.h"
#include <future>
#include <mutex>
#include <tokenizers_cpp.h>

@@ -27,9 +28,32 @@ class BeamTree;
class RequestManager;
using tokenizers::Tokenizer;

struct SamplingConfig {
bool do_sample = false;
float temperature = 0.8;
float topp = 0.6;
SamplingConfig(bool _do_sample, float _temperature, float _topp) {
temperature = _temperature > 0 ? _temperature : temperature;
topp = _topp > 0 ? _topp : topp;
do_sample = _do_sample;
}
SamplingConfig() {}
};

struct GenerationResult {
using RequestGuid = BatchConfig::RequestGuid;
using TokenId = BatchConfig::TokenId;
RequestGuid guid;
std::string input_text;
std::string output_text;
std::vector<TokenId> input_tokens;
std::vector<TokenId> output_tokens;
};

class InferenceManager {
public:
InferenceManager(FFConfig const &config, int max_num_tokens_per_batch);
static InferenceManager *get_inference_manager();
void compile_model_and_allocate_buffer(FFModel *model);
void init_operators_inference(FFModel *model);
MachineView *get_machine_view(int mv_id);
@@ -47,6 +71,7 @@ class InferenceManager {
RequestManager &rm,
int total_num_requests,
std::vector<int> ssm_model_ids);

public:
FFConfig ff_config;
std::unordered_map<ParallelTensor, std::vector<ParallelTensor>> tensor_buffer;
@@ -62,6 +87,7 @@ struct Request {
std::vector<BatchConfig::TokenId> tokens;

std::vector<struct BeamTree> beam_trees;
std::promise<GenerationResult> *promise;
};

// store the result of beam search
@@ -75,18 +101,6 @@ struct BeamTree {
treeLayer treeLayers[BeamSearchBatchConfig::MAX_BEAM_DEPTH + 1];
};

struct SamplingConfig {
bool do_sample = false;
float temperature = 0.8;
float topp = 0.6;
SamplingConfig(bool _do_sample, float _temperature, float _topp) {
temperature = _temperature > 0 ? _temperature : temperature;
topp = _topp > 0 ? _topp : topp;
do_sample = _do_sample;
}
SamplingConfig() {}
};

// struct BeamTree_v2 {
// std::vector<BatchConfig::TokenId> tokens;
// std::vector<int> parent_ids;
@@ -102,12 +116,17 @@ class RequestManager {
bool verbose = false,
std::string output_filepath = "");
RequestManager();
static RequestManager *get_request_manager();
size_t get_num_processed_requests();

int register_new_model(FFModel *model);
void register_tokenizer(ModelType model_type, std::string const &path);
void register_output_filepath(std::string const &);

FFModel *get_model(int model_id);
void serve(FFModel *model);

static GenerationResult generate(std::string const &text, int max_seq_length);
RequestGuid register_new_request(std::string const &prompt,
int max_sequence_length);
RequestGuid register_new_request(std::vector<TokenId> const &prompt,
@@ -195,13 +214,20 @@ class RequestManager {
Legion::Context ctx,
Legion::Runtime *runtime);

static void llm_serving_background_task(
Legion::Task const *task,
std::vector<Legion::PhysicalRegion> const &regions,
Legion::Context ctx,
Legion::Runtime *runtime);

private:
std::unique_ptr<Tokenizer> tokenizer_;
bool verbose;
ModelType model_type;
std::string output_filepath;
std::queue<Request> pending_request_queue;
std::unordered_map<RequestGuid, Request> running_request_queue;
std::unordered_map<RequestGuid, Request> all_requests;
std::unordered_map<RequestGuid, GenerationResult> request_generation_results;
std::mutex request_queue_mutex;
RequestGuid next_available_guid;
const std::map<ModelType, int> model_bos_map = {{ModelType::LLAMA, 0},
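For context on the header changes above: the per-instance InferenceManager/RequestManager constructors are supplemented with process-wide accessors (get_inference_manager() / get_request_manager()) and a static RequestManager::generate() that returns a GenerationResult, presumably synchronously given the by-value return. A minimal caller-side sketch, based only on the declarations in this diff (the tokenizer path, output path, prompt, and 128-token limit are placeholders):

#include <cstdio>
#include "flexflow/inference.h"

void run_one_prompt(FlexFlow::ModelType model_type) {
  using namespace FlexFlow;
  // Obtain the process-wide request manager instead of constructing one.
  RequestManager *rm = RequestManager::get_request_manager();
  rm->register_tokenizer(model_type, "/path/to/tokenizer");   // placeholder path
  rm->register_output_filepath("/path/to/output.txt");        // placeholder path

  // Submit a prompt and wait for the decoded text.
  GenerationResult result =
      RequestManager::generate("Hello world", 128 /*max_sequence_length*/);
  printf("%s\n", result.output_text.c_str());
}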
1 change: 1 addition & 0 deletions include/flexflow/model.h
@@ -230,6 +230,7 @@ enum TaskIDs {
RM_PREPARE_NEXT_BATCH_BEAM_TASK_ID,
RM_PREPARE_NEXT_BATCH_INIT_TASK_ID,
RM_PREPARE_NEXT_BATCH_VERIFY_TASK_ID,
RM_LLM_SERVING_BACKGROUND_TASK_ID,
// Custom tasks
CUSTOM_GPU_TASK_ID_FIRST,
CUSTOM_GPU_TASK_ID_1,
71 changes: 14 additions & 57 deletions inference/incr_decoding/incr_decoding.cc
@@ -40,10 +40,7 @@ void parse_input_args(char **argv,
bool &verbose,
bool &do_sample,
float &temperature,
float &topp,
int &data_parallelism_degree,
int &tensor_parallelism_degree,
int &pipeline_parallelism_degree) {
float &topp) {
for (int i = 1; i < argc; i++) {
// llm model type
if (!strcmp(argv[i], "-llm-model")) {
@@ -88,21 +85,6 @@
paths.output_file_path = std::string(argv[++i]);
continue;
}
// data parallelism degree
if (!strcmp(argv[i], "-data-parallelism-degree")) {
data_parallelism_degree = std::stoi(argv[++i]);
continue;
}
// tensor parallelism degree
if (!strcmp(argv[i], "-tensor-parallelism-degree")) {
tensor_parallelism_degree = std::stoi(argv[++i]);
continue;
}
// pipeline parallelism degree
if (!strcmp(argv[i], "-pipeline-parallelism-degree")) {
pipeline_parallelism_degree = std::stoi(argv[++i]);
continue;
}
if (!strcmp(argv[i], "--use-full-precision")) {
use_full_precision = true;
continue;
@@ -143,8 +125,6 @@ void FlexFlow::top_level_task(Task const *task,
float temperature = 0.0f;
float topp = 0.0f;
size_t num_devices = ffconfig.workersPerNode * ffconfig.numNodes;
int data_parallelism_degree = 1, tensor_parallelism_degree = 1,
pipeline_parallelism_degree = 1;

InputArgs const &command_args = HighLevelRuntime::get_input_args();
char **argv = command_args.argv;
@@ -157,47 +137,41 @@ void FlexFlow::top_level_task(Task const *task,
verbose,
do_sample,
temperature,
topp,
data_parallelism_degree,
tensor_parallelism_degree,
pipeline_parallelism_degree);
ffconfig.data_parallelism_degree = data_parallelism_degree;
ffconfig.tensor_parallelism_degree = tensor_parallelism_degree;
ffconfig.pipeline_parallelism_degree = pipeline_parallelism_degree;
topp);

assert(data_parallelism_degree * tensor_parallelism_degree *
pipeline_parallelism_degree ==
assert(ffconfig.data_parallelism_degree * ffconfig.tensor_parallelism_degree *
ffconfig.pipeline_parallelism_degree ==
ffconfig.numNodes * ffconfig.workersPerNode);

assert(model_type != ModelType::UNKNOWN &&
"Invalid LLM model type passed (or no type was passed).");

SamplingConfig samplingConfig(do_sample, temperature, topp);
InferenceManager im(ffconfig, BatchConfig::MAX_NUM_TOKENS);
RequestManager rm(model_type,
file_paths.tokenizer_file_path,
/*verbose*/ verbose,
file_paths.output_file_path);
RequestManager *rm = RequestManager::get_request_manager();
rm->register_tokenizer(model_type, file_paths.tokenizer_file_path);
rm->register_output_filepath(file_paths.output_file_path);
// InferenceManager im(ffconfig, BatchConfig::MAX_NUM_TOKENS);
// RequestManager rm(model_type,
// file_paths.tokenizer_file_path,
// /*verbose*/ verbose,
// file_paths.output_file_path);

FFModel model(ffconfig, ffconfig.cpu_offload);
if (model_type == ModelType::LLAMA) {
LLAMA::create_llama_model(model,
im,
file_paths.llm_config_file_path,
file_paths.llm_weight_file_path,
INC_DECODING_MODE,
samplingConfig,
use_full_precision);
} else if (model_type == ModelType::OPT) {
OPT::create_opt_model(model,
im,
file_paths.llm_config_file_path,
file_paths.llm_weight_file_path,
INC_DECODING_MODE,
use_full_precision);
} else if (model_type == ModelType::FALCON) {
FALCON::create_falcon_model(model,
im,
file_paths.llm_config_file_path,
file_paths.llm_weight_file_path,
ffconfig.workersPerNode * ffconfig.numNodes,
@@ -220,27 +194,10 @@ void FlexFlow::top_level_task(Task const *task,
std::string text = prompt.get<std::string>();
printf("Prompt[%d]: %s\n", total_num_requests, text.c_str());
total_num_requests++;
rm.register_new_request(text, 128 /*max_sequence_length*/);
}
}

BatchConfig bc;
InferenceResult ir;
BatchConfigFuture bcf = Future::from_value<BatchConfig>(bc);
InferenceResultFuture irf = Future::from_value<InferenceResult>(ir);
while (rm.get_num_processed_requests() < total_num_requests) {
// bc = rm.prepare_next_batch(bc, ir);
bcf = rm.prepare_next_batch(bcf, irf);
if (rm.get_num_processed_requests() >= total_num_requests) {
break;
GenerationResult result =
RequestManager::generate(text, 128 /*max_sequence_length*/);
}
FutureMap fm = im.inference(&model, 0, bcf);
assert(fm.get_future_map_domain().get_volume() == 1);
// Future future = fm.get_future(0);
// ir = future.get_result<InferenceResult>();
irf = fm.get_future(0);
}
// im.incr_decoding_loop(&model, rm, total_num_requests);

// Execution fence
{
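The rewritten top_level_task above replaces the explicit BatchConfig/FutureMap loop with one RequestManager::generate() call per prompt; together with the new Request::promise field and the llm_serving_background_task declared in inference.h, this suggests a handoff in which generate() blocks on a future while a background task decodes and eventually fulfils the promise. The request_manager.cc side is not part of this excerpt, so the following is only a generic std::promise/std::future sketch of that pattern; everything except the GenerationResult name is hypothetical:

#include <future>
#include <iostream>
#include <string>
#include <thread>

struct GenerationResult {        // reduced to one field for the sketch
  std::string output_text;
};

int main() {
  std::promise<GenerationResult> promise;               // owned by the caller of generate()
  std::future<GenerationResult> future = promise.get_future();

  // Stand-in for the background serving task: it keeps a pointer to the
  // promise (cf. Request::promise) and fulfils it once decoding is done.
  std::thread background([p = &promise] {
    GenerationResult result;
    result.output_text = "decoded text would go here";
    p->set_value(std::move(result));                    // wakes the blocked caller
  });

  // The caller blocks here until the background task publishes the result.
  GenerationResult result = future.get();
  std::cout << result.output_text << std::endl;

  background.join();
  return 0;
}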
6 changes: 3 additions & 3 deletions inference/models/falcon.cc
@@ -20,7 +20,6 @@ namespace FlexFlow {
using namespace Legion;

void FALCON::create_falcon_model(FFModel &ff,
InferenceManager &im,
std::string const &model_config_file_path,
std::string const &weight_file_path,
int num_pipeline_stages,
@@ -141,7 +140,8 @@ void FALCON::create_falcon_model(FFModel &ff,

// Compile the model
std::cout << "------start compile ----------" << std::endl;
im.compile_model_and_allocate_buffer(&ff);
InferenceManager *im = InferenceManager::get_inference_manager();
im->compile_model_and_allocate_buffer(&ff);
FileDataLoader fileloader("",
weight_file_path,
falcon_config.n_heads,
@@ -151,7 +151,7 @@
std::cout << "------load weight finished----------" << std::endl;

// init operators
im.init_operators_inference(&ff);
im->init_operators_inference(&ff);
}

}; // namespace FlexFlow
1 change: 0 additions & 1 deletion inference/models/falcon.h
@@ -104,7 +104,6 @@ class FALCON {
};

static void create_falcon_model(FFModel &ff,
InferenceManager &im,
std::string const &model_config_file_path,
std::string const &weight_file_path,
int num_pipeline_stages,
6 changes: 3 additions & 3 deletions inference/models/llama.cc
@@ -20,7 +20,6 @@ namespace FlexFlow {
using namespace Legion;

void LLAMA::create_llama_model(FFModel &ff,
InferenceManager &im,
std::string const &model_config_file_path,
std::string const &weight_file_path,
InferenceMode mode,
@@ -188,9 +187,10 @@ void LLAMA::create_llama_model(FFModel &ff,
}
}

InferenceManager *im = InferenceManager::get_inference_manager();
// Compile the model
std::cout << "------start compile ----------" << std::endl;
im.compile_model_and_allocate_buffer(&ff);
im->compile_model_and_allocate_buffer(&ff);
FileDataLoader fileloader("",
weight_file_path,
llama_config.n_heads,
@@ -200,7 +200,7 @@
std::cout << "------load weight finished----------" << std::endl;

// init operators
im.init_operators_inference(&ff);
im->init_operators_inference(&ff);
}

}; // namespace FlexFlow
1 change: 0 additions & 1 deletion inference/models/llama.h
@@ -103,7 +103,6 @@ class LLAMA {
};

static void create_llama_model(FFModel &ff,
InferenceManager &im,
std::string const &model_config_file_path,
std::string const &weight_file_path,
InferenceMode mode,
6 changes: 3 additions & 3 deletions inference/models/opt.cc
@@ -20,7 +20,6 @@ namespace FlexFlow {
using namespace Legion;

void OPT::create_opt_model(FFModel &ff,
InferenceManager &im,
std::string const &model_config_file_path,
std::string const &weight_file_path,
InferenceMode mode,
@@ -222,7 +221,8 @@ void OPT::create_opt_model(FFModel &ff,

//------------------- compile the model --------------------------------
std::cout << "------start compile ----------" << std::endl;
im.compile_model_and_allocate_buffer(&ff);
InferenceManager *im = InferenceManager::get_inference_manager();
im->compile_model_and_allocate_buffer(&ff);
FileDataLoader fileloader("",
weight_file_path,
opt_config.num_attention_heads,
@@ -231,7 +231,7 @@
opt_config.num_attention_heads);
fileloader.load_weights(&ff, weights_layers, use_full_precision);
std::cout << "------finished loading weights----------" << std::endl;
im.init_operators_inference(&ff);
im->init_operators_inference(&ff);
}

}; // namespace FlexFlow
1 change: 0 additions & 1 deletion inference/models/opt.h
@@ -105,7 +105,6 @@ class OPT {
};

static void create_opt_model(FFModel &ff,
InferenceManager &im,
std::string const &model_config_file_path,
std::string const &weight_file_path,
InferenceMode mode,