make inc_decoding work
jiazhihao committed Jul 23, 2023
1 parent 781fe21 commit 8a58aed
Showing 22 changed files with 413 additions and 330 deletions.
1 change: 1 addition & 0 deletions examples/cpp/inference/dataloader.cu
@@ -15,6 +15,7 @@
 
 #include "dataloader.h"
 #include "flexflow/inference.h"
+#include "flexflow/request_manager.h"
 #include "flexflow/utils/cuda_helper.h"
 
 void DataLoader::load_input(Task const *task,
1 change: 1 addition & 0 deletions examples/cpp/inference/mixture_of_experts/moe.cc
@@ -15,6 +15,7 @@
 
 #include "moe.h"
 #include "flexflow/inference.h"
+#include "flexflow/request_manager.h"
 #include <cstdlib>
 #include <fstream>
 #include <iostream>
1 change: 1 addition & 0 deletions examples/cpp/inference/transformers/transformers.cc
@@ -15,6 +15,7 @@
 
 #include "transformers.h"
 #include "flexflow/inference.h"
+#include "flexflow/request_manager.h"
 #include <cstdlib>
 #include <fstream>
 #include <iostream>
2 changes: 1 addition & 1 deletion include/flexflow/batch_config.h
@@ -52,7 +52,7 @@ class BatchConfig {
 void print() const;
 virtual InferenceMode get_mode() const;
 static BatchConfig const *from_future(BatchConfigFuture const &future);
-static int const MAX_NUM_REQUESTS = 16;
+static int const MAX_NUM_REQUESTS = 1;
 static int const MAX_NUM_TOKENS = 64;
 static int const MAX_SEQ_LENGTH = 256;
 
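Dropping MAX_NUM_REQUESTS from 16 to 1 restricts a BatchConfig to a single active request per incremental-decoding step. As a rough, hypothetical illustration of why such compile-time limits exist (the real BatchConfig members are not shown in this diff), limits like these typically size fixed per-batch arrays so the whole config stays trivially copyable between tasks:

    // Hypothetical sketch only; member names are illustrative, not FlexFlow's.
    struct BatchConfigSketch {
      static int const MAX_NUM_REQUESTS = 1;  // one request per batch after this commit
      static int const MAX_NUM_TOKENS = 64;   // cap on tokens processed in a single step
      static int const MAX_SEQ_LENGTH = 256;  // cap on a request's total sequence length

      int num_tokens = 0;
      struct PerTokenInfo {
        int request_index;         // which request the token belongs to (< MAX_NUM_REQUESTS)
        int abs_depth_in_request;  // the token's position within that request
        int token_id;
      } tokens_info[MAX_NUM_TOKENS];  // fixed-size array, no heap allocation
    };
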
216 changes: 0 additions & 216 deletions include/flexflow/inference.h
@@ -14,20 +14,10 @@
*/

#pragma once

#include "flexflow/batch_config.h"
#include "flexflow/model.h"
#include <future>
#include <mutex>
#include <tokenizers_cpp.h>

namespace FlexFlow {

class FFModel;
class BeamTree;
class RequestManager;
using tokenizers::Tokenizer;

struct SamplingConfig {
bool do_sample = false;
float temperature = 0.8;
@@ -50,210 +40,4 @@ struct GenerationResult {
std::vector<TokenId> output_tokens;
};

class InferenceManager {
public:
InferenceManager(FFConfig const &config, int max_num_tokens_per_batch);
static InferenceManager *get_inference_manager();
void compile_model_and_allocate_buffer(FFModel *model);
void init_operators_inference(FFModel *model);
MachineView *get_machine_view(int mv_id);
Legion::FutureMap inference(FFModel *model, int index, BatchConfig const &bc);
Legion::FutureMap
inference(FFModel *model, int index, BatchConfigFuture const &bc);
void load_input_tokens_from_batch_config(BatchConfigFuture const &bc,
ParallelTensor const input);
void load_positions(BatchConfigFuture const &bc,
ParallelTensor position_input);
void incr_decoding_loop(FFModel *model,
RequestManager &rm,
int total_num_requests);
void spec_inference_loop(FFModel *model,
RequestManager &rm,
int total_num_requests,
std::vector<int> ssm_model_ids);

public:
FFConfig ff_config;
std::unordered_map<ParallelTensor, std::vector<ParallelTensor>> tensor_buffer;
int max_num_tokens_per_batch;
int num_devices;
std::vector<MachineView> machine_views;
};

struct Request {
BatchConfig::RequestGuid guid;
int max_sequence_length;
int initial_len;
std::vector<BatchConfig::TokenId> tokens;

std::vector<struct BeamTree> beam_trees;
std::promise<GenerationResult> *promise;
};

// store the result of beam search
struct BeamTree {
struct treeLayer {
BeamSearchBatchConfig::TokenId
tokens[BeamSearchBatchConfig::MAX_BEAM_WIDTH];
int parent_ids[BeamSearchBatchConfig::MAX_BEAM_WIDTH];
float probs[BeamSearchBatchConfig::MAX_BEAM_WIDTH];
};
treeLayer treeLayers[BeamSearchBatchConfig::MAX_BEAM_DEPTH + 1];
};

// struct BeamTree_v2 {
// std::vector<BatchConfig::TokenId> tokens;
// std::vector<int> parent_ids;
// std::vector<float> probs;
// };

class RequestManager {
public:
using RequestGuid = BatchConfig::RequestGuid;
using TokenId = BatchConfig::TokenId;
RequestManager(ModelType model_type,
std::string const &path,
bool verbose = false,
std::string output_filepath = "");
RequestManager();
static RequestManager *get_request_manager();
size_t get_num_processed_requests();

int register_new_model(FFModel *model);
void register_tokenizer(ModelType model_type, std::string const &path);
void register_output_filepath(std::string const &);

FFModel *get_model(int model_id);
void serve(FFModel *model);

static GenerationResult generate(std::string const &text, int max_seq_length);
RequestGuid register_new_request(std::string const &prompt,
int max_sequence_length);
RequestGuid register_new_request(std::vector<TokenId> const &prompt,
int max_sequence_length);
BatchConfig prepare_next_batch(BatchConfig const &bc,
InferenceResult const &result);
BatchConfigFuture prepare_next_batch(BatchConfigFuture const &bc,
InferenceResultFuture const &result);
BeamSearchBatchConfig
prepare_next_batch_beam(BeamSearchBatchConfig const &old_bc,
BeamInferenceResult const &result);
BeamSearchBatchConfigFuture
prepare_next_batch_beam(BeamSearchBatchConfigFuture const &old_bc,
BeamInferenceResultFuture const &result);
BeamSearchBatchConfig
prepare_next_batch_init(TreeVerifyBatchConfig const &old_bc,
InferenceResult const &result,
int model_id);
BeamSearchBatchConfigFuture
prepare_next_batch_init(TreeVerifyBatchConfigFuture const &old_bc,
InferenceResultFuture const &result,
int model_id);
TreeVerifyBatchConfig prepare_next_batch_verify(
std::vector<BeamSearchBatchConfig> const &old_batches);
TreeVerifyBatchConfigFuture prepare_next_batch_verify(
std::vector<BeamSearchBatchConfigFuture> const &old_batches);

void store_beam_metadata(BeamSearchBatchConfig const &old_bc,
BeamInferenceResult const &result);
void update_beam_metadata(BeamSearchBatchConfig &new_bc,
BeamTree &tree,
int request_index);

std::vector<std::pair<BatchConfig::TokenId, int>>
traverse_beam_tree(BeamSearchBatchConfig const &old_bc,
int request_index,
int token_start_offset);

// remove guid after put the cached tree in request
std::vector<std::pair<BatchConfig::TokenId, int>> merge_dfs_trees(
std::vector<std::vector<std::pair<BatchConfig::TokenId, int>>>
input_trees,
int root_depth,
RequestGuid guid);

std::vector<std::pair<BatchConfig::TokenId, int>> traverse_verify_tree(
size_t guid,
std::vector<std::pair<BatchConfig::TokenId, int>> const
&inputSerializedTree,
std::vector<std::pair<BatchConfig::TokenId, int>> const
&outputSerializedTree);

static void
load_tokens_task(Legion::Task const *task,
std::vector<Legion::PhysicalRegion> const &regions,
Legion::Context ctx,
Legion::Runtime *runtime);
static void
load_positions_task(Legion::Task const *task,
std::vector<Legion::PhysicalRegion> const &regions,
Legion::Context ctx,
Legion::Runtime *runtime);

static BatchConfig prepare_next_batch_task(
Legion::Task const *task,
std::vector<Legion::PhysicalRegion> const &regions,
Legion::Context ctx,
Legion::Runtime *runtime);

static BeamSearchBatchConfig prepare_next_batch_beam_task(
Legion::Task const *task,
std::vector<Legion::PhysicalRegion> const &regions,
Legion::Context ctx,
Legion::Runtime *runtime);

static BeamSearchBatchConfig prepare_next_batch_init_task(
Legion::Task const *task,
std::vector<Legion::PhysicalRegion> const &regions,
Legion::Context ctx,
Legion::Runtime *runtime);

static TreeVerifyBatchConfig prepare_next_batch_verify_task(
Legion::Task const *task,
std::vector<Legion::PhysicalRegion> const &regions,
Legion::Context ctx,
Legion::Runtime *runtime);

static void llm_serving_background_task(
Legion::Task const *task,
std::vector<Legion::PhysicalRegion> const &regions,
Legion::Context ctx,
Legion::Runtime *runtime);

private:
std::unique_ptr<Tokenizer> tokenizer_;
bool verbose;
ModelType model_type;
std::string output_filepath;
std::queue<Request> pending_request_queue;
std::unordered_map<RequestGuid, Request> all_requests;
std::unordered_map<RequestGuid, GenerationResult> request_generation_results;
std::mutex request_queue_mutex;
RequestGuid next_available_guid;
const std::map<ModelType, int> model_bos_map = {{ModelType::LLAMA, 0},
{ModelType::OPT, 2}};

// TODO: Move this two vector to request struct
std::unordered_map<RequestGuid,
std::vector<std::pair<BatchConfig::TokenId, int>>>
dfs_tree_inputs;
std::unordered_map<RequestGuid, std::vector<std::pair<int, int>>>
committed_tokens;

// Multi-model support
int num_ssms;
std::vector<FFModel *> models;

// Performance profiling
size_t num_processed_requests;

private:
struct ProfileInfo {
int decoding_steps;
double start_time, finish_time;
};
std::unordered_map<RequestGuid, ProfileInfo> profiling_requests;
double total_request_run_time;
};

} // namespace FlexFlow
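
Everything deleted above (InferenceManager, Request, BeamTree, and RequestManager) moves into the new flexflow/request_manager.h header that the files earlier in this commit now include. For orientation, a minimal sketch of how the incremental-decoding entry points declared in that API might be driven; the prompt text, the request count, and the construction of the FFModel itself are illustrative assumptions, not part of this commit:

    #include "flexflow/model.h"
    #include "flexflow/request_manager.h"

    using namespace FlexFlow;

    // Sketch: drive incremental decoding for one queued prompt. Assumes `llm`
    // was already constructed and had its weights loaded elsewhere.
    void run_incr_decoding(FFModel &llm) {
      RequestManager *rm = RequestManager::get_request_manager();
      InferenceManager *im = InferenceManager::get_inference_manager();

      // One-time setup: place operators and allocate inference buffers.
      im->compile_model_and_allocate_buffer(&llm);
      im->init_operators_inference(&llm);

      // Queue a prompt; the returned RequestGuid identifies it internally.
      rm->register_new_request("The meaning of life is", /*max_sequence_length=*/128);

      // Step the model until every queued request has finished decoding.
      im->incr_decoding_loop(&llm, *rm, /*total_num_requests=*/1);
    }
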
5 changes: 5 additions & 0 deletions include/flexflow/model.h
@@ -17,6 +17,7 @@
 #include "accessor.h"
 #include "config.h"
 #include "device.h"
+#include "flexflow/inference.h"
 #include "flexflow/memory_optimization.h"
 #include "flexflow/node.h"
 #include "flexflow/operator_params.h"
@@ -698,6 +699,10 @@ class FFModel {
 float scaling_factor = 1.0f,
 bool qk_prod_scaling = true,
 char const *name = NULL);
+// ========================================
+// Inference APIs
+// ========================================
+GenerationResult generate(std::string const &text, int max_seq_length);
 
 Tensor create_tensor_legion_ordering(int num_dim,
 int const dims[],
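The new generate() method is the user-facing entry point for one-shot inference on a prompt. A hedged usage sketch; the surrounding model setup and any GenerationResult fields beyond output_tokens are assumptions that this diff does not show:

    #include "flexflow/model.h"
    #include <iostream>

    // Sketch: assumes `llm` was already built and compiled for inference;
    // those steps are outside this commit.
    void print_completion(FlexFlow::FFModel &llm) {
      FlexFlow::GenerationResult result =
          llm.generate("Deep learning is", /*max_seq_length=*/64);

      // output_tokens is the only GenerationResult field visible in this diff.
      for (auto token_id : result.output_tokens) {
        std::cout << token_id << " ";
      }
      std::cout << std::endl;
    }
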
1 change: 1 addition & 0 deletions include/flexflow/ops/inc_multihead_self_attention.h
@@ -1,6 +1,7 @@
 #ifndef _FLEXFLOW_INC_MULTIHEAD_SELF_ATTENTION_H
 #define _FLEXFLOW_INC_MULTIHEAD_SELF_ATTENTION_H
 
+#include "flexflow/accessor.h"
 #include "flexflow/device.h"
 #include "flexflow/fftype.h"
 #include "flexflow/inference.h"
1 change: 1 addition & 0 deletions include/flexflow/ops/inc_multiquery_self_attention.h
@@ -1,6 +1,7 @@
 #ifndef _FLEXFLOW_INC_MULTIQUERY_ATTENTION_H
 #define _FLEXFLOW_INC_MULTIQUERY_ATTENTION_H
 
+#include "flexflow/accessor.h"
 #include "flexflow/device.h"
 #include "flexflow/fftype.h"
 #include "flexflow/inference.h"
1 change: 1 addition & 0 deletions include/flexflow/ops/spec_inc_multihead_self_attention.h
@@ -1,6 +1,7 @@
 #ifndef _FLEXFLOW_SPEC_INC_MULTIHEAD_SELF_ATTENTION_H
 #define _FLEXFLOW_SPEC_INC_MULTIHEAD_SELF_ATTENTION_H
 
+#include "flexflow/accessor.h"
 #include "flexflow/device.h"
 #include "flexflow/fftype.h"
 #include "flexflow/inference.h"
2 changes: 2 additions & 0 deletions include/flexflow/ops/tree_inc_multihead_self_attention.h
@@ -1,6 +1,7 @@
 #ifndef _FLEXFLOW_INC_MULTIHEAD_SELF_ATTENTION_VERIFY_H
 #define _FLEXFLOW_INC_MULTIHEAD_SELF_ATTENTION_VERIFY_H
 
+#include "flexflow/accessor.h"
 #include "flexflow/device.h"
 #include "flexflow/fftype.h"
 #include "flexflow/inference.h"
@@ -9,6 +10,7 @@
 #include "flexflow/op_meta.h"
 #include "flexflow/operator.h"
 #include "flexflow/ops/inc_multihead_self_attention.h"
+#include "flexflow/ops/tree_inc_multihead_self_attention_params.h"
 #include "math.h"
 #include <cfloat>
 #include <complex>
(Diffs for the remaining changed files were not loaded.)
