[SpecInfer] Update RequestManager #1096

Merged: 43 commits, merged Sep 25, 2023

Commits
6451a3b  Reorder pipeline. (Aug 18, 2023)
6613f90  Merge branch 'inference' into update_rm (zwang86, Aug 18, 2023)
1de3e21  refactor and small fixes. (Aug 22, 2023)
46344ed  Merge branch 'inference' into update_rm (zwang86, Aug 22, 2023)
a15d814  Merge branch 'inference' into update_rm (zwang86, Aug 23, 2023)
c37235f  Update (zwang86, Aug 25, 2023)
3c93dbf  Merge branch 'inference' into update_rm (zwang86, Aug 28, 2023)
d18926f  Refactor backup. (zwang86, Sep 5, 2023)
99bb696  pipeline update. (zwang86, Sep 5, 2023)
83ae640  Merge branch 'inference' into update_rm_backup (zwang86, Sep 5, 2023)
e6f2474  Format. (zwang86, Sep 5, 2023)
c758c9f  fix (xinhaoc, Sep 7, 2023)
0b6b146  . (xinhaoc, Sep 7, 2023)
709ce3c  Merge branch 'inference' into update_rm_backup (zwang86, Sep 7, 2023)
683c283  fix (xinhaoc, Sep 11, 2023)
d44c1a1  fix (xinhaoc, Sep 11, 2023)
35a33e5  fix. (zwang86, Sep 11, 2023)
0d7524a  Fix reloading new request with long prompts. (zwang86, Sep 11, 2023)
7c8227d  Fix edge cases. (zwang86, Sep 11, 2023)
230e0bc  Fix edge case (zwang86, Sep 11, 2023)
9ed2684  fix (zwang86, Sep 11, 2023)
87ef9cb  try a fix to CI (xinhaoc, Sep 12, 2023)
8898493  . (xinhaoc, Sep 12, 2023)
e328e2d  fix (xinhaoc, Sep 12, 2023)
960e938  Merge branch 'inference' into update_rm_backup (zwang86, Sep 12, 2023)
3a25189  Fix: clean up code and fix decoding_steps. (zwang86, Sep 13, 2023)
c66a205  Merge branch 'update_rm_backup' of https://github.com/flexflow/FlexFl… (zwang86, Sep 13, 2023)
c7f1b9e  try 1 try (xinhaoc, Sep 14, 2023)
55eb913  fix: allow parse 0 tokens for pending request. (zwang86, Sep 16, 2023)
b88c4de  format. (zwang86, Sep 16, 2023)
abcf94f  remove comment tests (xinhaoc, Sep 16, 2023)
2327316  Merge branch 'inference' into update_rm_backup (xinhaoc, Sep 18, 2023)
66ee367  remove print. (zwang86, Sep 19, 2023)
8e4fe9a  Merge branch 'update_rm_backup' of https://github.com/flexflow/FlexFl… (zwang86, Sep 19, 2023)
2769dcb  Merge branch 'inference' into update_rm_backup (jiazhihao, Sep 21, 2023)
bf382b4  Merge branch 'inference' into update_rm_backup (xinhaoc, Sep 24, 2023)
801c56c  fix decoding steps (xinhaoc, Sep 24, 2023)
1d18fce  . (xinhaoc, Sep 24, 2023)
aed8850  quick fix. (zwang86, Sep 25, 2023)
6638cd3  Merge branch 'inference' into update_rm_backup (zwang86, Sep 25, 2023)
a39fb5b  remove debugging prints. (zwang86, Sep 25, 2023)
84a6fba  fix store_beam_metadata. (zwang86, Sep 25, 2023)
59acaeb  hip (xinhaoc, Sep 25, 2023)

Files changed
include/flexflow/batch_config.h (3 additions, 2 deletions)
@@ -46,13 +46,14 @@ class BatchConfig {
void print() const;
virtual InferenceMode get_mode() const;
static BatchConfig const *from_future(BatchConfigFuture const &future);
- static int const MAX_NUM_REQUESTS = 1;
+ static int const MAX_NUM_REQUESTS = 4;
static int const MAX_NUM_TOKENS = 64;
static int const MAX_PROMPT_LENGTH = 62;
static int const MAX_SEQ_LENGTH = 256;

// These are set by update
int num_tokens;
+ bool loading_prompt = false;

struct PerRequestInfo {
int token_start_offset;
@@ -69,6 +70,7 @@
PerTokenInfo tokensInfo[MAX_NUM_TOKENS];

bool request_completed[MAX_NUM_REQUESTS];
+ bool request_running[MAX_NUM_TOKENS];
};

class TreeVerifyBatchConfig : public BatchConfig {
@@ -113,7 +115,6 @@ class BeamSearchBatchConfig : public BatchConfig {
inline static int const MAX_BEAM_DEPTH = 8;

int model_id;
- int max_init_length = 0;

struct BeamSearchPerRequestInfo {
int beam_size;
include/flexflow/model.h (3 additions, 2 deletions)
@@ -239,8 +239,8 @@ enum TaskIDs {
RM_LOAD_TOKENS_TASK_ID,
RM_LOAD_POSITION_TASK_ID,
RM_PREPARE_NEXT_BATCH_TASK_ID,
- RM_PREPARE_NEXT_BATCH_BEAM_TASK_ID,
RM_PREPARE_NEXT_BATCH_INIT_TASK_ID,
+ RM_PREPARE_NEXT_BATCH_BEAM_TASK_ID,
RM_PREPARE_NEXT_BATCH_VERIFY_TASK_ID,
// Custom tasks
CUSTOM_GPU_TASK_ID_FIRST,
@@ -787,7 +787,8 @@ class FFModel {
// ========================================
// Inference APIs
// ========================================
- GenerationResult generate(std::string const &text, int max_seq_length);
+ GenerationResult generate(std::vector<std::string> &prompts,
+                           int max_seq_length);

Tensor create_tensor_legion_ordering(int num_dim,
int const dims[],
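Note on the FFModel::generate change above: the API now takes a batch of prompts rather than a single string. Below is a minimal usage sketch, assuming an FFModel that has already been built and compiled elsewhere (for example, as in inference/incr_decoding/incr_decoding.cc); the run_batch helper and the prompt strings are illustrative and not part of this PR.

#include <string>
#include <vector>
#include "flexflow/model.h"

using namespace FlexFlow;

// Illustrative helper (not from this PR): drive one batched generation call.
void run_batch(FFModel &model) {
  std::vector<std::string> prompts;
  prompts.push_back("What is speculative inference?");
  prompts.push_back("Explain beam search in one sentence.");
  // One call now serves every prompt; the RequestManager can keep up to
  // BatchConfig::MAX_NUM_REQUESTS (raised to 4 in this PR) requests in flight.
  GenerationResult result = model.generate(prompts, 128 /*max_sequence_length*/);
  (void)result; // result.output_tokens holds the generated token ids
}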
include/flexflow/ops/kernels/softmax_kernels.h (2 additions, 0 deletions)
@@ -15,8 +15,10 @@ class SoftmaxMeta : public OpMeta {
Legion::Domain const &input_domain);
#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA)
cudnnTensorDescriptor_t inputTensor;
+ cudnnTensorDescriptor_t outputTensor;
#else
miopenTensorDescriptor_t inputTensor;
+ miopenTensorDescriptor_t outputTensor;
#endif
bool profiling;
int dim;
include/flexflow/request_manager.h (9 additions, 5 deletions)
@@ -52,13 +52,17 @@ class InferenceManager {

struct Request {
enum Status {
- PENDING = 101,
- RUNNING = 102,
- COMPLETED = 103,
+ PENDING = 101, // loading prompt
+ RUNNING = 102, // running inference
+ COMPLETED = 103, // finished and verified
+ FINISHING = 104, // finishing request, but not yet verified
};
BatchConfig::RequestGuid guid;
int max_sequence_length;
int initial_len;
+ int ssm_cache_size = 0;
+ int llm_cache_size = 0;

Status status = PENDING;
std::vector<BatchConfig::TokenId> tokens;

@@ -102,10 +106,10 @@
FFModel *get_model(int model_id);

GenerationResult generate_incr_decoding(FFModel *model,
- std::string const &text,
+ std::vector<std::string> &prompts,
int max_seq_length);
GenerationResult generate_spec_infer(FFModel *model,
- std::string const &text,
+ std::vector<std::string> &prompts,
int max_seq_length);
GenerationResult get_generation_result(RequestGuid const &guid);
RequestGuid register_new_request(std::string const &prompt,
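For context on the Request changes above: the enum now documents the request lifecycle and adds a FINISHING state. The helper below is illustrative only, not RequestManager code; it simply spells out the lifecycle stages as described by the enum comments, and assumes the FlexFlow namespace from include/flexflow/request_manager.h.

#include <string>
#include "flexflow/request_manager.h"

using namespace FlexFlow;

// Illustrative only: the stage each status corresponds to, per the comments
// in the header (PENDING -> RUNNING -> FINISHING -> COMPLETED).
std::string describe_status(Request::Status s) {
  switch (s) {
    case Request::PENDING:
      return "pending: prompt tokens are still being loaded";
    case Request::RUNNING:
      return "running: inference is in progress";
    case Request::FINISHING:
      return "finishing: request is wrapping up but not yet verified";
    case Request::COMPLETED:
      return "completed: finished and verified";
  }
  return "unknown";
}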
inference/incr_decoding/incr_decoding.cc (4 additions, 2 deletions)
@@ -242,13 +242,15 @@ void FlexFlow::top_level_task(Task const *task,
/*parser_callback_t */ nullptr,
/*allow_exceptions */ true,
/*ignore_comments */ true);
+ std::vector<std::string> prompts;
for (auto &prompt : prompt_json) {
std::string text = prompt.get<std::string>();
printf("Prompt[%d]: %s\n", total_num_requests, text.c_str());
total_num_requests++;
- GenerationResult result =
- model.generate(text, 128 /*max_sequence_length*/);
+ prompts.push_back(text);
}
+ GenerationResult result =
+ model.generate(prompts, 128 /*max_sequence_length*/);
}

// Execution fence
inference/spec_infer/spec_infer.cc (5 additions, 1 deletion)
@@ -384,12 +384,16 @@ void FlexFlow::top_level_task(Task const *task,
/*parser_callback_t */ nullptr,
/*allow_exceptions */ true,
/*ignore_comments */ true);

+ std::vector<std::string> prompts;
for (auto &prompt : prompt_json) {
std::string text = prompt.get<std::string>();
printf("Prompt[%d]: %s\n", total_num_requests, text.c_str());
total_num_requests++;
- tree_model.generate(text, 128 /*max_sequence_length*/);
+ prompts.push_back(text);
+ // tree_model.generate(text, 128 /*max_sequence_length*/);
}
+ tree_model.generate(prompts, 128 /*max_sequence_length*/);
}

// Execution fence
src/c/flexflow_c.cc (3 additions, 1 deletion)
@@ -1529,8 +1529,10 @@ flexflow_generation_result_t
int max_seq_length,
int *output_length_and_tokens) {
FFModel *handle = FFCObjectWrapper::unwrap(handle_);
+ std::vector<std::string> prompts;
std::string const text_str(input_text);
- GenerationResult result = handle->generate(text_str, max_seq_length);
+ prompts.push_back(input_text);
+ GenerationResult result = handle->generate(prompts, max_seq_length);
DEBUG_PRINT("[Model] generate %p %s %i", handle, text_str, max_seq_length);
assert(result.output_tokens.size() <= max_seq_length);
output_length_and_tokens[0] = result.output_tokens.size();
src/mapper/mapper.cc (1 addition, 1 deletion)
@@ -284,8 +284,8 @@ void FFMapper::select_task_options(const MapperContext ctx,
return;
}
if ((task.task_id == RM_PREPARE_NEXT_BATCH_TASK_ID) ||
- (task.task_id == RM_PREPARE_NEXT_BATCH_BEAM_TASK_ID) ||
(task.task_id == RM_PREPARE_NEXT_BATCH_INIT_TASK_ID) ||
+ (task.task_id == RM_PREPARE_NEXT_BATCH_BEAM_TASK_ID) ||
(task.task_id == RM_PREPARE_NEXT_BATCH_VERIFY_TASK_ID)) {
output.initial_proc = all_cpus[0];
return;
src/ops/argmax.cpp (1 addition, 1 deletion)
@@ -393,7 +393,7 @@ void ArgMax::forward_kernel(ArgMaxMeta const *m,

if (m->beam_search) {
// set all parents id zero in arg top1 case.
- checkCUDA(hipMemset(parent, 0, batch_size * sizeof(int)));
+ checkCUDA(hipMemsetAsync(parent, 0, batch_size * sizeof(int), stream));
}
int num_shards = 0;
int k = 1;
src/ops/argmax.cu (2 additions, 2 deletions)
@@ -59,7 +59,7 @@ void ArgMax::forward_kernel(ArgMaxMeta const *m,
DT alpha = 1.0f, beta = 0.0f;
if (m->beam_search) {
// set all parents id zero in arg top1 case.
- checkCUDA(cudaMemset(parent, 0, batch_size * sizeof(int)));
+ checkCUDA(cudaMemsetAsync(parent, 0, batch_size * sizeof(int), stream));
}
size_t temp_storage_bytes = m->temp_storage_bytes;
// use cub
@@ -83,6 +83,7 @@
prob_ptr,
batch_size,
m->beam_search);
+ // print_tensor<int>(indices_ptr, 32, "argmax op");
}

/*static*/
@@ -93,7 +94,6 @@ void ArgMax::forward_kernel_wrapper(ArgMaxMeta const *m,
int batch_size) {
cudaStream_t stream;
checkCUDA(get_legion_stream(&stream));

cudaEvent_t t_start, t_end;
if (m->profiling) {
cudaEventCreate(&t_start);
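Context for the hipMemset / cudaMemset changes above: switching to the Async variant enqueues the memset on the captured Legion stream, keeping it ordered with the kernels that later read parent while avoiding the implicit device-wide synchronization that default-stream calls can trigger. A standalone sketch of the pattern (generic CUDA, not FlexFlow code; all names are placeholders):

#include <cstdio>
#include <cuda_runtime.h>

// Writes out[i] = 1 for every slot whose parent id still holds the zeroed default.
__global__ void count_default_parents(int const *parent, int *out, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    out[i] = (parent[i] == 0) ? 1 : 0;
  }
}

int main() {
  int const batch_size = 4;
  int *parent = nullptr, *out = nullptr;
  cudaStream_t stream;
  cudaStreamCreate(&stream);
  cudaMalloc(&parent, batch_size * sizeof(int));
  cudaMalloc(&out, batch_size * sizeof(int));
  // Enqueued on `stream`: the kernel below is guaranteed to observe the zeroed
  // buffer without any device-wide synchronization.
  cudaMemsetAsync(parent, 0, batch_size * sizeof(int), stream);
  count_default_parents<<<1, 32, 0, stream>>>(parent, out, batch_size);
  int host_out[4];
  cudaMemcpyAsync(host_out, out, batch_size * sizeof(int), cudaMemcpyDeviceToHost, stream);
  cudaStreamSynchronize(stream);
  printf("%d %d %d %d\n", host_out[0], host_out[1], host_out[2], host_out[3]);
  cudaFree(parent);
  cudaFree(out);
  cudaStreamDestroy(stream);
  return 0;
}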
src/ops/kernels/softmax.cpp (4 additions, 1 deletion)
@@ -29,6 +29,9 @@ SoftmaxMeta::SoftmaxMeta(FFHandler handler,
checkCUDNN(miopenCreateTensorDescriptor(&inputTensor));
checkCUDNN(
cudnnSetTensorDescriptorFromDomain4SoftMax(inputTensor, input_domain));
+ checkCUDNN(miopenCreateTensorDescriptor(&outputTensor));
+ checkCUDNN(
+ cudnnSetTensorDescriptorFromDomain4SoftMax(outputTensor, input_domain));
dim = softmax->dim;
profiling = softmax->profiling;
std::strcpy(op_name, softmax->name);
@@ -127,7 +130,7 @@ void forward_kernel(SoftmaxMeta const *m,
m->inputTensor,
input_ptr,
&beta,
- m->inputTensor,
+ m->outputTensor,
output_ptr,
MIOPEN_SOFTMAX_ACCURATE,
MIOPEN_SOFTMAX_MODE_CHANNEL));
src/ops/kernels/softmax.cu (4 additions, 2 deletions)
@@ -28,6 +28,9 @@ SoftmaxMeta::SoftmaxMeta(FFHandler handler,
checkCUDNN(cudnnCreateTensorDescriptor(&inputTensor));
checkCUDNN(cudnnSetTensorDescriptorFromDomain4SoftMax(
inputTensor, input_domain, softmax->data_type));
+ checkCUDNN(cudnnCreateTensorDescriptor(&outputTensor));
+ checkCUDNN(cudnnSetTensorDescriptorFromDomain4SoftMax(
+ outputTensor, input_domain, softmax->data_type));
dim = softmax->dim;
profiling = softmax->profiling;
std::strcpy(op_name, softmax->name);
@@ -42,7 +45,6 @@
DT *output_ptr) {
cudaStream_t stream;
checkCUDA(get_legion_stream(&stream));

cudaEvent_t t_start, t_end;
if (m->profiling) {
cudaEventCreate(&t_start);
@@ -127,7 +129,7 @@ void forward_kernel(SoftmaxMeta const *m,
m->inputTensor,
input_ptr,
&beta,
- m->inputTensor,
+ m->outputTensor,
output_ptr));
}

src/ops/spec_inc_multihead_self_attention.cu (7 additions, 1 deletion)
@@ -251,6 +251,7 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m,
if (bc->request_completed[i]) {
continue;
}

for (int sub_req_id = 0; sub_req_id < bc->sub_requests[i]; sub_req_id++) {

// int num_new_tokens = bc->num_processing_tokens[i];
@@ -259,6 +260,11 @@
int num_new_tokens = bc->requestsInfo[i].num_tokens_in_batch;
int total_tokens = bc->requestsInfo[i].token_start_offset +
bc->requestsInfo[i].num_tokens_in_batch;

+ if (num_new_tokens <= 0) {
+ continue;
+ }

// Compute (QK^T/sqrt(d_k))
int m_ = num_new_tokens;
int n = total_tokens;
@@ -543,7 +549,7 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m,
output_ptr, bias_ptr, num_tokens, qkv_weight_size, m->oProjSize);
}

- assert(tokens_previous_requests == num_tokens);
+ // assert(tokens_previous_requests == num_tokens);
}

template <typename DT>