Merge branch 'inference' into bug_fixes
goliaro authored Jan 9, 2024
2 parents c8d2cd1 + ba4af39 commit d37f5c5
Showing 8 changed files with 36 additions and 17 deletions.
2 changes: 2 additions & 0 deletions .dockerignore
@@ -17,3 +17,5 @@ python/flexflow/core/legion_cffi_header.py
 /inference/tokenizer/*
 /inference/prompt/*
 /inference/output/*
+
+/tests/inference/python_test_configs/*.json
3 changes: 2 additions & 1 deletion .gitignore
@@ -186,4 +186,5 @@ gpt_tokenizer
 # pip version
 python/flexflow/version.txt
 
-inference_tensors
+inference_tensors
+tests/inference/python_test_configs/*.json
23 changes: 14 additions & 9 deletions include/flexflow/batch_config.h
@@ -167,9 +167,10 @@ class BeamSearchBatchConfig : public BatchConfig {
     int current_depth = -1;
     int max_depth = MAX_BEAM_DEPTH;
 
-    BatchConfig::TokenId tokens[BeamSearchBatchConfig::MAX_BEAM_WIDTH];
-    float probs[BeamSearchBatchConfig::MAX_BEAM_WIDTH];
-    int parent_id[BeamSearchBatchConfig::MAX_BEAM_WIDTH];
+    BatchConfig::TokenId
+        tokens[BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES];
+    float probs[BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES];
+    int parent_id[BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES];
+    int sub_request_num;
   };
 
@@ -178,10 +179,11 @@ class BeamSearchBatchConfig : public BatchConfig {
   };
 
   BeamSearchPerRequestInfo beamRequestsInfo[MAX_NUM_REQUESTS];
-  BeamSearchPerTokenInfo beamTokenInfo[MAX_NUM_TOKENS * MAX_BEAM_WIDTH];
+  BeamSearchPerTokenInfo
+      beamTokenInfo[MAX_NUM_TOKENS +
+                    MAX_SPEC_TREE_TOKEN_NUM * MAX_NUM_REQUESTS];
 
-  // why is this == MAX_NUM_REQUESTS * MAX_BEAM_WIDTH?
-  int sub_requests[MAX_NUM_REQUESTS * MAX_BEAM_WIDTH];
+  int sub_requests[MAX_NUM_REQUESTS];
 
 private:
   size_t current_iteration;
@@ -190,9 +192,12 @@ class BeamSearchBatchConfig : public BatchConfig {
 struct BeamInferenceResult {
   static int const MAX_NUM_TOKENS = BatchConfig::MAX_NUM_TOKENS;
   BatchConfig::TokenId
-      token_ids[MAX_NUM_TOKENS * BeamSearchBatchConfig::MAX_BEAM_WIDTH];
-  float probs[MAX_NUM_TOKENS * BeamSearchBatchConfig::MAX_BEAM_WIDTH];
-  int parent_id[MAX_NUM_TOKENS * BeamSearchBatchConfig::MAX_BEAM_WIDTH];
+      token_ids[MAX_NUM_TOKENS *
+                BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES];
+  float probs[MAX_NUM_TOKENS *
+              BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES];
+  int parent_id[MAX_NUM_TOKENS *
+                BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES];
 };
 
 }; // namespace FlexFlow
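Net effect of this header change: every per-request beam array is now sized by the speculative-tree branch count rather than the beam width, and beamTokenInfo grows additively (a token budget plus a per-request tree budget) rather than multiplicatively. A minimal Python sketch of the capacity arithmetic, using illustrative constants (assumed values, not FlexFlow's actual configuration):

    # All four constants are assumptions, chosen only to show the arithmetic.
    MAX_NUM_TOKENS = 1024
    MAX_NUM_REQUESTS = 64
    MAX_BEAM_WIDTH = 3
    MAX_SPEC_TREE_TOKEN_NUM = 64

    old_capacity = MAX_NUM_TOKENS * MAX_BEAM_WIDTH  # 3072 entries
    new_capacity = MAX_NUM_TOKENS + MAX_SPEC_TREE_TOKEN_NUM * MAX_NUM_REQUESTS  # 5120 entries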
2 changes: 1 addition & 1 deletion include/flexflow/request_manager.h
@@ -76,7 +76,7 @@ struct BeamTree {
   struct treeLayer {
     BeamSearchBatchConfig::TokenId
         tokens[BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES];
-    int parent_ids[BeamSearchBatchConfig::MAX_BEAM_WIDTH];
+    int parent_ids[BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES];
     float probs[BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES];
     int nodes_num_this_layer = 0;
   };
2 changes: 1 addition & 1 deletion python/flexflow/core/flexflow_cffi.py
@@ -56,7 +56,7 @@ def get_c_name(name):
     if name is None:
         return ffi.NULL
     else:
-        return ffi.new("char[]", name.encode("ascii"))
+        return ffi.new("char[]", name.encode("utf-8"))
 
 
 def get_datatype_size(datatype):
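The encoding switch matters for any name containing non-ASCII characters: "ascii" raises where "utf-8" succeeds. A quick standalone sketch (plain Python, independent of FlexFlow):

    name = "café"
    print(name.encode("utf-8"))   # b'caf\xc3\xa9' -- any Python str encodes cleanly
    try:
        name.encode("ascii")      # the old behavior
    except UnicodeEncodeError as e:
        print(e)                  # 'ascii' codec can't encode character '\xe9' ...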
6 changes: 5 additions & 1 deletion src/c/flexflow_c.cc
@@ -1596,7 +1596,11 @@ flexflow_generation_result_t
   GenerationResult result = handle->generate(prompts, max_seq_length);
   DEBUG_PRINT(
       "[Model] generate %p %s %i", handle, text_str.c_str(), max_seq_length);
-  assert(result.output_tokens.size() <= max_seq_length);
+  // If the prompt exceeds max seq len, check that we return the prompt with no
+  // additional token. Otherwise, check that the output does not exceed the max
+  // sequence length.
+  assert(result.output_tokens.size() <= max_seq_length ||
+         result.output_tokens.size() == result.input_tokens.size());
   output_length_and_tokens[0] = result.output_tokens.size();
   std::copy(result.output_tokens.begin(),
             result.output_tokens.end(),
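The relaxed assertion accepts two outcomes: generation that fits the budget, or an over-long prompt returned verbatim with nothing appended. The same predicate restated as a small Python check (a sketch of the logic, not FlexFlow code):

    def output_length_ok(input_tokens, output_tokens, max_seq_length):
        # Normal case: generation stayed within the sequence-length budget.
        within_budget = len(output_tokens) <= max_seq_length
        # Overflow case: the prompt exceeded max_seq_length, so the output
        # is exactly the prompt with no generated tokens appended.
        prompt_echoed = len(output_tokens) == len(input_tokens)
        return within_budget or prompt_echoed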
14 changes: 10 additions & 4 deletions src/runtime/request_manager.cc
@@ -43,7 +43,8 @@ std::string LoadBytesFromFile(std::string const &path) {
 }
 
 RequestManager::RequestManager()
-    : verbose(false), next_available_guid(1000000), num_processed_requests(0) {
+    : verbose(false), next_available_guid(1000000), num_processed_requests(0),
+      total_request_run_time(0.0f) {
   // The following config parameters are set
   // during ffmodel.compile()
   // Initialize them to -1 to make sure no one
@@ -767,7 +768,9 @@ BeamSearchBatchConfig
             : 1;
     new_bc.beamRequestsInfo[i].max_depth =
         std::min(new_max_depth, BeamSearchBatchConfig::MAX_BEAM_DEPTH);
-    for (int j = 0; j < BeamSearchBatchConfig::MAX_BEAM_WIDTH; j++) {
+    for (int j = 0;
+         j < BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES;
+         j++) {
       new_bc.beamRequestsInfo[i].parent_id[j] = 0;
       new_bc.beamRequestsInfo[i].probs[j] = 1;
     }
@@ -840,7 +843,8 @@ BeamSearchBatchConfig
             ? spec_infer_tree_width[ssm_decoding_steps]
             : 1;
     new_bc.beamRequestsInfo[i].max_depth = 0;
-    for (int j = 0; j < BeamSearchBatchConfig::MAX_BEAM_WIDTH; j++) {
+    for (int j = 0; j < BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES;
+         j++) {
       new_bc.beamRequestsInfo[i].parent_id[j] = 0;
       new_bc.beamRequestsInfo[i].probs[j] = 1;
     }
@@ -900,7 +904,9 @@ BeamSearchBatchConfig
         std::min(BeamSearchBatchConfig::MAX_BEAM_DEPTH,
                  get_max_tokens_per_batch() -
                      new_bc.requestsInfo[i].num_tokens_in_batch - 1);
-    for (int j = 0; j < BeamSearchBatchConfig::MAX_BEAM_WIDTH; j++) {
+    for (int j = 0;
+         j < BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES;
+         j++) {
       new_bc.beamRequestsInfo[i].parent_id[j] = 0;
       new_bc.beamRequestsInfo[i].probs[j] = 1;
     }
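All three loops now reset per-branch state with the same bound. A sketch of the shared pattern they implement (hypothetical names and value, not the FlexFlow API):

    MAX_SPECULATIVE_TREE_BRANCHES = 3  # assumed value, for illustration

    def reset_branch_state(request_info):
        # Zero the parent pointers and reset probabilities for every
        # speculative branch, mirroring the three initialization loops above.
        for j in range(MAX_SPECULATIVE_TREE_BRANCHES):
            request_info["parent_id"][j] = 0
            request_info["probs"][j] = 1.0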
1 change: 1 addition & 0 deletions tests/inference/python_inference_tests.sh
@@ -6,6 +6,7 @@ set -e
 cd "${BASH_SOURCE[0]%/*}"
 
 # Generate test configs
+rm -rf python_test_configs/*.json
 python python_test_configs/generate_configs.py
 
 # Run all tests
