Commit e56aede: fix
goliaro committed Oct 15, 2024 (1 parent: eeafdc7)
Showing 4 changed files with 34 additions and 19 deletions.
inference/python/incr_decoding.py (17 changes: 10 additions & 7 deletions)

@@ -51,12 +51,12 @@ def get_configs():
"tensor_parallelism_degree": 1,
"pipeline_parallelism_degree": 2,
"offload": False,
"offload_reserve_space_size": 8 * 1024, # 8GB
"offload_reserve_space_size": 8 * 1024, # 8GB
"use_4bit_quantization": False,
"use_8bit_quantization": False,
"enable_peft": False,
"peft_activation_reserve_space_size": 1024, # 1GB
"peft_weight_reserve_space_size": 1024, # 1GB
"peft_activation_reserve_space_size": 1024, # 1GB
"peft_weight_reserve_space_size": 1024, # 1GB
"profiling": False,
"benchmarking": False,
"inference_debugging": False,
@@ -71,6 +71,7 @@ def get_configs():
"full_precision": False,
"prompt": "",
"output_file": "",
"max_length": 128,
}
# Merge dictionaries
ff_init_configs.update(llm_configs)
@@ -106,9 +107,9 @@ def main():
         max_seq_length=256,
         max_tokens_per_batch=64,
     )

     llm.start_server()

     if len(configs.prompt) > 0:
         prompts = [s for s in json.load(open(configs.prompt))]
         if "max_length" not in configs_dict:
@@ -119,8 +120,10 @@ def main():
if "max_length" not in configs_dict:
result = llm.generate("Three tips for staying healthy are: ")
else:
result = llm.generate("Three tips for staying healthy are: ", max_length=configs.max_length)

result = llm.generate(
"Three tips for staying healthy are: ", max_length=configs.max_length
)

llm.stop_server()


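The two-branch call above exists so that llm.generate only receives max_length when the user actually set it, letting FlexFlow's internal default apply otherwise. The same effect can be had in one call by forwarding a kwargs dict; this is a minimal sketch, not part of the commit, and it assumes only what the diff shows (llm, configs, and configs_dict built as in incr_decoding.py, and an optional max_length keyword on llm.generate):

    # Forward max_length only when the config file supplied it; otherwise the
    # library default applies, matching the two-branch version in the diff.
    gen_kwargs = {}
    if "max_length" in configs_dict:
        gen_kwargs["max_length"] = configs.max_length
    result = llm.generate("Three tips for staying healthy are: ", **gen_kwargs)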
inference/python/spec_infer.py (24 changes: 17 additions & 7 deletions)

@@ -51,12 +51,12 @@ def get_configs():
"tensor_parallelism_degree": 1,
"pipeline_parallelism_degree": 2,
"offload": False,
"offload_reserve_space_size": 8 * 1024, # 8GB
"offload_reserve_space_size": 8 * 1024, # 8GB
"use_4bit_quantization": False,
"use_8bit_quantization": False,
"enable_peft": False,
"peft_activation_reserve_space_size": 1024, # 1GB
"peft_weight_reserve_space_size": 1024, # 1GB
"peft_activation_reserve_space_size": 1024, # 1GB
"peft_weight_reserve_space_size": 1024, # 1GB
"profiling": False,
"benchmarking": False,
"inference_debugging": False,
@@ -81,6 +81,7 @@ def get_configs():
         ],
         "prompt": "",
         "output_file": "",
+        "max_length": 128,
     }
     # Merge dictionaries
     ff_init_configs.update(llm_configs)
@@ -144,17 +145,26 @@ def main():
         max_tokens_per_batch=64,
         ssms=ssms,
     )

     llm.start_server()

     if len(configs.prompt) > 0:
         prompts = [s for s in json.load(open(configs.prompt))]
-        results = llm.generate(prompts)
+        if "max_length" not in configs_dict:
+            results = llm.generate(prompts)
+        else:
+            results = llm.generate(prompts, max_length=configs.max_length)
     else:
-        result = llm.generate("Three tips for staying healthy are: ")
+        if "max_length" not in configs_dict:
+            result = llm.generate("Three tips for staying healthy are: ")
+        else:
+            result = llm.generate(
+                "Three tips for staying healthy are: ", max_length=configs.max_length
+            )

     llm.stop_server()


 if __name__ == "__main__":
     print("flexflow inference example (speculative inference)")
     main()
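When configs.prompt is non-empty, both scripts treat it as the path to a JSON file holding a list of prompt strings and generate for the whole batch. Below is a small sketch of preparing such inputs; the file names and values are illustrative assumptions, not taken from this commit:

    # Write a prompts file plus a config fragment exercising the new optional
    # "max_length" knob; "prompts.json" and "config.json" are hypothetical names.
    import json

    with open("prompts.json", "w") as f:
        json.dump(["Three tips for staying healthy are: ",
                   "The three best programming languages are: "], f)

    with open("config.json", "w") as f:
        json.dump({"prompt": "prompts.json", "max_length": 64}, f)

With "max_length" present in the merged config, the new branch calls llm.generate(prompts, max_length=configs.max_length); with it absent, the pre-commit call is preserved unchanged.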
src/ops/spec_inc_multihead_self_attention.cc (6 changes: 3 additions & 3 deletions)

@@ -170,7 +170,7 @@ Op *SpecIncMultiHeadSelfAttention::create_operator_from_layer(
     Layer const *layer,
     std::vector<ParallelTensor> const &inputs) {

-  std::cout << "spec create operator: " << layer->name << "\n";
+  // std::cout << "spec create operator: " << layer->name << "\n";
   long long value;
   layer->get_int_property("embed_dim", value);
   int embed_dim = value;
@@ -182,10 +182,10 @@ Op *SpecIncMultiHeadSelfAttention::create_operator_from_layer(
   int kdim = value;
   layer->get_int_property("vdim", value);
   int vdim = value;
-  float dropout;
-  layer->get_float_property("dropout", dropout);
   layer->get_int_property("add_zero_attn", value);
   bool add_zero_attn = (bool)value;
+  float dropout;
+  layer->get_float_property("dropout", dropout);
   RotaryEmbeddingMeta rotary_embedding_meta;
   layer->get_int_property("apply_rotary_embedding", value);
   rotary_embedding_meta.apply_rotary_embedding = (bool)value;
src/ops/tree_inc_multihead_self_attention.cc (6 changes: 4 additions & 2 deletions)

@@ -163,6 +163,7 @@ Tensor FFModel::inc_multiquery_self_attention_verify(
       rotary_embedding_meta.original_max_position_embeddings);
   li->add_int_property("scaling_query", scaling_query);
   li->add_float_property("scaling_factor", scaling_factor);
+  li->add_int_property("qk_prod_scaling", qk_prod_scaling);
   li->add_int_property("position_bias", position_bias);
   li->add_int_property("quantization_type", quantization_type);
   li->add_int_property("offload", offload);
@@ -187,10 +188,10 @@ Op *TreeIncMultiHeadSelfAttention::create_operator_from_layer(
   int kdim = value;
   layer->get_int_property("vdim", value);
   int vdim = value;
-  float dropout;
-  layer->get_float_property("dropout", dropout);
   layer->get_int_property("add_zero_attn", value);
   bool add_zero_attn = (bool)value;
+  float dropout;
+  layer->get_float_property("dropout", dropout);
   RotaryEmbeddingMeta rotary_embedding_meta;
   layer->get_int_property("apply_rotary_embedding", value);
   rotary_embedding_meta.apply_rotary_embedding = (bool)value;
@@ -203,6 +204,7 @@ Op *TreeIncMultiHeadSelfAttention::create_operator_from_layer(
       rotary_embedding_meta.high_freq_factor);
   layer->get_int_property("original_max_position_embeddings", value);
   rotary_embedding_meta.original_max_position_embeddings = (int)value;
+  layer->get_int_property("scaling_query", value);
   bool scaling_query = (bool)value;
   float scaling_factor;
   layer->get_float_property("scaling_factor", scaling_factor);
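The two hunks above carry the substantive fix in this file: inc_multiquery_self_attention_verify never stored a "qk_prod_scaling" property for the later read to find, and create_operator_from_layer cast scaling_query from value without first reading the "scaling_query" property, so it reused whatever the preceding original_max_position_embeddings read had left in value. A short Python sketch of that stale-out-parameter hazard (illustrative only; the dict stands in for the layer's property table):

    # Every integer property is funneled through one reused out-parameter.
    props = {"original_max_position_embeddings": 4096, "scaling_query": 0}

    value = props["original_max_position_embeddings"]  # value is now 4096
    # Pre-commit: the "scaling_query" read was missing here, so the cast below
    # saw the stale 4096 and always yielded True.
    scaling_query = bool(value)

    # Post-commit: re-read the property first, recovering the stored False.
    value = props["scaling_query"]
    scaling_query = bool(value)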
