diff --git a/inference/python/incr_decoding.py b/inference/python/incr_decoding.py
index 1df5a05a8f..232ef1699c 100644
--- a/inference/python/incr_decoding.py
+++ b/inference/python/incr_decoding.py
@@ -51,12 +51,12 @@ def get_configs():
         "tensor_parallelism_degree": 1,
         "pipeline_parallelism_degree": 2,
         "offload": False,
-        "offload_reserve_space_size": 8 * 1024, # 8GB
+        "offload_reserve_space_size": 8 * 1024,  # 8GB
         "use_4bit_quantization": False,
         "use_8bit_quantization": False,
         "enable_peft": False,
-        "peft_activation_reserve_space_size": 1024, # 1GB
-        "peft_weight_reserve_space_size": 1024, # 1GB
+        "peft_activation_reserve_space_size": 1024,  # 1GB
+        "peft_weight_reserve_space_size": 1024,  # 1GB
         "profiling": False,
         "benchmarking": False,
         "inference_debugging": False,
@@ -71,6 +71,7 @@ def get_configs():
         "full_precision": False,
         "prompt": "",
         "output_file": "",
+        "max_length": 128,
     }
     # Merge dictionaries
     ff_init_configs.update(llm_configs)
@@ -106,9 +107,9 @@ def main():
         max_seq_length=256,
         max_tokens_per_batch=64,
     )
-    
+
     llm.start_server()
-    
+
     if len(configs.prompt) > 0:
         prompts = [s for s in json.load(open(configs.prompt))]
         if "max_length" not in configs_dict:
@@ -119,8 +120,10 @@
         if "max_length" not in configs_dict:
             result = llm.generate("Three tips for staying healthy are: ")
         else:
-            result = llm.generate("Three tips for staying healthy are: ", max_length=configs.max_length)
-    
+            result = llm.generate(
+                "Three tips for staying healthy are: ", max_length=configs.max_length
+            )
+
     llm.stop_server()
 
 
diff --git a/inference/python/spec_infer.py b/inference/python/spec_infer.py
index 39529abda3..7ae752cffc 100644
--- a/inference/python/spec_infer.py
+++ b/inference/python/spec_infer.py
@@ -51,12 +51,12 @@ def get_configs():
         "tensor_parallelism_degree": 1,
         "pipeline_parallelism_degree": 2,
         "offload": False,
-        "offload_reserve_space_size": 8 * 1024, # 8GB
+        "offload_reserve_space_size": 8 * 1024,  # 8GB
         "use_4bit_quantization": False,
         "use_8bit_quantization": False,
         "enable_peft": False,
-        "peft_activation_reserve_space_size": 1024, # 1GB
-        "peft_weight_reserve_space_size": 1024, # 1GB
+        "peft_activation_reserve_space_size": 1024,  # 1GB
+        "peft_weight_reserve_space_size": 1024,  # 1GB
         "profiling": False,
         "benchmarking": False,
         "inference_debugging": False,
@@ -81,6 +81,7 @@ def get_configs():
         ],
         "prompt": "",
         "output_file": "",
+        "max_length": 128,
     }
     # Merge dictionaries
     ff_init_configs.update(llm_configs)
@@ -144,17 +145,26 @@ def main():
         max_tokens_per_batch=64,
         ssms=ssms,
     )
-    
+
     llm.start_server()
 
     if len(configs.prompt) > 0:
         prompts = [s for s in json.load(open(configs.prompt))]
-        results = llm.generate(prompts)
+        if "max_length" not in configs_dict:
+            results = llm.generate(prompts)
+        else:
+            results = llm.generate(prompts, max_length=configs.max_length)
     else:
-        result = llm.generate("Three tips for staying healthy are: ")
-    
+        if "max_length" not in configs_dict:
+            result = llm.generate("Three tips for staying healthy are: ")
+        else:
+            result = llm.generate(
+                "Three tips for staying healthy are: ", max_length=configs.max_length
+            )
+
     llm.stop_server()
 
+
 if __name__ == "__main__":
     print("flexflow inference example (speculative inference)")
     main()
diff --git a/src/ops/spec_inc_multihead_self_attention.cc b/src/ops/spec_inc_multihead_self_attention.cc
index aa74ecc6f5..6b2a4be507 100644
--- a/src/ops/spec_inc_multihead_self_attention.cc
+++ b/src/ops/spec_inc_multihead_self_attention.cc
@@ -170,7 +170,7 @@
 Op *SpecIncMultiHeadSelfAttention::create_operator_from_layer(
     Layer const *layer,
     std::vector<ParallelTensor> const &inputs) {
-  std::cout << "spec create operator: " << layer->name << "\n";
+  // std::cout << "spec create operator: " << layer->name << "\n";
   long long value;
   layer->get_int_property("embed_dim", value);
   int embed_dim = value;
@@ -182,10 +182,10 @@ Op *SpecIncMultiHeadSelfAttention::create_operator_from_layer(
   int kdim = value;
   layer->get_int_property("vdim", value);
   int vdim = value;
-  float dropout;
-  layer->get_float_property("dropout", dropout);
   layer->get_int_property("add_zero_attn", value);
   bool add_zero_attn = (bool)value;
+  float dropout;
+  layer->get_float_property("dropout", dropout);
   RotaryEmbeddingMeta rotary_embedding_meta;
   layer->get_int_property("apply_rotary_embedding", value);
   rotary_embedding_meta.apply_rotary_embedding = (bool)value;
diff --git a/src/ops/tree_inc_multihead_self_attention.cc b/src/ops/tree_inc_multihead_self_attention.cc
index ae0795ac1e..ac0011d9eb 100644
--- a/src/ops/tree_inc_multihead_self_attention.cc
+++ b/src/ops/tree_inc_multihead_self_attention.cc
@@ -163,6 +163,7 @@ Tensor FFModel::inc_multiquery_self_attention_verify(
                        rotary_embedding_meta.original_max_position_embeddings);
   li->add_int_property("scaling_query", scaling_query);
   li->add_float_property("scaling_factor", scaling_factor);
+  li->add_int_property("qk_prod_scaling", qk_prod_scaling);
   li->add_int_property("position_bias", position_bias);
   li->add_int_property("quantization_type", quantization_type);
   li->add_int_property("offload", offload);
@@ -187,10 +188,10 @@ Op *TreeIncMultiHeadSelfAttention::create_operator_from_layer(
   int kdim = value;
   layer->get_int_property("vdim", value);
   int vdim = value;
-  float dropout;
-  layer->get_float_property("dropout", dropout);
   layer->get_int_property("add_zero_attn", value);
   bool add_zero_attn = (bool)value;
+  float dropout;
+  layer->get_float_property("dropout", dropout);
   RotaryEmbeddingMeta rotary_embedding_meta;
   layer->get_int_property("apply_rotary_embedding", value);
   rotary_embedding_meta.apply_rotary_embedding = (bool)value;
@@ -203,6 +204,7 @@ Op *TreeIncMultiHeadSelfAttention::create_operator_from_layer(
                             rotary_embedding_meta.high_freq_factor);
   layer->get_int_property("original_max_position_embeddings", value);
   rotary_embedding_meta.original_max_position_embeddings = (int)value;
+  layer->get_int_property("scaling_query", value);
   bool scaling_query = (bool)value;
   float scaling_factor;
   layer->get_float_property("scaling_factor", scaling_factor);
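
Note on the Python changes: max_length is forwarded to llm.generate() only when the key is present in configs_dict, so generate() keeps its library default whenever a user-supplied config file omits it; the new "max_length": 128 entry only affects the built-in default config. The same dispatch can be written once with a kwargs dict instead of branching at every call site. The sketch below is illustrative only, not part of the patch: it assumes llm.generate() accepts max_length as an optional keyword (which the patch itself relies on), and run_generation is a hypothetical helper, not a function in the repository.

    import json

    def run_generation(llm, configs, configs_dict):
        # Forward max_length only when the user config supplied it, so
        # llm.generate() falls back to its own default otherwise.
        gen_kwargs = {}
        if "max_length" in configs_dict:
            gen_kwargs["max_length"] = configs_dict["max_length"]

        if len(configs.prompt) > 0:
            prompts = [s for s in json.load(open(configs.prompt))]
            return llm.generate(prompts, **gen_kwargs)
        return llm.generate("Three tips for staying healthy are: ", **gen_kwargs)

The behavior matches the branching in the patch; the kwargs form merely avoids repeating the two generate() variants in both scripts.

On the C++ side, the added layer->get_int_property("scaling_query", value) in tree_inc_multihead_self_attention.cc fixes a real bug: bool scaling_query = (bool)value; previously reinterpreted whatever value was left behind by the preceding property read. The new add_int_property("qk_prod_scaling", qk_prod_scaling) likewise records a property that the verify operator presumably reads back when it is reconstructed in create_operator_from_layer. The dropout/add_zero_attn reordering should be behavior-neutral, since properties are looked up by name rather than by position.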