Commit e56aede: fix
goliaro committed Oct 15, 2024 (1 parent: eeafdc7)
Showing 4 changed files with 34 additions and 19 deletions.
inference/python/incr_decoding.py (17 changes: 10 additions & 7 deletions)

@@ -51,12 +51,12 @@ def get_configs():
"tensor_parallelism_degree": 1,
"pipeline_parallelism_degree": 2,
"offload": False,
"offload_reserve_space_size": 8 * 1024, # 8GB
"offload_reserve_space_size": 8 * 1024, # 8GB
"use_4bit_quantization": False,
"use_8bit_quantization": False,
"enable_peft": False,
"peft_activation_reserve_space_size": 1024, # 1GB
"peft_weight_reserve_space_size": 1024, # 1GB
"peft_activation_reserve_space_size": 1024, # 1GB
"peft_weight_reserve_space_size": 1024, # 1GB
"profiling": False,
"benchmarking": False,
"inference_debugging": False,
@@ -71,6 +71,7 @@ def get_configs():
"full_precision": False,
"prompt": "",
"output_file": "",
"max_length": 128,
}
# Merge dictionaries
ff_init_configs.update(llm_configs)
@@ -106,9 +107,9 @@ def main():
         max_seq_length=256,
         max_tokens_per_batch=64,
     )

     llm.start_server()

     if len(configs.prompt) > 0:
         prompts = [s for s in json.load(open(configs.prompt))]
         if "max_length" not in configs_dict:
@@ -119,8 +120,10 @@ def main():
if "max_length" not in configs_dict:
result = llm.generate("Three tips for staying healthy are: ")
else:
result = llm.generate("Three tips for staying healthy are: ", max_length=configs.max_length)

result = llm.generate(
"Three tips for staying healthy are: ", max_length=configs.max_length
)

llm.stop_server()


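The two-branch call above exists so that llm.generate only receives max_length when the user actually set it, letting FlexFlow's internal default apply otherwise. The same effect can be had in one call by forwarding a kwargs dict; this is a minimal sketch, not part of the commit, and it assumes only what the diff shows (llm, configs, and configs_dict built as in incr_decoding.py, and an optional max_length keyword on llm.generate):

    # Forward max_length only when the config file supplied it; otherwise the
    # library default applies, matching the two-branch version in the diff.
    gen_kwargs = {}
    if "max_length" in configs_dict:
        gen_kwargs["max_length"] = configs.max_length
    result = llm.generate("Three tips for staying healthy are: ", **gen_kwargs)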
inference/python/spec_infer.py (24 changes: 17 additions & 7 deletions)

@@ -51,12 +51,12 @@ def get_configs():
"tensor_parallelism_degree": 1,
"pipeline_parallelism_degree": 2,
"offload": False,
"offload_reserve_space_size": 8 * 1024, # 8GB
"offload_reserve_space_size": 8 * 1024, # 8GB
"use_4bit_quantization": False,
"use_8bit_quantization": False,
"enable_peft": False,
"peft_activation_reserve_space_size": 1024, # 1GB
"peft_weight_reserve_space_size": 1024, # 1GB
"peft_activation_reserve_space_size": 1024, # 1GB
"peft_weight_reserve_space_size": 1024, # 1GB
"profiling": False,
"benchmarking": False,
"inference_debugging": False,
@@ -81,6 +81,7 @@ def get_configs():
         ],
         "prompt": "",
         "output_file": "",
+        "max_length": 128,
     }
     # Merge dictionaries
     ff_init_configs.update(llm_configs)
@@ -144,17 +145,26 @@ def main():
         max_tokens_per_batch=64,
         ssms=ssms,
     )

     llm.start_server()

     if len(configs.prompt) > 0:
         prompts = [s for s in json.load(open(configs.prompt))]
-        results = llm.generate(prompts)
+        if "max_length" not in configs_dict:
+            results = llm.generate(prompts)
+        else:
+            results = llm.generate(prompts, max_length=configs.max_length)
     else:
-        result = llm.generate("Three tips for staying healthy are: ")
+        if "max_length" not in configs_dict:
+            result = llm.generate("Three tips for staying healthy are: ")
+        else:
+            result = llm.generate(
+                "Three tips for staying healthy are: ", max_length=configs.max_length
+            )

     llm.stop_server()


 if __name__ == "__main__":
     print("flexflow inference example (speculative inference)")
     main()
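When configs.prompt is non-empty, both scripts treat it as the path to a JSON file holding a list of prompt strings and generate for the whole batch. Below is a small sketch of preparing such inputs; the file names and values are illustrative assumptions, not taken from this commit:

    # Write a prompts file plus a config fragment exercising the new optional
    # "max_length" knob; "prompts.json" and "config.json" are hypothetical names.
    import json

    with open("prompts.json", "w") as f:
        json.dump(["Three tips for staying healthy are: ",
                   "The three best programming languages are: "], f)

    with open("config.json", "w") as f:
        json.dump({"prompt": "prompts.json", "max_length": 64}, f)

With "max_length" present in the merged config, the new branch calls llm.generate(prompts, max_length=configs.max_length); with it absent, the pre-commit call is preserved unchanged.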
src/ops/spec_inc_multihead_self_attention.cc (6 changes: 3 additions & 3 deletions)

@@ -170,7 +170,7 @@ Op *SpecIncMultiHeadSelfAttention::create_operator_from_layer(
     Layer const *layer,
     std::vector<ParallelTensor> const &inputs) {

-  std::cout << "spec create operator: " << layer->name << "\n";
+  // std::cout << "spec create operator: " << layer->name << "\n";
   long long value;
   layer->get_int_property("embed_dim", value);
   int embed_dim = value;
@@ -182,10 +182,10 @@ Op *SpecIncMultiHeadSelfAttention::create_operator_from_layer(
   int kdim = value;
   layer->get_int_property("vdim", value);
   int vdim = value;
-  float dropout;
-  layer->get_float_property("dropout", dropout);
   layer->get_int_property("add_zero_attn", value);
   bool add_zero_attn = (bool)value;
+  float dropout;
+  layer->get_float_property("dropout", dropout);
   RotaryEmbeddingMeta rotary_embedding_meta;
   layer->get_int_property("apply_rotary_embedding", value);
   rotary_embedding_meta.apply_rotary_embedding = (bool)value;
src/ops/tree_inc_multihead_self_attention.cc (6 changes: 4 additions & 2 deletions)

@@ -163,6 +163,7 @@ Tensor FFModel::inc_multiquery_self_attention_verify(
       rotary_embedding_meta.original_max_position_embeddings);
   li->add_int_property("scaling_query", scaling_query);
   li->add_float_property("scaling_factor", scaling_factor);
+  li->add_int_property("qk_prod_scaling", qk_prod_scaling);
   li->add_int_property("position_bias", position_bias);
   li->add_int_property("quantization_type", quantization_type);
   li->add_int_property("offload", offload);
@@ -187,10 +188,10 @@ Op *TreeIncMultiHeadSelfAttention::create_operator_from_layer(
   int kdim = value;
   layer->get_int_property("vdim", value);
   int vdim = value;
-  float dropout;
-  layer->get_float_property("dropout", dropout);
   layer->get_int_property("add_zero_attn", value);
   bool add_zero_attn = (bool)value;
+  float dropout;
+  layer->get_float_property("dropout", dropout);
   RotaryEmbeddingMeta rotary_embedding_meta;
   layer->get_int_property("apply_rotary_embedding", value);
   rotary_embedding_meta.apply_rotary_embedding = (bool)value;
@@ -203,6 +204,7 @@ Op *TreeIncMultiHeadSelfAttention::create_operator_from_layer(
       rotary_embedding_meta.high_freq_factor);
   layer->get_int_property("original_max_position_embeddings", value);
   rotary_embedding_meta.original_max_position_embeddings = (int)value;
+  layer->get_int_property("scaling_query", value);
   bool scaling_query = (bool)value;
   float scaling_factor;
   layer->get_float_property("scaling_factor", scaling_factor);
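The two hunks above carry the substantive fix in this file: inc_multiquery_self_attention_verify never stored a "qk_prod_scaling" property for the later read to find, and create_operator_from_layer cast scaling_query from value without first reading the "scaling_query" property, so it reused whatever the preceding original_max_position_embeddings read had left in value. A short Python sketch of that stale-out-parameter hazard (illustrative only; the dict stands in for the layer's property table):

    # Every integer property is funneled through one reused out-parameter.
    props = {"original_max_position_embeddings": 4096, "scaling_query": 0}

    value = props["original_max_position_embeddings"]  # value is now 4096
    # Pre-commit: the "scaling_query" read was missing here, so the cast below
    # saw the stale 4096 and always yielded True.
    scaling_query = bool(value)

    # Post-commit: re-read the property first, recovering the stored False.
    value = props["scaling_query"]
    scaling_query = bool(value)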
