From 5f02b7c45132f5d556d7826adce6531449280046 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 19 Feb 2024 20:42:24 +0000 Subject: [PATCH 01/32] update script --- tests/peft/hf_serve.py | 70 ++++++++++++++++++++++++++---------------- 1 file changed, 43 insertions(+), 27 deletions(-) diff --git a/tests/peft/hf_serve.py b/tests/peft/hf_serve.py index 1fde4d5a50..7bfc560cc2 100644 --- a/tests/peft/hf_serve.py +++ b/tests/peft/hf_serve.py @@ -1,6 +1,6 @@ import argparse import torch -import os, sys, shutil +import os, sys, shutil, json from peft import PeftModel, PeftConfig from transformers import ( AutoModelForCausalLM, @@ -40,11 +40,12 @@ def peft_post_forward_hook(module, input, output): def main(): parser = argparse.ArgumentParser() - parser.add_argument("--peft-model-id", type=str, default="./finetuned-llama") + parser.add_argument("--peft-model-id", type=str, required=True) parser.add_argument( "--use-full-precision", action="store_true", help="Use full precision" ) - parser.add_argument("--max-new-tokens", type=int, default=50) + parser.add_argument("--max-length", type=int, default=50) + parser.add_argument("--prompt-file", type=str, required=True) parser.add_argument("--do-sample", action="store_true", help="Use sampling") parser.add_argument( "--save-peft-tensors", @@ -52,24 +53,28 @@ def main(): help="Save PEFT hidden states and weights to file", ) args = parser.parse_args() - peft_model_id = args.peft_model_id - use_full_precision = args.use_full_precision - max_new_tokens = args.max_new_tokens - save_peft_tensors = args.save_peft_tensors - # Change working dir to folder storing this script - abspath = os.path.abspath(__file__) - dname = os.path.dirname(abspath) - os.chdir(dname) + # Check if prompt-file exists + if not os.path.isfile(args.prompt_file): + print(f"Error: {args.prompt_file} does not exist.") + return - config = PeftConfig.from_pretrained(peft_model_id) + # Get peft model config + config = PeftConfig.from_pretrained(args.peft_model_id) + + # Load the base model model = AutoModelForCausalLM.from_pretrained( config.base_model_name_or_path, return_dict=True, # load_in_8bit=True, - torch_dtype=torch.float32 if use_full_precision else torch.float16, + torch_dtype=torch.float32 if args.use_full_precision else torch.float16, device_map="auto", ) + # Load the Lora model + model = PeftModel.from_pretrained(model, args.peft_model_id) + print(model) + + # Get tokenizer hf_config = AutoConfig.from_pretrained( config.base_model_name_or_path, trust_remote_code=True ) @@ -78,25 +83,26 @@ def main(): tokenizer = LlamaTokenizer.from_pretrained( config.base_model_name_or_path, use_fast=True, - torch_dtype=torch.float32 if use_full_precision else torch.float16, + torch_dtype=torch.float32 if args.use_full_precision else torch.float16, ) else: tokenizer = AutoTokenizer.from_pretrained( config.base_model_name_or_path, - torch_dtype=torch.float32 if use_full_precision else torch.float16, + torch_dtype=torch.float32 if args.use_full_precision else torch.float16, ) + # Generation config generation_config = GenerationConfig.from_pretrained(config.base_model_name_or_path) generation_config.do_sample = args.do_sample - # Load the Lora model - model = PeftModel.from_pretrained(model, peft_model_id) - - print(model) # Register hooks to save tensors, if needed - if save_peft_tensors: + if args.save_peft_tensors: + # Change working dir to folder storing this script + abspath = os.path.abspath(__file__) + dname = os.path.dirname(abspath) + os.chdir(dname) + # Create output dir 
shutil.rmtree("./hf_peft_tensors") - # Check that the output folder exists os.makedirs("./hf_peft_tensors", exist_ok=True) # Save weights for name, params in model.named_parameters(): @@ -112,12 +118,22 @@ def main(): layer.register_forward_pre_hook(peft_pre_forward_hook) layer.register_forward_hook(peft_post_forward_hook) - batch = tokenizer("Two things are infinite: ", return_tensors="pt") - with torch.cuda.amp.autocast(): - output_tokens = model.generate( - **batch, max_new_tokens=max_new_tokens, generation_config=generation_config - ) - print("\n\n", tokenizer.decode(output_tokens[0], skip_special_tokens=False)) + # Run inference + # Read prompt-file into a list of strings + with open(args.prompt_file, "r") as f: + try: + prompt_list = json.load(f) + except json.JSONDecodeError: + print(f"Error: Unable to parse {args.prompt_file} as JSON.") + sys.exit(1) + + for i, prompt in enumerate(prompt_list): + batch = tokenizer(prompt, return_tensors="pt", add_special_tokens=True) + with torch.cuda.amp.autocast(): + output_tokens = model.generate( + **batch, max_new_tokens=args.max_length, generation_config=generation_config + ) + print("\n\n", tokenizer.decode(output_tokens[0], skip_special_tokens=False)) if __name__ == "__main__": From e82a75f3c46d51d848a2f6314c1daf99bac70b27 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Tue, 20 Feb 2024 16:56:28 +0000 Subject: [PATCH 02/32] less model renaming --- inference/models/falcon.cc | 34 ++----- inference/models/llama.cc | 104 ++++++++++------------ inference/models/mpt.cc | 35 +++----- inference/models/opt.cc | 38 +++----- inference/models/starcoder.cc | 26 ++---- python/flexflow/serve/__init__.py | 4 +- python/flexflow/serve/models/falcon.py | 32 ++++--- python/flexflow/serve/models/llama.py | 40 +++------ python/flexflow/serve/models/mpt.py | 34 +++---- python/flexflow/serve/models/opt.py | 36 ++++---- python/flexflow/serve/models/starcoder.py | 44 ++++----- python/flexflow/serve/serve.py | 34 ++++--- 12 files changed, 186 insertions(+), 275 deletions(-) diff --git a/inference/models/falcon.cc b/inference/models/falcon.cc index f86130ff2b..195d6ba7e3 100644 --- a/inference/models/falcon.cc +++ b/inference/models/falcon.cc @@ -76,7 +76,7 @@ void FALCON::create_falcon_model(FFModel &ff, falcon_config.layer_norm_epsilon, true, DT_NONE, - std::string("layers_" + std::to_string(i) + "_input_layernorm") + std::string("layers." + std::to_string(i) + ".input_layernorm") .c_str()); } else { ff.residual_layer_norm( @@ -91,7 +91,7 @@ void FALCON::create_falcon_model(FFModel &ff, true, false, DT_NONE, - std::string("layers_" + std::to_string(i) + "_input_layernorm") + std::string("layers." + std::to_string(i) + ".input_layernorm") .c_str()); token = res_ln_outputs[0]; att_norm = res_ln_outputs[1]; @@ -117,7 +117,7 @@ void FALCON::create_falcon_model(FFModel &ff, 1.0f, /*scaling factor*/ true, /*qk_prod_scaling*/ false, /*position_bias*/ - std::string("layers_" + std::to_string(i) + "_attention") + std::string("layers." + std::to_string(i) + ".self_attention") .c_str() /*name*/ ); break; @@ -142,7 +142,7 @@ void FALCON::create_falcon_model(FFModel &ff, 1.0f, /*scaling factor*/ true, /*qk_prod_scaling*/ false, /*position_bias*/ - std::string("layers_" + std::to_string(i) + "_attention") + std::string("layers." 
+ std::to_string(i) + ".self_attention") .c_str() /*name*/ ); break; @@ -167,7 +167,7 @@ void FALCON::create_falcon_model(FFModel &ff, 1.0f, /*scaling factor*/ true, /*qk_prod_scaling*/ false, /*position_bias*/ - std::string("layers_" + std::to_string(i) + "_attention") + std::string("layers." + std::to_string(i) + ".self_attention") .c_str() /*name*/ ); break; @@ -188,7 +188,7 @@ void FALCON::create_falcon_model(FFModel &ff, nullptr, REG_MODE_NONE, 0.0f, - std::string("layers_" + std::to_string(i) + "_mlp_dense_h_to_4h") + std::string("layers." + std::to_string(i) + ".mlp.dense_h_to_4h") .c_str()); dense_h_to_4h = ff.gelu(dense_h_to_4h); @@ -204,7 +204,7 @@ void FALCON::create_falcon_model(FFModel &ff, nullptr, REG_MODE_NONE, 0.0f, - std::string("layers_" + std::to_string(i) + "_mlp_dense_4h_to_h") + std::string("layers." + std::to_string(i) + ".mlp.dense_4h_to_h") .c_str()); } // final normalization and linear @@ -254,26 +254,6 @@ void FALCON::create_falcon_model(FFModel &ff, InferenceManager *im = InferenceManager::get_inference_manager(); im->register_model_weights_loader(&ff, fileloader); - -#ifdef DEADCODE - // Compile the model - std::cout << "------start compile ----------" << std::endl; - InferenceManager *im = InferenceManager::get_inference_manager(); - im->compile_model_and_allocate_buffer(&ff); - FileDataLoader fileloader("", - weight_file_path, - falcon_config.n_head, - falcon_config.n_head_kv, - falcon_config.hidden_size, - falcon_config.hidden_size / falcon_config.n_head, - ff.config.tensor_parallelism_degree); - std::cout << "------load weights ----------" << std::endl; - fileloader.load_weights(&ff, use_full_precision); - std::cout << "------load weight finished----------" << std::endl; - - // init operators - im->init_operators_inference(&ff); -#endif } }; // namespace FlexFlow diff --git a/inference/models/llama.cc b/inference/models/llama.cc index 0db7796567..a7a1758cc3 100644 --- a/inference/models/llama.cc +++ b/inference/models/llama.cc @@ -58,7 +58,7 @@ void LLAMA::create_llama_model(FFModel &ff, use_full_precision ? DT_FLOAT : DT_HALF, NULL, embed_init, - "tok_embeddings"); + "embed_tokens"); Tensor w2 = nullptr; @@ -75,7 +75,7 @@ void LLAMA::create_llama_model(FFModel &ff, llama_config.rms_norm_eps, llama_config.hidden_size, DT_NONE, - std::string("layers_" + std::to_string(i) + "_attention_norm") + std::string("layers." + std::to_string(i) + ".input_layernorm") .c_str()); } else { ff.residual_rms_norm( @@ -86,7 +86,7 @@ void LLAMA::create_llama_model(FFModel &ff, llama_config.hidden_size, false, // inplace_residual DT_NONE, - std::string("layers_" + std::to_string(i) + "_attention_norm") + std::string("layers." + std::to_string(i) + ".input_layernorm") .c_str()); token = token_att_norm[0]; att_norm = token_att_norm[1]; @@ -112,7 +112,7 @@ void LLAMA::create_llama_model(FFModel &ff, 1.0f, /*scaling factor*/ true, /*qk_prod_scaling*/ false, /*position_bias*/ - std::string("layers_" + std::to_string(i) + "_attention") + std::string("layers." + std::to_string(i) + ".self_attn") .c_str() /*name*/ ); break; @@ -135,7 +135,7 @@ void LLAMA::create_llama_model(FFModel &ff, 1.0f, /*scaling factor*/ true, /*qk_prod_scaling*/ false, /*position_bias*/ - std::string("layers_" + std::to_string(i) + "_attention") + std::string("layers." 
+ std::to_string(i) + ".self_attn") .c_str() /*name*/ ); break; @@ -158,7 +158,7 @@ void LLAMA::create_llama_model(FFModel &ff, 1.0f, /*scaling factor*/ true, /*qk_prod_scaling*/ false, /*position_bias*/ - std::string("layers_" + std::to_string(i) + "_attention") + std::string("layers." + std::to_string(i) + ".self_attn") .c_str() /*name*/ ); break; @@ -178,59 +178,57 @@ void LLAMA::create_llama_model(FFModel &ff, llama_config.hidden_size, false, // inplace_residual DT_NONE, - std::string("layers_" + std::to_string(i) + "_ffn_norm").c_str()); + std::string("layers." + std::to_string(i) + ".post_attention_layernorm") + .c_str()); token = token_ff_norm[0]; Tensor ff_norm = token_ff_norm[1]; - Tensor w1 = - ff.dense(ff_norm, - llama_config.intermediate_size, - AC_MODE_NONE, - false, - DT_NONE, - nullptr, - nullptr, - nullptr, - REG_MODE_NONE, - 0.0f, - std::string("layers_" + std::to_string(i) + "_feed_forward_w1") - .c_str()); + Tensor w1 = ff.dense( + ff_norm, + llama_config.intermediate_size, + AC_MODE_NONE, + false, + DT_NONE, + nullptr, + nullptr, + nullptr, + REG_MODE_NONE, + 0.0f, + std::string("layers." + std::to_string(i) + ".mlp.gate_proj").c_str()); - Tensor w3 = - ff.dense(ff_norm, - llama_config.intermediate_size, - AC_MODE_NONE, - false, - DT_NONE, - nullptr, - nullptr, - nullptr, - REG_MODE_NONE, - 0.0f, - std::string("layers_" + std::to_string(i) + "_feed_forward_w3") - .c_str()); + Tensor w3 = ff.dense( + ff_norm, + llama_config.intermediate_size, + AC_MODE_NONE, + false, + DT_NONE, + nullptr, + nullptr, + nullptr, + REG_MODE_NONE, + 0.0f, + std::string("layers." + std::to_string(i) + ".mlp.up_proj").c_str()); Tensor multi = ff.sigmoid_silu_multi(w1, w3); - w2 = - ff.dense(multi, - llama_config.hidden_size, - AC_MODE_NONE, - false, - DT_NONE, - nullptr, - nullptr, - nullptr, - REG_MODE_NONE, - 0.0f, - std::string("layers_" + std::to_string(i) + "_feed_forward_w2") - .c_str()); + w2 = ff.dense( + multi, + llama_config.hidden_size, + AC_MODE_NONE, + false, + DT_NONE, + nullptr, + nullptr, + nullptr, + REG_MODE_NONE, + 0.0f, + std::string("layers." + std::to_string(i) + ".mlp.down_proj").c_str()); // Low-Rank Adapter (LoRA) for the second linear layer ff.lora_linear( multi, w2, OP_LORA_MLP_SECOND, - std::string("layers_" + std::to_string(i) + "_feed_forward_w2_lora") + std::string("layers." + std::to_string(i) + ".mlp.down_proj.lora") .c_str()); } // final normalization and linear @@ -254,7 +252,7 @@ void LLAMA::create_llama_model(FFModel &ff, nullptr, REG_MODE_NONE, 0.0f, - "output"); + "lm_head"); Tensor output; if (mode == BEAM_SEARCH_MODE) { @@ -288,16 +286,6 @@ void LLAMA::create_llama_model(FFModel &ff, InferenceManager *im = InferenceManager::get_inference_manager(); im->register_model_weights_loader(&ff, fileloader); -#ifdef DEADCODE - // Compile the model - std::cout << "------start compile ----------" << std::endl; - im->compile_model_and_allocate_buffer(&ff); - fileloader.load_weights(&ff); - std::cout << "------load weight finished----------" << std::endl; - - // init operators - im->init_operators_inference(&ff); -#endif } }; // namespace FlexFlow diff --git a/inference/models/mpt.cc b/inference/models/mpt.cc index 95179691a1..e4a7e0056d 100644 --- a/inference/models/mpt.cc +++ b/inference/models/mpt.cc @@ -58,7 +58,7 @@ void MPT::create_mpt_model(FFModel &ff, use_full_precision ? 
DT_FLOAT : DT_HALF, NULL, embed_init, - "transformer_wte"); + "wte"); Tensor intermediate_output = nullptr, layernorm_output = nullptr; Tensor res_ln_outputs[2] = {nullptr, nullptr}; @@ -74,7 +74,7 @@ void MPT::create_mpt_model(FFModel &ff, 1e-05, false, DT_NONE, - std::string("layers_" + std::to_string(i) + "_norm_1").c_str()); + std::string("layers." + std::to_string(i) + ".norm_1").c_str()); } else { ff.residual_layer_norm( intermediate_output, @@ -88,7 +88,7 @@ void MPT::create_mpt_model(FFModel &ff, false, false, DT_NONE, - std::string("layers_" + std::to_string(i) + "_norm_1").c_str()); + std::string("layers." + std::to_string(i) + ".norm_1").c_str()); hidden_states = res_ln_outputs[0]; layernorm_output = res_ln_outputs[1]; } @@ -114,7 +114,7 @@ void MPT::create_mpt_model(FFModel &ff, pow((mpt_config.hidden_size / mpt_config.n_heads), -0.5), /*qk_prod_scaling*/ false, /*position_bias*/ true, - std::string("layers_" + std::to_string(i) + "_attention") + std::string("layers." + std::to_string(i) + ".attn") .c_str() /*name*/ ); break; @@ -138,7 +138,7 @@ void MPT::create_mpt_model(FFModel &ff, pow((mpt_config.hidden_size / mpt_config.n_heads), -0.5), /*qk_prod_scaling*/ false, /*position_bias*/ true, - std::string("layers_" + std::to_string(i) + "_attention") + std::string("layers." + std::to_string(i) + ".attn") .c_str() /*name*/ ); break; @@ -162,7 +162,7 @@ void MPT::create_mpt_model(FFModel &ff, pow((mpt_config.hidden_size / mpt_config.n_heads), -0.5), /*qk_prod_scaling*/ false, /*position_bias*/ true, - std::string("layers_" + std::to_string(i) + "_attention") + std::string("layers." + std::to_string(i) + ".attn") .c_str() /*name*/ ); break; @@ -184,7 +184,7 @@ void MPT::create_mpt_model(FFModel &ff, false, false, DT_NONE, - std::string("layers_" + std::to_string(i) + "_norm_2").c_str()); + std::string("layers." + std::to_string(i) + ".norm_2").c_str()); hidden_states = res_ln_outputs[0]; layernorm_output = res_ln_outputs[1]; @@ -200,7 +200,7 @@ void MPT::create_mpt_model(FFModel &ff, nullptr, REG_MODE_NONE, 0.0f, - std::string("layers_" + std::to_string(i) + "_ffn_up_proj").c_str()); + std::string("layers." + std::to_string(i) + ".ffn.up_proj").c_str()); layernorm_output = ff.gelu(layernorm_output); intermediate_output = ff.dense( layernorm_output, @@ -213,7 +213,7 @@ void MPT::create_mpt_model(FFModel &ff, nullptr, REG_MODE_NONE, 0.0f, - std::string("layers_" + std::to_string(i) + "_ffn_down_proj").c_str()); + std::string("layers." 
+ std::to_string(i) + ".ffn.down_proj").c_str()); } // final @@ -228,7 +228,7 @@ void MPT::create_mpt_model(FFModel &ff, false, false, DT_NONE, - "transformer_norm_f"); + "norm_f"); Tensor all_final_norm = res_ln_outputs[1]; Tensor lm_head = ff.dense(all_final_norm, @@ -262,21 +262,6 @@ void MPT::create_mpt_model(FFModel &ff, InferenceManager *im = InferenceManager::get_inference_manager(); im->register_model_weights_loader(&ff, fileloader); - -#ifdef DEADCODE - //------------------- compile the model -------------------------------- - InferenceManager *im = InferenceManager::get_inference_manager(); - im->compile_model_and_allocate_buffer(&ff); - FileDataLoader fileloader("", - weight_file_path, - mpt_config.n_heads, - mpt_config.n_heads, - mpt_config.hidden_size, - mpt_config.hidden_size / mpt_config.n_heads, - ff.config.tensor_parallelism_degree); - fileloader.load_weights(&ff, use_full_precision); - im->init_operators_inference(&ff); -#endif } }; // namespace FlexFlow diff --git a/inference/models/opt.cc b/inference/models/opt.cc index 7d2abad829..6d04ba47f2 100644 --- a/inference/models/opt.cc +++ b/inference/models/opt.cc @@ -96,7 +96,7 @@ void OPT::create_opt_model(FFModel &ff, true, false, DT_NONE, - std::string("layers_" + std::to_string(i) + "_attention_layer_norm") + std::string("layers." + std::to_string(i) + ".self_attn_layer_norm") .c_str()); Tensor residual = res_ln_outputs[0]; Tensor hidden_states = res_ln_outputs[1]; @@ -122,7 +122,7 @@ void OPT::create_opt_model(FFModel &ff, -0.5), /*scaling factor*/ false, /*qk_prod_scaling*/ false, /*position_bias*/ - std::string("layers_" + std::to_string(i) + "_attention") + std::string("layers." + std::to_string(i) + ".self_attn") .c_str() /*name*/ ); break; @@ -146,7 +146,7 @@ void OPT::create_opt_model(FFModel &ff, -0.5), /*scaling factor*/ false, /*qk_prod_scaling*/ false, /*position_bias*/ - std::string("layers_" + std::to_string(i) + "_attention") + std::string("layers." + std::to_string(i) + ".self_attn") .c_str() /*name*/ ); break; @@ -170,7 +170,7 @@ void OPT::create_opt_model(FFModel &ff, -0.5), /*scaling factor*/ false, /*qk_prod_scaling*/ false, /*position_bias*/ - std::string("layers_" + std::to_string(i) + "_attention") + std::string("layers." + std::to_string(i) + ".self_attn") .c_str() /*name*/ ); break; @@ -189,8 +189,8 @@ void OPT::create_opt_model(FFModel &ff, true, false, DT_NONE, - std::string("layers_" + std::to_string(i) + - "_add_bias_residual_layer_norm") + std::string("layers." + std::to_string(i) + + ".add_bias_residual_layer_norm") .c_str()); added = res_ln_outputs[0]; Tensor final_norm = res_ln_outputs[1]; @@ -207,7 +207,7 @@ void OPT::create_opt_model(FFModel &ff, nullptr, REG_MODE_NONE, 0.0f, - std::string("layers_" + std::to_string(i) + "_fc1").c_str()); + std::string("layers." + std::to_string(i) + ".fc1").c_str()); fc2 = ff.dense(fc1, opt_config.hidden_size, AC_MODE_NONE, @@ -218,13 +218,13 @@ void OPT::create_opt_model(FFModel &ff, nullptr, REG_MODE_NONE, 0.0f, - std::string("layers_" + std::to_string(i) + "_fc2").c_str()); + std::string("layers." + std::to_string(i) + ".fc2").c_str()); // Low-Rank Adapter (LoRA) for the second linear layer ff.lora_linear( fc1, fc2, OP_LORA_MLP_SECOND, - std::string("layers_" + std::to_string(i) + "_fc2_lora").c_str()); + std::string("layers." 
+ std::to_string(i) + ".fc2.lora").c_str()); } // final @@ -252,7 +252,7 @@ void OPT::create_opt_model(FFModel &ff, nullptr, REG_MODE_NONE, 0.0f, - "embed_tokens_weight_lm_head"); + "lm_head"); Tensor output; if (mode == BEAM_SEARCH_MODE) { @@ -276,24 +276,6 @@ void OPT::create_opt_model(FFModel &ff, use_full_precision); InferenceManager *im = InferenceManager::get_inference_manager(); im->register_model_weights_loader(&ff, fileloader); - -#ifdef DEADCODE - //------------------- compile the model -------------------------------- - std::cout << "------start compile ----------" << std::endl; - InferenceManager *im = InferenceManager::get_inference_manager(); - im->compile_model_and_allocate_buffer(&ff); - FileDataLoader fileloader("", - weight_file_path, - opt_config.num_attention_heads, - opt_config.num_attention_heads, - opt_config.hidden_size, - opt_config.hidden_size / - opt_config.num_attention_heads, - ff.config.tensor_parallelism_degree); - fileloader.load_weights(&ff, use_full_precision); - std::cout << "------finished loading weights----------" << std::endl; - im->init_operators_inference(&ff); -#endif } }; // namespace FlexFlow diff --git a/inference/models/starcoder.cc b/inference/models/starcoder.cc index fb6269ad75..cd8bf3a9a7 100644 --- a/inference/models/starcoder.cc +++ b/inference/models/starcoder.cc @@ -66,7 +66,7 @@ void STARCODER::create_starcoder_model( use_full_precision ? DT_FLOAT : DT_HALF, NULL, embed_init, - "transformer_wte"); + "wte"); Tensor positional_embedding = ff.embedding(position_input, @@ -76,7 +76,7 @@ void STARCODER::create_starcoder_model( use_full_precision ? DT_FLOAT : DT_HALF, NULL, embed_init, - "transformer_wpe"); + "wpe"); Tensor residual = nullptr, c_proj = nullptr; Tensor res_ln_outputs[2] = {nullptr, nullptr}; @@ -98,7 +98,7 @@ void STARCODER::create_starcoder_model( true, false, DT_NONE, - std::string("layers_" + std::to_string(i) + "_ln_1").c_str()); + std::string("layers." + std::to_string(i) + ".ln_1").c_str()); Tensor hidden_states = res_ln_outputs[0]; Tensor ln_1 = res_ln_outputs[1]; @@ -125,7 +125,7 @@ void STARCODER::create_starcoder_model( 1.0f, /*scaling factor*/ true, /*qk_prod_scaling*/ false, /*position_bias*/ - std::string("layers_" + std::to_string(i) + "_attention") + std::string("layers." + std::to_string(i) + ".attn.c_attn") .c_str() /*name*/ ); break; @@ -147,7 +147,7 @@ void STARCODER::create_starcoder_model( true, false, DT_NONE, - std::string("layers_" + std::to_string(i) + "_ln_2").c_str()); + std::string("layers." + std::to_string(i) + ".ln_2").c_str()); residual = res_ln_outputs[0]; Tensor l2_norm = res_ln_outputs[1]; @@ -163,7 +163,7 @@ void STARCODER::create_starcoder_model( nullptr, REG_MODE_NONE, 0.0f, - std::string("layers_" + std::to_string(i) + "_mlp_c_fc").c_str()); + std::string("layers." + std::to_string(i) + ".mlp.c_fc").c_str()); c_fc = ff.gelu(c_fc); @@ -178,7 +178,7 @@ void STARCODER::create_starcoder_model( nullptr, REG_MODE_NONE, 0.0f, - std::string("layers_" + std::to_string(i) + "_mlp_c_proj").c_str()); + std::string("layers." 
+ std::to_string(i) + ".mlp.c_proj").c_str()); } // final normalization and linear ff.residual_layer_norm(residual, @@ -192,7 +192,7 @@ void STARCODER::create_starcoder_model( true, false, DT_NONE, - "transformer_ln_f"); + "ln_f"); Tensor ln_f = res_ln_outputs[1]; Tensor lm_head = ff.dense(ln_f, @@ -235,16 +235,6 @@ void STARCODER::create_starcoder_model( ff.config.tensor_parallelism_degree, use_full_precision); im->register_model_weights_loader(&ff, fileloader); -#ifdef DEADCODE - // Compile the model - std::cout << "------start compile ----------" << std::endl; - im->compile_model_and_allocate_buffer(&ff); - fileloader.load_weights(&ff, use_full_precision); - std::cout << "------load weight finished----------" << std::endl; - - // init operators - im->init_operators_inference(&ff); -#endif } }; // namespace FlexFlow diff --git a/python/flexflow/serve/__init__.py b/python/flexflow/serve/__init__.py index 5805670ae0..da7dba5bcc 100644 --- a/python/flexflow/serve/__init__.py +++ b/python/flexflow/serve/__init__.py @@ -214,7 +214,7 @@ def init( if configs_dict.get("offload", None) is None: configs_dict["offload"] = False if configs_dict.get("offload_reserve_space_size", None) is None: - configs_dict["offload_reserve_space_size"] = 8*1024**3 + configs_dict["offload_reserve_space_size"] = 8 * 1024**3 if configs_dict.get("use_4bit_quantization", None) is None: configs_dict["use_4bit_quantization"] = False if configs_dict.get("use_8bit_quantization", None) is None: @@ -222,7 +222,7 @@ def init( if configs_dict.get("enable_peft", None) is None: configs_dict["enable_peft"] = False if configs_dict.get("peft_activation_reserve_space_size", None) is None: - configs_dict["peft_activation_reserve_space_size"] = 8*1024**3 + configs_dict["peft_activation_reserve_space_size"] = 8 * 1024**3 if configs_dict.get("peft_weight_reserve_space_size", None) is None: configs_dict["peft_weight_reserve_space_size"] = 1024**3 if configs_dict.get("profiling", None) is None: diff --git a/python/flexflow/serve/models/falcon.py b/python/flexflow/serve/models/falcon.py index db2f403e10..e4d7786262 100644 --- a/python/flexflow/serve/models/falcon.py +++ b/python/flexflow/serve/models/falcon.py @@ -118,7 +118,7 @@ def build_model(self, max_tokens_per_batch): axes, True, self.falcon_config.layer_norm_epsilon, - name=f"layers_{i}_input_layernorm", + name=f"layers.{i}.input_layernorm", ) else: token, att_norm = ffmodel.residual_layer_norm( @@ -129,7 +129,7 @@ def build_model(self, max_tokens_per_batch): axes, True, self.falcon_config.layer_norm_epsilon, - name=f"layers_{i}_input_layernorm", + name=f"layers.{i}.input_layernorm", ) if self.mode == InferenceMode.BEAM_SEARCH_MODE: @@ -147,7 +147,7 @@ def build_model(self, max_tokens_per_batch): DataType.DT_NONE, # data_type None, # kernel initializer True, # apply_rotary_embedding - name=f"layers_{i}_attention", + name=f"layers.{i}.self_attention", ) elif self.mode == InferenceMode.TREE_VERIFY_MODE: mha = ffmodel.inc_multiquery_self_attention_verify( @@ -164,7 +164,7 @@ def build_model(self, max_tokens_per_batch): DataType.DT_NONE, # data_type None, # kernel initializer True, # apply_rotary_embedding - name=f"layers_{i}_attention", + name=f"layers.{i}.self_attention", ) elif self.mode == InferenceMode.INC_DECODING_MODE: mha = ffmodel.inc_multiquery_self_attention( @@ -181,7 +181,7 @@ def build_model(self, max_tokens_per_batch): DataType.DT_NONE, # data_type None, # kernel initializer True, # apply_rotary_embedding - name=f"layers_{i}_attention", + name=f"layers.{i}.self_attention", 
) else: assert False @@ -191,7 +191,7 @@ def build_model(self, max_tokens_per_batch): self.falcon_config.hidden_size * 4, ActiMode.AC_MODE_NONE, False, - name=f"layers_{i}_mlp_dense_h_to_4h", + name=f"layers.{i}.mlp.dense_h_to_4h", ) dense_h_to_4h = ffmodel.gelu(dense_h_to_4h) mlp_output = ffmodel.dense( @@ -199,7 +199,7 @@ def build_model(self, max_tokens_per_batch): self.falcon_config.hidden_size, ActiMode.AC_MODE_NONE, False, - name=f"layers_{i}_mlp_dense_4h_to_h", + name=f"layers.{i}.mlp.dense_4h_to_h", ) _, ln_f = ffmodel.residual_layer_norm( @@ -239,11 +239,9 @@ def build_model(self, max_tokens_per_batch): # TODO: finish this def convert_hf_weight_name(name): - return ( - name.replace(".", "_") - .replace("transformer_h_", "layers_") - .replace("transformer_", "") - .replace("self_attention_dense", "attention_wo") + return (name.replace("transformer.h.", "layers.") + .replace("transformer.", "") + .replace("self_attention.dense", "self_attention.o_proj") ) def convert_hf_model(model, dst_folder): @@ -256,10 +254,10 @@ def convert_hf_model(model, dst_folder): for name, params in model.named_parameters(): name = FlexFlowFalcon.convert_hf_weight_name(name) # Split Q,K,V attention weights - if "self_attention_query_key_value" in name: - name_q = name.replace("self_attention_query_key_value", "attention_wq") - name_k = name.replace("self_attention_query_key_value", "attention_wk") - name_v = name.replace("self_attention_query_key_value", "attention_wv") + if "self_attention.query_key_value" in name: + name_q = name.replace("self_attention.query_key_value", "self_attention.q_proj") + name_k = name.replace("self_attention.query_key_value", "self_attention.k_proj") + name_v = name.replace("self_attention.query_key_value", "self_attention.v_proj") q, k, v = torch.split( params, [ @@ -276,5 +274,5 @@ def convert_hf_model(model, dst_folder): params.detach().cpu().numpy().tofile(os.path.join(dst_folder, name)) # LM head weight model.lm_head.weight.detach().cpu().numpy().tofile( - os.path.join(dst_folder, "lm_head_weight") + os.path.join(dst_folder, "lm_head.weight") ) diff --git a/python/flexflow/serve/models/llama.py b/python/flexflow/serve/models/llama.py index cd9cf29ebf..6aef540342 100644 --- a/python/flexflow/serve/models/llama.py +++ b/python/flexflow/serve/models/llama.py @@ -101,7 +101,7 @@ def build_model(self, max_tokens_per_batch): self.data_type, None, embed_init, - name="tok_embeddings", + name="embed_tokens", ) for i in range(self.llama_config.num_hidden_layers): @@ -112,7 +112,7 @@ def build_model(self, max_tokens_per_batch): token, self.llama_config.rms_norm_eps, self.llama_config.hidden_size, - name=f"layers_{i}_attention_norm", + name=f"layers.{i}.input_layernorm", ) else: token, attn_norm = ffmodel.residual_rms_norm( @@ -120,7 +120,7 @@ def build_model(self, max_tokens_per_batch): w2, self.llama_config.rms_norm_eps, self.llama_config.hidden_size, - name=f"layers_{i}_attention_norm", + name=f"layers.{i}.input_layernorm", ) if self.mode == InferenceMode.BEAM_SEARCH_MODE: @@ -140,7 +140,7 @@ def build_model(self, max_tokens_per_batch): DataType.DT_NONE, # data_type None, # kernel initializer True, # apply_rotary_embedding - name=f"layers_{i}_attention", + name=f"layers.{i}.self_attn", ) elif self.mode == InferenceMode.TREE_VERIFY_MODE: mha = ffmodel.inc_multiquery_self_attention_verify( @@ -159,7 +159,7 @@ def build_model(self, max_tokens_per_batch): DataType.DT_NONE, # data_type None, # kernel initializer True, # apply_rotary_embedding - name=f"layers_{i}_attention", + 
name=f"layers.{i}.self_attn", ) elif self.mode == InferenceMode.INC_DECODING_MODE: mha = ffmodel.inc_multiquery_self_attention( @@ -178,7 +178,7 @@ def build_model(self, max_tokens_per_batch): DataType.DT_NONE, # data_type None, # kernel initializer True, # apply_rotary_embedding - name=f"layers_{i}_attention", + name=f"layers.{i}.self_attn", ) else: assert False @@ -188,21 +188,21 @@ def build_model(self, max_tokens_per_batch): mha, self.llama_config.rms_norm_eps, self.llama_config.hidden_size, - name=f"layers_{i}_ffn_norm", + name=f"layers.{i}.post_attention_layernorm", ) w1 = ffmodel.dense( ff_norm, self.llama_config.intermediate_size, ActiMode.AC_MODE_NONE, False, - name=f"layers_{i}_feed_forward_w1", + name=f"layers.{i}.mlp.gate_proj", ) w3 = ffmodel.dense( ff_norm, self.llama_config.intermediate_size, ActiMode.AC_MODE_NONE, False, - name=f"layers_{i}_feed_forward_w3", + name=f"layers.{i}.mlp.up_proj", ) multi = ffmodel.sigmoid_silu_multi(w1, w3) w2 = ffmodel.dense( @@ -210,7 +210,7 @@ def build_model(self, max_tokens_per_batch): self.llama_config.hidden_size, ActiMode.AC_MODE_NONE, False, - name=f"layers_{i}_feed_forward_w2", + name=f"layers.{i}.mlp.down_proj", ) _, token = ffmodel.residual_rms_norm( @@ -225,7 +225,7 @@ def build_model(self, max_tokens_per_batch): self.llama_config.vocab_size, ActiMode.AC_MODE_NONE, False, - name="output", + name="lm_head", ) if self.mode == InferenceMode.BEAM_SEARCH_MODE: @@ -246,23 +246,7 @@ def build_model(self, max_tokens_per_batch): self.ffmodel = ffmodel def convert_hf_weight_name(name): - return ( - name.replace(".", "_") - .replace("self_attn", "attention") - .replace("q_proj", "wq") - .replace("k_proj", "wk") - .replace("v_proj", "wv") - .replace("o_proj", "wo") - .replace("mlp", "feed_forward") - .replace("gate_proj", "w1") - .replace("down_proj", "w2") - .replace("up_proj", "w3") - .replace("input_layernorm", "attention_norm") - .replace("post_attention_layernorm", "ffn_norm") - .replace("embed_tokens", "tok_embeddings") - .replace("lm_head", "output") - .replace("model_", "") - ) + return name.replace("model.", "") def convert_hf_model(model, dst_folder): os.makedirs(dst_folder, exist_ok=True) diff --git a/python/flexflow/serve/models/mpt.py b/python/flexflow/serve/models/mpt.py index 9168932ce1..76f7d69c73 100644 --- a/python/flexflow/serve/models/mpt.py +++ b/python/flexflow/serve/models/mpt.py @@ -92,7 +92,7 @@ def build_model(self, max_tokens_per_batch): self.data_type, None, embed_init, - name="transformer_wte", + name="wte", ) axes = [ @@ -109,7 +109,7 @@ def build_model(self, max_tokens_per_batch): True, 1e-05, False, - name=f"layers_{i}_norm_1", + name=f"layers.{i}.norm_1", ) else: hidden_states, layernorm_output = ffmodel.residual_layer_norm( @@ -121,7 +121,7 @@ def build_model(self, max_tokens_per_batch): True, 1e-05, False, - name=f"layers_{i}_norm_1", + name=f"layers.{i}.norm_1", ) if self.mode == InferenceMode.BEAM_SEARCH_MODE: @@ -143,7 +143,7 @@ def build_model(self, max_tokens_per_batch): ** (-0.5), # scaling_factor False, # qk_prod_scaling True, # qk_prod_scaling - name=f"layers_{i}_attention", + name=f"layers.{i}.attn", ) elif self.mode == InferenceMode.TREE_VERIFY_MODE: attn_outputs = ffmodel.inc_multihead_self_attention_verify( @@ -164,7 +164,7 @@ def build_model(self, max_tokens_per_batch): ** (-0.5), # scaling_factor False, # qk_prod_scaling True, # qk_prod_scaling - name=f"layers_{i}_attention", + name=f"layers.{i}.attn", ) elif self.mode == InferenceMode.INC_DECODING_MODE: attn_outputs = 
ffmodel.inc_multihead_self_attention( @@ -185,7 +185,7 @@ def build_model(self, max_tokens_per_batch): ** (-0.5), # scaling_factor False, # qk_prod_scaling True, # qk_prod_scaling - name=f"layers_{i}_attention", + name=f"layers.{i}.attn", ) else: assert False @@ -199,7 +199,7 @@ def build_model(self, max_tokens_per_batch): True, 1e-05, False, - name=f"layers_{i}_norm_2", + name=f"layers.{i}.norm_2", ) # mlp layernorm_output = ffmodel.dense( @@ -207,7 +207,7 @@ def build_model(self, max_tokens_per_batch): 4 * self.mpt_config.hidden_size, ActiMode.AC_MODE_NONE, False, - name=f"layers_{i}_ffn_up_proj", + name=f"layers.{i}.ffn.up_proj", ) layernorm_output = ffmodel.gelu(layernorm_output) intermediate_output = ffmodel.dense( @@ -215,7 +215,7 @@ def build_model(self, max_tokens_per_batch): self.mpt_config.hidden_size, ActiMode.AC_MODE_NONE, False, - name=f"layers_{i}_ffn_down_proj", + name=f"layers.{i}.ffn.down_proj", ) _, all_final_norm = ffmodel.residual_layer_norm( @@ -227,7 +227,7 @@ def build_model(self, max_tokens_per_batch): True, 1e-05, False, - name=f"transformer_norm_f", + name=f"norm_f", ) lm_head = ffmodel.dense( all_final_norm, @@ -252,8 +252,8 @@ def build_model(self, max_tokens_per_batch): def convert_hf_weight_name(name): return ( name.replace("transformer.blocks.", "layers.") - .replace(".", "_") - .replace("attn_out_proj", "attention_wo") + .replace("transformer.", "") + .replace("attn.out_proj", "attn.o_proj") ) def convert_hf_model(model, dst_folder): @@ -261,9 +261,9 @@ def convert_hf_model(model, dst_folder): for name, params in model.named_parameters(): name = FlexFlowMPT.convert_hf_weight_name(name) if "Wqkv" in name: - name_q = name.replace("attn_Wqkv", "attention_wq") - name_k = name.replace("attn_Wqkv", "attention_wk") - name_v = name.replace("attn_Wqkv", "attention_wv") + name_q = name.replace("attn.Wqkv", "attn.q_proj") + name_k = name.replace("attn.Wqkv", "attn.k_proj") + name_v = name.replace("attn.Wqkv", "attn.v_proj") q, k, v = torch.split( params, [ @@ -280,6 +280,6 @@ def convert_hf_model(model, dst_folder): params.detach().cpu().numpy().tofile(os.path.join(dst_folder, name)) shutil.copy( - os.path.join(dst_folder, "transformer_wte_weight"), - os.path.join(dst_folder, "lm_head_weight"), + os.path.join(dst_folder, "wte.weight"), + os.path.join(dst_folder, "lm_head.weight"), ) diff --git a/python/flexflow/serve/models/opt.py b/python/flexflow/serve/models/opt.py index 9a03cf6e78..51c76c520b 100644 --- a/python/flexflow/serve/models/opt.py +++ b/python/flexflow/serve/models/opt.py @@ -133,7 +133,7 @@ def build_model(self, max_tokens_per_batch): axes, self.opt_config.layer_norm_elementwise_affine, 1e-05, - name=f"layers_{i}_attention_layer_norm", + name=f"layers.{i}.self_attn_layer_norm", ) else: hidden_states = ffmodel.add(token, positional_embedding) @@ -157,7 +157,7 @@ def build_model(self, max_tokens_per_batch): (self.opt_config.hidden_size / self.opt_config.num_attention_heads) ** (-0.5), # scaling_factor False, # qk_prod_scaling - name=f"layers_{i}_attention", + name=f"layers.{i}.self_attn", ) elif self.mode == InferenceMode.TREE_VERIFY_MODE: mha = ffmodel.inc_multihead_self_attention_verify( @@ -177,7 +177,7 @@ def build_model(self, max_tokens_per_batch): (self.opt_config.hidden_size / self.opt_config.num_attention_heads) ** (-0.5), # scaling_factor False, # qk_prod_scaling - name=f"layers_{i}_attention", + name=f"layers.{i}.self_attn", ) elif self.mode == InferenceMode.INC_DECODING_MODE: mha = ffmodel.inc_multihead_self_attention( @@ -197,7 +197,7 @@ def 
build_model(self, max_tokens_per_batch): (self.opt_config.hidden_size / self.opt_config.num_attention_heads) ** (-0.5), # scaling_factor False, # qk_prod_scaling - name=f"layers_{i}_attention", + name=f"layers.{i}.self_attn", ) else: assert False @@ -209,7 +209,7 @@ def build_model(self, max_tokens_per_batch): axes, self.opt_config.layer_norm_elementwise_affine, 1e-05, - name=f"layers_{i}_add_bias_residual_layer_norm", + name=f"layers.{i}.add_bias_residual_layer_norm", ) if not self.opt_config.do_layer_norm_before: @@ -220,14 +220,14 @@ def build_model(self, max_tokens_per_batch): self.opt_config.ffn_dim, ActiMode.AC_MODE_RELU, True, - name=f"layers_{i}_fc1", + name=f"layers.{i}.fc1", ) fc2 = ffmodel.dense( fc1, self.opt_config.hidden_size, ActiMode.AC_MODE_NONE, True, - name=f"layers_{i}_fc2", + name=f"layers.{i}.fc2", ) if not self.opt_config.do_layer_norm_before: @@ -239,7 +239,7 @@ def build_model(self, max_tokens_per_batch): axes, self.opt_config.layer_norm_elementwise_affine, 1e-05, - name=f"layers_{i}_final_layer_norm", + name=f"layers.{i}.final_layer_norm", ) _, all_final_norm = ffmodel.residual_layer_norm( @@ -257,7 +257,7 @@ def build_model(self, max_tokens_per_batch): self.opt_config.vocab_size, ActiMode.AC_MODE_NONE, False, - name="embed_tokens_weight_lm_head", + name="lm_head", ) if self.mode == InferenceMode.BEAM_SEARCH_MODE: @@ -279,17 +279,11 @@ def build_model(self, max_tokens_per_batch): def convert_hf_weight_name(name): return ( - name.replace(".", "_") - .replace("decoder_", "") - .replace("model_", "") - .replace("self_attn", "attention") - .replace("q_proj", "wq") - .replace("k_proj", "wk") - .replace("v_proj", "wv") - .replace("out_proj", "wo") - .replace("attention_wo_bias", "add_bias_residual_layer_norm_attn_bias") + name.replace("decoder.", "") + .replace("model.", "") + .replace("self_attn.wo.bias", "add_bias_residual_layer_norm.attn_bias") .replace( - "_final_layer_norm", "_add_bias_residual_layer_norm" + ".final_layer_norm", ".add_bias_residual_layer_norm" ) # important to use the leading "_" to avoid matching the last LayerNorm ) @@ -300,6 +294,6 @@ def convert_hf_model(model, dst_folder): params.detach().cpu().numpy().tofile(f"{dst_folder}/{name}") # copy embedding weights shutil.copy( - os.path.join(dst_folder, "embed_tokens_weight"), - os.path.join(dst_folder, "embed_tokens_weight_lm_head"), + os.path.join(dst_folder, "embed_tokens.weight"), + os.path.join(dst_folder, "lm_head.weight"), ) diff --git a/python/flexflow/serve/models/starcoder.py b/python/flexflow/serve/models/starcoder.py index cd6a7304e6..8ed8fcfa18 100644 --- a/python/flexflow/serve/models/starcoder.py +++ b/python/flexflow/serve/models/starcoder.py @@ -106,7 +106,7 @@ def build_model(self, max_tokens_per_batch): self.data_type, None, embed_init, - name="transformer_wte", + name="wte", ) positional_embedding = ffmodel.embedding( position_tensor, @@ -116,7 +116,7 @@ def build_model(self, max_tokens_per_batch): self.data_type, None, embed_init, - name="transformer_wpe", + name="wpe", ) axes = [ @@ -134,7 +134,7 @@ def build_model(self, max_tokens_per_batch): axes, True, self.starcoder_config.layer_norm_epsilon, - name=f"layers_{i}_ln_1", + name=f"layers.{i}.ln_1", ) assert self.mode == InferenceMode.INC_DECODING_MODE @@ -154,7 +154,7 @@ def build_model(self, max_tokens_per_batch): DataType.DT_NONE, # data_type None, # kernel initializer False, # apply_rotary_embedding - name=f"layers_{i}_attention", + name=f"layers.{i}.attn.c_attn", ) residual, l2_norm = ffmodel.residual_layer_norm( @@ -166,7 
+166,7 @@ def build_model(self, max_tokens_per_batch): axes, True, self.starcoder_config.layer_norm_epsilon, - name=f"layers_{i}_ln_2", + name=f"layers.{i}.ln_2", ) # mlp @@ -176,7 +176,7 @@ def build_model(self, max_tokens_per_batch): self.starcoder_config.intermediate_size, ActiMode.AC_MODE_NONE, True, - name=f"layers_{i}_mlp_c_fc", + name=f"layers.{i}.mlp.c_fc", ) activation = ffmodel.gelu(c_fc, False) c_proj = ffmodel.dense( @@ -184,7 +184,7 @@ def build_model(self, max_tokens_per_batch): self.starcoder_config.hidden_size, ActiMode.AC_MODE_NONE, True, - name=f"layers_{i}_mlp_c_proj", + name=f"layers.{i}.mlp.c_proj", ) _, ln_f = ffmodel.residual_layer_norm( @@ -195,7 +195,7 @@ def build_model(self, max_tokens_per_batch): axes, True, self.starcoder_config.layer_norm_epsilon, - name=f"transformer_ln_f", + name=f"ln_f", ) lm_head = ffmodel.dense( ln_f, @@ -219,11 +219,11 @@ def build_model(self, max_tokens_per_batch): def convert_hf_model(model, dst_folder): os.makedirs(dst_folder, exist_ok=True) for name, params in model.named_parameters(): - name = name.replace("transformer.h", "layers").replace(".", "_") - if "c_attn_weight" in name: - name_q = name.replace("attn_c_attn", "attention_wq") - name_k = name.replace("attn_c_attn", "attention_wk") - name_v = name.replace("attn_c_attn", "attention_wv") + name = name.replace("transformer.h", "layers").replace("transformer", "") + if "attn.c_attn.weight" in name: + name_q = name.replace("attn.c_attn", "attn.c_attn.q_proj") + name_k = name.replace("attn.c_attn", "attn.c_attn.k_proj") + name_v = name.replace("attn.c_attn", "attn.c_attn.v_proj") q, k, v = torch.split( params, [ @@ -236,10 +236,10 @@ def convert_hf_model(model, dst_folder): q.detach().cpu().numpy().tofile(os.path.join(dst_folder, name_q)) k.detach().cpu().numpy().tofile(os.path.join(dst_folder, name_k)) v.detach().cpu().numpy().tofile(os.path.join(dst_folder, name_v)) - elif "c_attn_bias" in name: - name_q = name.replace("attn_c_attn", "attention_wq") - name_k = name.replace("attn_c_attn", "attention_wk") - name_v = name.replace("attn_c_attn", "attention_wv") + elif "attn.c_attn.bias" in name: + name_q = name.replace("attn.c_attn", "attn.c_attn.q_proj") + name_k = name.replace("attn.c_attn", "attn.c_attn.k_proj") + name_v = name.replace("attn.c_attn", "attn.c_attn.v_proj") q, k, v = torch.split( params, [ @@ -252,14 +252,14 @@ def convert_hf_model(model, dst_folder): q.detach().cpu().numpy().tofile(os.path.join(dst_folder, name_q)) k.detach().cpu().numpy().tofile(os.path.join(dst_folder, name_k)) v.detach().cpu().numpy().tofile(os.path.join(dst_folder, name_v)) - elif "c_proj_bias" in name: - name = name.replace("attn_c_proj", "attention_wo") + elif "attn.c_proj.bias" in name: + name = name.replace("attn.c_proj", "attn.c_attn.o_proj") params.detach().cpu().numpy().tofile(os.path.join(dst_folder, name)) - elif "c_proj_weight" in name: - name = name.replace("attn_c_proj", "attention_wo") + elif "attn.c_proj.weight" in name: + name = name.replace("attn.c_proj", "attn.c_attn.o_proj") params.detach().cpu().numpy().tofile(os.path.join(dst_folder, name)) else: params.detach().cpu().numpy().tofile(os.path.join(dst_folder, name)) model.lm_head.weight.detach().cpu().numpy().tofile( - os.path.join(dst_folder, "lm_head_weight") + os.path.join(dst_folder, "lm_head.weight") ) diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py index da2f1246a2..e20a8760cf 100644 --- a/python/flexflow/serve/serve.py +++ b/python/flexflow/serve/serve.py @@ -186,9 +186,11 @@ def 
download_hf_weights_if_needed(self): os.path.expanduser(self.cache_path), "weights", self.model_name.lower(), - "full-precision" - if self.data_type == DataType.DT_FLOAT - else "half-precision", + ( + "full-precision" + if self.data_type == DataType.DT_FLOAT + else "half-precision" + ), ) if self.refresh_cache: print( @@ -219,9 +221,11 @@ def download_hf_weights_if_needed(self): hf_model = AutoModelForCausalLM.from_pretrained( self.model_name, trust_remote_code=True, - torch_dtype=torch.float32 - if self.data_type == DataType.DT_FLOAT - else torch.float16, + torch_dtype=( + torch.float32 + if self.data_type == DataType.DT_FLOAT + else torch.float16 + ), ) # Print log message to notify user download of model has finished if not os.path.exists(self.model_name) or os.path.isdir(self.model_name): @@ -575,11 +579,13 @@ def download_hf_config(self): print(f"Creating directory {self.config_dir} (if it doesn't exist)...") print(f"Saving {self.peft_model_id} configs to file {self.config_path}...") with open(self.config_path, "w") as json_file: + class SetEncoder(json.JSONEncoder): def default(self, obj): if isinstance(obj, set): return list(obj) return super().default(obj) + json.dump(self.hf_config.to_dict(), json_file, indent=2, cls=SetEncoder) def __get_revision_hashes(self, peft_model_id: str): @@ -619,9 +625,11 @@ def download_hf_weights_if_needed(self): os.path.expanduser(self.cache_path), "weights", self.peft_model_id.lower(), - "full-precision" - if self.data_type == DataType.DT_FLOAT - else "half-precision", + ( + "full-precision" + if self.data_type == DataType.DT_FLOAT + else "half-precision" + ), ) if self.refresh_cache: print( @@ -658,9 +666,11 @@ def download_hf_weights_if_needed(self): self.hf_config.base_model_name_or_path, return_dict=True, trust_remote_code=True, - torch_dtype=torch.float32 - if self.data_type == DataType.DT_FLOAT - else torch.float16, + torch_dtype=( + torch.float32 + if self.data_type == DataType.DT_FLOAT + else torch.float16 + ), # device_map="auto", ) hf_peft_model = PeftModel.from_pretrained(hf_base_model, self.peft_model_id) From 583cb28df18726d520d86b0cceb0fd926fbf0bc1 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Tue, 20 Feb 2024 18:20:57 +0000 Subject: [PATCH 03/32] fix --- src/runtime/file_loader.cc | 31 ++++++++++++++----------------- 1 file changed, 14 insertions(+), 17 deletions(-) diff --git a/src/runtime/file_loader.cc b/src/runtime/file_loader.cc index fa19c9b22d..dfa3748b9a 100644 --- a/src/runtime/file_loader.cc +++ b/src/runtime/file_loader.cc @@ -219,10 +219,10 @@ void load_attention_weights_v2(DT *ptr, int tensor_parallelism_degree) { // layers_0_attention_wq_weight // layers_0_self_attn_q_proj_weight - std::string q_file = layer_name + "_wq_weight"; - std::string k_file = layer_name + "_wk_weight"; - std::string v_file = layer_name + "_wv_weight"; - std::string o_file = layer_name + "_wo_weight"; + std::string q_file = layer_name + ".q_proj.weight"; + std::string k_file = layer_name + ".k_proj.weight"; + std::string v_file = layer_name + ".v_proj.weight"; + std::string o_file = layer_name + ".o_proj.weight"; std::vector weight_filenames = {q_file, k_file, v_file}; int file_index = 0; @@ -409,10 +409,10 @@ void load_attention_weights_quantized(char *ptr, bool use_full_precision) { // layers_0_attention_wq_weight // layers_0_self_attn_q_proj_weight - std::string q_file = layer_name + "_wq_weight"; - std::string k_file = layer_name + "_wk_weight"; - std::string v_file = layer_name + "_wv_weight"; - std::string o_file = layer_name + 
"_wo_weight"; + std::string q_file = layer_name + ".q_proj.weight"; + std::string k_file = layer_name + ".k_proj.weight"; + std::string v_file = layer_name + ".v_proj.weight"; + std::string o_file = layer_name + ".o_proj.weight"; std::vector weight_filenames = {q_file, k_file, v_file, o_file}; int file_index = 0; @@ -690,7 +690,7 @@ void FileDataLoader::load_quantization_weight(FFModel *ff, if (weight_idx > 0) { assert(weight_idx == 0 || weight_idx == 1); if (weight_filename != "embed_tokens_weight_lm_head") { - weight_filename += weight_idx == 0 ? "_weight" : "_bias"; + weight_filename += weight_idx == 0 ? ".weight" : ".bias"; } } load_from_quantized_file(data, @@ -728,12 +728,9 @@ void FileDataLoader::load_single_weight_tensor(FFModel *ff, if (l->op_type == OP_INC_MULTIHEAD_SELF_ATTENTION || l->op_type == OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION || l->op_type == OP_TREE_INC_MULTIHEAD_SELF_ATTENTION) { - if (weight_filename.find("self_attention") != std::string::npos) { - load_attention_weights_multi_query( - data, weight_filename, weights_folder, hidden_dim, num_heads); - } else if (weight_filename.find("attention") != std::string::npos && - weight_filename.rfind("attention") == - weight_filename.length() - strlen("attention")) { + if (weight_filename.find("attention") != std::string::npos && + weight_filename.rfind("attention") == + weight_filename.length() - strlen("attention")) { if (weight_idx == 0) { load_attention_weights_v2(data, num_heads, @@ -765,7 +762,7 @@ void FileDataLoader::load_single_weight_tensor(FFModel *ff, assert(weight_idx >= 0 || weight_idx <= 2); weight_filename += (weight_idx == 0) ? "_attn_bias" - : ((weight_idx == 1) ? "_weight" : "_bias"); + : ((weight_idx == 1) ? ".weight" : ".bias"); std::cout << "Loading weight file " << weight_filename << std::endl; std::string weight_filepath = join_path({weights_folder, weight_filename}); load_from_file(data, volume, weight_filepath); @@ -774,7 +771,7 @@ void FileDataLoader::load_single_weight_tensor(FFModel *ff, assert(weight_idx == 0 || weight_idx == 1); // handle exception if (weight_filename != "embed_tokens_weight_lm_head") { - weight_filename += weight_idx == 0 ? "_weight" : "_bias"; + weight_filename += weight_idx == 0 ? 
".weight" : ".bias"; } std::cout << "Loading weight file " << weight_filename << std::endl; std::string weight_filepath = join_path({weights_folder, weight_filename}); From 38ed49c1fd09898e4a058f1554cc1cce731eca26 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 21 Feb 2024 21:23:05 +0000 Subject: [PATCH 04/32] fix --- src/runtime/file_loader.cc | 49 ++++++++++++++++---------------------- 1 file changed, 21 insertions(+), 28 deletions(-) diff --git a/src/runtime/file_loader.cc b/src/runtime/file_loader.cc index dfa3748b9a..596c441123 100644 --- a/src/runtime/file_loader.cc +++ b/src/runtime/file_loader.cc @@ -728,35 +728,28 @@ void FileDataLoader::load_single_weight_tensor(FFModel *ff, if (l->op_type == OP_INC_MULTIHEAD_SELF_ATTENTION || l->op_type == OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION || l->op_type == OP_TREE_INC_MULTIHEAD_SELF_ATTENTION) { - if (weight_filename.find("attention") != std::string::npos && - weight_filename.rfind("attention") == - weight_filename.length() - strlen("attention")) { - if (weight_idx == 0) { - load_attention_weights_v2(data, - num_heads, - num_kv_heads, - hidden_dim, - qkv_inner_dim, - weight_filename, - weights_folder, - volume, - tensor_parallelism_degree); - } else { - long long value; - l->get_int_property("final_bias", value); - bool final_bias = (bool)value; - load_attention_bias_v2(data, - num_heads, - num_kv_heads, - hidden_dim, - qkv_inner_dim, - final_bias, - weight_filename, - weights_folder); - } - + if (weight_idx == 0) { + load_attention_weights_v2(data, + num_heads, + num_kv_heads, + hidden_dim, + qkv_inner_dim, + weight_filename, + weights_folder, + volume, + tensor_parallelism_degree); } else { - assert(false); + long long value; + l->get_int_property("final_bias", value); + bool final_bias = (bool)value; + load_attention_bias_v2(data, + num_heads, + num_kv_heads, + hidden_dim, + qkv_inner_dim, + final_bias, + weight_filename, + weights_folder); } } else if (l->op_type == OP_ADD_BIAS_RESIDUAL_LAYERNORM) { assert(weight_idx >= 0 || weight_idx <= 2); From 0387d51616573f8c1e946d60f80c976b64d94672 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 21 Feb 2024 23:03:11 +0000 Subject: [PATCH 05/32] fix --- python/flexflow/serve/models/opt.py | 3 ++- src/runtime/file_loader.cc | 14 +++++--------- 2 files changed, 7 insertions(+), 10 deletions(-) diff --git a/python/flexflow/serve/models/opt.py b/python/flexflow/serve/models/opt.py index 51c76c520b..f725a08e65 100644 --- a/python/flexflow/serve/models/opt.py +++ b/python/flexflow/serve/models/opt.py @@ -281,7 +281,8 @@ def convert_hf_weight_name(name): return ( name.replace("decoder.", "") .replace("model.", "") - .replace("self_attn.wo.bias", "add_bias_residual_layer_norm.attn_bias") + .replace("self_attn.out_proj", "self_attn.o_proj") + .replace("self_attn.o_proj.bias", "add_bias_residual_layer_norm.attn_bias") .replace( ".final_layer_norm", ".add_bias_residual_layer_norm" ) # important to use the leading "_" to avoid matching the last LayerNorm diff --git a/src/runtime/file_loader.cc b/src/runtime/file_loader.cc index 596c441123..835012edc1 100644 --- a/src/runtime/file_loader.cc +++ b/src/runtime/file_loader.cc @@ -136,12 +136,12 @@ void load_attention_bias_v2(DT *ptr, bool final_bias, std::string layer_name, std::string weights_folder) { - std::string q_file = layer_name + "_wq_bias"; - std::string k_file = layer_name + "_wk_bias"; - std::string v_file = layer_name + "_wv_bias"; + std::string q_file = layer_name + ".q_proj.bias"; + std::string k_file = layer_name + ".k_proj.bias"; 
+ std::string v_file = layer_name + ".v_proj.bias"; std::vector bias_files = {q_file, k_file, v_file}; if (final_bias) { - std::string o_file = layer_name + "_wo_bias"; + std::string o_file = layer_name + ".o_proj.bias"; bias_files.push_back(o_file); } @@ -217,8 +217,6 @@ void load_attention_weights_v2(DT *ptr, std::string weights_folder, size_t volume, int tensor_parallelism_degree) { - // layers_0_attention_wq_weight - // layers_0_self_attn_q_proj_weight std::string q_file = layer_name + ".q_proj.weight"; std::string k_file = layer_name + ".k_proj.weight"; std::string v_file = layer_name + ".v_proj.weight"; @@ -407,8 +405,6 @@ void load_attention_weights_quantized(char *ptr, std::string weights_folder, DataType data_type, bool use_full_precision) { - // layers_0_attention_wq_weight - // layers_0_self_attn_q_proj_weight std::string q_file = layer_name + ".q_proj.weight"; std::string k_file = layer_name + ".k_proj.weight"; std::string v_file = layer_name + ".v_proj.weight"; @@ -754,7 +750,7 @@ void FileDataLoader::load_single_weight_tensor(FFModel *ff, } else if (l->op_type == OP_ADD_BIAS_RESIDUAL_LAYERNORM) { assert(weight_idx >= 0 || weight_idx <= 2); weight_filename += (weight_idx == 0) - ? "_attn_bias" + ? ".attn_bias" : ((weight_idx == 1) ? ".weight" : ".bias"); std::cout << "Loading weight file " << weight_filename << std::endl; std::string weight_filepath = join_path({weights_folder, weight_filename}); From 4f59521b2d8b41aacb9567daae7561d358ef036a Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Thu, 22 Feb 2024 03:56:30 +0000 Subject: [PATCH 06/32] backup --- python/flexflow/serve/serve.py | 172 ++++++++++++++++++++++++--------- 1 file changed, 129 insertions(+), 43 deletions(-) diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py index e20a8760cf..f052b21033 100644 --- a/python/flexflow/serve/serve.py +++ b/python/flexflow/serve/serve.py @@ -199,7 +199,7 @@ def download_hf_weights_if_needed(self): if os.path.exists(self.weights_path): shutil.rmtree(self.weights_path) os.makedirs(self.weights_path, exist_ok=True) - print(f"Creating directory {self.weights_path} (if it doesn't exist)...") + #print(f"Creating directory {self.weights_path} (if it doesn't exist)...") ff_revision, ff_revision_file, latest_revision = self.__get_revision_hashes( self.model_name, weights=True @@ -451,17 +451,6 @@ def stop_server(self): self.rm.stop_server() print("Background server stopped.") - def __enter__(self): - # Start the server when entering the context - # self.rm.start_server(self.model.ffmodel) - return self - - def __exit__(self, exc_type, exc_value, traceback): - # Stop the server when exiting the context - # self.rm.stop_server() - if exc_type: - print(f"Exception occurred: {exc_value}") - class SSM(LLM): """This class creates a SSM (Small-Speculative Model) object based on a model from HuggingFace""" @@ -539,18 +528,20 @@ def compile( ) -class PEFT: +class PEFT(LLM): """This class creates a PEFT (parameter-efficient transformer) object to be used in concert with a LLM or SSM""" def __init__( self, + base_model: LLM, peft_model_id: str, + config: PeftConfig = None, data_type: DataType = DataType.DT_HALF, cache_path: str = "", refresh_cache: bool = False, ): - self.hf_config = PeftConfig.from_pretrained(peft_model_id) self.peft_model_id = peft_model_id + self.hf_config = config if config is not None else PeftConfig.from_pretrained(peft_model_id) self.peft_type = self.hf_config.peft_type if self.peft_type != "LORA": raise RuntimeError( @@ -565,9 +556,9 @@ def 
__init__( raise ValueError( f"PEFT model {peft_model_id} does not have an associated based model" ) - self.base_model = LLM( - self.hf_config.base_model_name_or_path, data_type, cache_path, refresh_cache - ) + self.base_model = base_model + if refresh_cache: + self.base_model.refresh_cache = True def download_hf_config(self): """Save the HuggingFace model configs to a json file. Useful mainly to run the C++ inference code.""" @@ -587,25 +578,11 @@ def default(self, obj): return super().default(obj) json.dump(self.hf_config.to_dict(), json_file, indent=2, cls=SetEncoder) + + self.base_model.download_hf_config() def __get_revision_hashes(self, peft_model_id: str): - ff_revision = None - ff_revision_file = os.path.join(self.weights_path, "rev_sha.txt") - if os.path.exists(ff_revision_file): - ff_revision = "".join(open(ff_revision_file).read().split()) - - if os.path.exists(peft_model_id) and os.path.isdir(peft_model_id): - # Local model - files = os.listdir(peft_model_id) - state = files + [ - os.path.getmtime(os.path.join(peft_model_id, f)) for f in files - ] - latest_revision = hashlib.md5(str(state).encode("utf-8")).hexdigest() - else: - # Remote HuggingFace model - hf_api = HfApi() - latest_revision = hf_api.model_info(self.peft_model_id).sha - return ff_revision, ff_revision_file, latest_revision + return super().__get_revision_hashes(peft_model_id, weights=True) def convert_peft_model(self, hf_peft_model, weights_path): for name, params in hf_peft_model.named_parameters(): @@ -620,6 +597,8 @@ def download_hf_weights_if_needed(self): """Check in the folder specified by the cache_path whether the PEFT's model weights are available and up to date. If not, or if the refresh_cache parameter is set to True, download new weights. """ + self.base_model.download_hf_weights_if_needed() + # Use local cache, or download new version self.weights_path = os.path.join( os.path.expanduser(self.cache_path), @@ -638,7 +617,7 @@ def download_hf_weights_if_needed(self): if os.path.exists(self.weights_path): shutil.rmtree(self.weights_path) os.makedirs(self.weights_path, exist_ok=True) - print(f"Creating directory {self.weights_path} (if it doesn't exist)...") + #print(f"Creating directory {self.weights_path} (if it doesn't exist)...") ff_revision, ff_revision_file, latest_revision = self.__get_revision_hashes( self.peft_model_id @@ -649,21 +628,15 @@ def download_hf_weights_if_needed(self): if not os.path.exists(self.peft_model_id) or os.path.isdir( self.peft_model_id ): - # Local model print( f"'{self.peft_model_id}' model weights not found in cache or outdated. Downloading from huggingface.co ..." ) else: - # Remote model print( f"'{self.peft_model_id}' local model weights were updated! Converting new weights now..." 
) - # Download base model from HuggingFace, or load it from the local folder - self.base_model.download_hf_weights_if_needed() - self.base_model.download_hf_tokenizer_if_needed() - self.base_model.download_hf_config() hf_base_model = AutoModelForCausalLM.from_pretrained( - self.hf_config.base_model_name_or_path, + self.base_model.model_name, return_dict=True, trust_remote_code=True, torch_dtype=( @@ -673,7 +646,7 @@ def download_hf_weights_if_needed(self): ), # device_map="auto", ) - hf_peft_model = PeftModel.from_pretrained(hf_base_model, self.peft_model_id) + hf_peft_model = PeftModel.from_pretrained(hf_base_model, self.peft_model_id, config=self.hf_config) # Print log message to notify user download of model has finished if not os.path.exists(self.peft_model_id) or os.path.isdir( self.peft_model_id @@ -692,3 +665,116 @@ def download_hf_weights_if_needed(self): torch.cuda.empty_cache() else: print(f"Loading '{self.peft_model_id}' model weights from the cache...") + + def download_hf_tokenizer_if_needed(self): + self.base_model.download_hf_tokenizer_if_needed() + + def compile( + self, + generation_config: GenerationConfig = GenerationConfig(), + max_requests_per_batch: int = 1, + max_seq_length: int = 256, + max_tokens_per_batch: int = 64, + model_specific_data_parallelism_degree: int = None, + model_specific_tensor_parallelism_degree: int = None, + model_specific_pipeline_parallelism_degree: int = None, + ssms: list = [], + ): + self.base_model.ssms = ssms + self.base_model.generation_config = GenerationConfig() + self.base_model.ffconfig = FFConfig() + if len(ssms) > 0: + assert type(self.base_model) == LLM + mode = InferenceMode.TREE_VERIFY_MODE + elif type(self.base_model) == SSM: + mode = InferenceMode.BEAM_SEARCH_MODE + else: + assert type(self.base_model) == LLM + mode = InferenceMode.INC_DECODING_MODE + + # Apply model-specific parallelism degrees, if needed + if model_specific_data_parallelism_degree: + self.base_model.ffconfig.data_parallelism_degree = ( + model_specific_data_parallelism_degree + ) + if model_specific_tensor_parallelism_degree: + self.base_model.ffconfig.tensor_parallelism_degree = ( + model_specific_tensor_parallelism_degree + ) + if model_specific_pipeline_parallelism_degree: + self.base_model.ffconfig.pipeline_parallelism_degree = ( + model_specific_pipeline_parallelism_degree + ) + + # Create request manager and set serving configuration + self.base_model.rm = RequestManager() + self.base_model.rm.set_max_requests_per_batch(max_requests_per_batch) + self.base_model.rm.set_max_tokens_per_batch(max_tokens_per_batch) + self.base_model.rm.set_max_sequence_length(max_seq_length) + + # Instantiate the relevant model + self.base_model.model = self.model_class( + mode, + generation_config, + self.base_model.ffconfig, + self.base_model.hf_config, + self.base_model.data_type, + max_tokens_per_batch, + ) + + # TODO: add linear layers + + # Download the weights from huggingface (if needed) + self.download_hf_weights_if_needed() + + # Create file data loader, load weights into tensors + model_configs = self.base_model.config_class(self.base_model.hf_config) + + self.fileloader = FileDataLoader( + self.weights_path, + model_configs.num_attention_heads, + model_configs.num_key_value_heads, + model_configs.hidden_size, + model_configs.hidden_size // model_configs.num_attention_heads, + self.ffconfig.tensor_parallelism_degree, + self.data_type == DataType.DT_FLOAT, + ) + + # Register weights file loader + self.im = InferenceManager() + 
self.im.register_model_weights_loader(self.model.ffmodel, self.fileloader) + + # Download the tokenizer from huggingface (if needed) and load them + self.download_hf_tokenizer_if_needed() + + # Create tokenizer (this must be done after we have downloaded the tokenizer + bos_token_id = ( + -1 if self.hf_config.bos_token_id is None else self.hf_config.bos_token_id + ) + eos_token_id = ( + -1 if self.hf_config.eos_token_id is None else self.hf_config.eos_token_id + ) + self.rm.register_tokenizer( + self.model_type, bos_token_id, eos_token_id, self.tokenizer_path + ) + self.rm.register_output_filepath(self.output_file) + + for ssm in self.ssms: + self.rm.register_ssm_model(ssm.model.ffmodel) + + # start background server + if (mode == InferenceMode.TREE_VERIFY_MODE) or ( + mode == InferenceMode.INC_DECODING_MODE + ): + import atexit + + atexit.register(self.rm.stop_server) + + def generate(self, prompts: Union[str, List[str]], max_length: int = 128): + super().generate(prompts, max_length) + + def start_server(self): + self.base_model.start_server() + + def stop_server(self): + self.base_model.stop_server() From 37451d6a187d5a8f1205825a2a1988f1130d0110 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 23 Feb 2024 02:43:12 +0000 Subject: [PATCH 07/32] . --- include/flexflow/ffconst.h | 3 +- include/flexflow/model.h | 7 ++--- include/flexflow/ops/lora_linear_params.h | 7 +++-- inference/incr_decoding/incr_decoding.cc | 33 +++++++++++++------- inference/models/llama.cc | 7 +---- inference/models/opt.cc | 6 +--- python/flexflow/serve/serve.py | 2 +- src/ops/fused.cu | 6 ++-- src/ops/lora_linear.cc | 26 ++++++++++++++-- src/ops/lora_linear_params.cc | 37 +++++++++++++++++------ src/runtime/ffconst_utils.cc | 6 ++-- src/runtime/file_loader.cc | 2 +- src/runtime/graph.cc | 3 +- src/runtime/model.cc | 4 +-- src/runtime/request_manager.cc | 26 ++++++++-------- 15 files changed, 106 insertions(+), 69 deletions(-) diff --git a/include/flexflow/ffconst.h b/include/flexflow/ffconst.h index fb12adf2d3..66e252db46 100644 --- a/include/flexflow/ffconst.h +++ b/include/flexflow/ffconst.h @@ -179,8 +179,7 @@ enum OperatorType { OP_TREE_INC_MULTIHEAD_SELF_ATTENTION, OP_SAMPLING, // PEFT Ops - OP_LORA_MLP_FIRST, - OP_LORA_MLP_SECOND, + OP_LORA, // Parallel Ops OP_REPARTITION, OP_COMBINE, diff --git a/include/flexflow/model.h b/include/flexflow/model.h index 34ace0c5dc..2dd0dbf686 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -837,18 +837,16 @@ class FFModel { // ======================================== // PEFT Layers // ======================================== + void add_lora_layer(std::string target_module_name, char const *name); void lora_linear(Tensor const input, Tensor const output, - OperatorType _type, char const *name = nullptr); // ======================================== // Inference APIs // ======================================== std::vector generate(std::vector const &requests); - PEFTModelID register_peft_model( - LoraLinearConfig const mlp_first = LoraLinearConfig::DefaultConfig, - LoraLinearConfig const mlp_second = LoraLinearConfig::DefaultConfig); + PEFTModelID register_peft_model(LoraLinearConfig const peft_config); Tensor create_tensor_legion_ordering(int num_dim, int const dims[], @@ -1173,6 +1171,7 @@ class FFModel { std::vector layers; std::vector operators; + std::vector peft_operators; std::vector parameters; FFHandler handlers[MAX_NUM_WORKERS]; Legion::Future current_metrics; diff --git a/include/flexflow/ops/lora_linear_params.h 
b/include/flexflow/ops/lora_linear_params.h index e82243fd67..acbd9c3c67 100644 --- a/include/flexflow/ops/lora_linear_params.h +++ b/include/flexflow/ops/lora_linear_params.h @@ -12,12 +12,12 @@ namespace FlexFlow { class LoraLinearConfig { public: - static const LoraLinearConfig DefaultConfig; + static const LoraLinearConfig EmptyConfig; LoraLinearConfig(); LoraLinearConfig(int rank, OptimizerType type = OPTIMIZER_TYPE_SGD, float learning_rate = 1e-4); - LoraLinearConfig(std::string const &cache_folder_, + LoraLinearConfig(std::string const &config_folder_, std::string const &peft_model_id_); friend bool operator==(LoraLinearConfig const &lhs, LoraLinearConfig const &rhs); @@ -28,11 +28,12 @@ class LoraLinearConfig { int rank; OptimizerType optimizer_type; float learning_rate; - std::string cache_folder; + std::string config_folder; // Huggingface std::string peft_model_id; int lora_alpha; float lora_dropout; + std::vector target_modules; // whether to load weights from file, instead of initializing them randomly bool load_weights_from_file; }; diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc index d376c3e39c..502aa7fc6c 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -44,6 +44,7 @@ void parse_input_args(char **argv, bool &use_full_precision, bool &verbose, bool &do_sample, + bool &enable_peft, float &temperature, float &topp, int &max_requests_per_batch, @@ -58,6 +59,10 @@ void parse_input_args(char **argv, } continue; } + if (!strcmp(argv[i], "-enable-peft")) { + enable_peft = true; + continue; + } if (!strcmp(argv[i], "-peft-model")) { peft_model_name = std::string(argv[++i]); for (char &c : peft_model_name) { @@ -137,6 +142,7 @@ void FlexFlow::top_level_task(Task const *task, bool use_full_precision = false; bool verbose = false; bool do_sample = false; + bool enable_peft = false; float temperature = 0.0f; float topp = 0.0f; int max_requests_per_batch = 8; @@ -178,6 +184,14 @@ void FlexFlow::top_level_task(Task const *task, << std::endl; assert(false); } + if (enable_peft && peft_model_name.empty()) { + std::cout << "PEFT enabled, but no PEFT model id passed" << std::endl; + assert(false); + } else if (!enable_peft && !peft_model_name.empty()) { + std::cout << "PEFT model id passed, but PEFT is not enabled" << std::endl; + assert(false); + } + json model_config = json::parse(config_file_handle, /*parser_callback_t */ nullptr, /*allow_exceptions */ true, @@ -212,6 +226,9 @@ void FlexFlow::top_level_task(Task const *task, assert(model_type != ModelType::UNKNOWN && "Invalid LLM model type passed (or no type was passed)."); + // load PEFT config + LoraLinearConfig peft_config = peft_model_name.empty() ? LoraLinearConfig::EmptyConfig : LoraLinearConfig(file_paths.cache_folder_path, peft_model_name); + GenerationConfig generationConfig(do_sample, temperature, topp); RequestManager *rm = RequestManager::get_request_manager(); rm->set_max_requests_per_batch(max_requests_per_batch); @@ -259,17 +276,11 @@ void FlexFlow::top_level_task(Task const *task, assert(false && "unknow model type"); } - // Register PEFT layer - LoraLinearConfig mlp_second = - peft_model_name.empty() - ? LoraLinearConfig::DefaultConfig - : LoraLinearConfig(file_paths.cache_folder_path, peft_model_name); - PEFTModelID peft_model_id = - peft_model_name.empty() - ? 
PEFTModelID::NO_ID - : model.register_peft_model( - LoraLinearConfig::DefaultConfig /*mlp_first*/, - mlp_second /*mlp_second*/); + // Add PEFT layer + PEFTModelID peft_model_id = PEFTModelID::NO_ID; + if (!peft_model_name.empty()) { + peft_model_id = model.register_peft_model(peft_config); + } // Start background server rm->start_background_server(&model); diff --git a/inference/models/llama.cc b/inference/models/llama.cc index a7a1758cc3..fd788fa904 100644 --- a/inference/models/llama.cc +++ b/inference/models/llama.cc @@ -224,12 +224,7 @@ void LLAMA::create_llama_model(FFModel &ff, 0.0f, std::string("layers." + std::to_string(i) + ".mlp.down_proj").c_str()); // Low-Rank Adapter (LoRA) for the second linear layer - ff.lora_linear( - multi, - w2, - OP_LORA_MLP_SECOND, - std::string("layers." + std::to_string(i) + ".mlp.down_proj.lora") - .c_str()); + // ff.lora_linear(std::string("down_proj"), std::string("layers." + std::to_string(i) + ".mlp.down_proj.lora").c_str()); } // final normalization and linear Tensor final_rms_norm_output[2] = {nullptr, nullptr}; diff --git a/inference/models/opt.cc b/inference/models/opt.cc index 6d04ba47f2..bc22e1a8b7 100644 --- a/inference/models/opt.cc +++ b/inference/models/opt.cc @@ -220,11 +220,7 @@ void OPT::create_opt_model(FFModel &ff, 0.0f, std::string("layers." + std::to_string(i) + ".fc2").c_str()); // Low-Rank Adapter (LoRA) for the second linear layer - ff.lora_linear( - fc1, - fc2, - OP_LORA_MLP_SECOND, - std::string("layers." + std::to_string(i) + ".fc2.lora").c_str()); + // ff.lora_linear(std::string("fc2"), std::string("layers." + std::to_string(i) + ".fc2.lora").c_str()); } // final diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py index f052b21033..7e63b5055c 100644 --- a/python/flexflow/serve/serve.py +++ b/python/flexflow/serve/serve.py @@ -722,7 +722,7 @@ def compile( max_tokens_per_batch, ) - # TODO: add linear layers + # TODO: add peft layers # Download the weights from huggingface (if needed) self.download_hf_weights_if_needed() diff --git a/src/ops/fused.cu b/src/ops/fused.cu index 574fbcb573..aca93a973d 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -266,8 +266,7 @@ __host__ void batch_size); break; } - case OP_LORA_MLP_FIRST: - case OP_LORA_MLP_SECOND: { + case OP_LORA: { assert(fused->op_num_inputs[op] == 2); assert(fused->op_num_outputs[op] == 1); Domain input_domain = my_input_accessor[0].domain; @@ -910,8 +909,7 @@ __host__ void FusedOp::peft_bwd_task(Task const *task, num_peft_tokens); break; } - case OP_LORA_MLP_FIRST: - case OP_LORA_MLP_SECOND: { + case OP_LORA: { assert(fused->op_num_inputs[op] == 2); assert(fused->op_num_outputs[op] == 1); Domain input_domain = my_input_grad_accessor[0].domain; diff --git a/src/ops/lora_linear.cc b/src/ops/lora_linear.cc index 366eca27b7..906bb91b6c 100644 --- a/src/ops/lora_linear.cc +++ b/src/ops/lora_linear.cc @@ -38,14 +38,36 @@ using Legion::TaskLauncher; using namespace FlexFlow::Kernels::LoraLinear; +void FFModel::add_lora_layer(std::string target_module_name, + char const *name) { + assert(target_module_name.length() > 0 && "LoRA target module name is empty"); + + // find target layer, and ensure uniqueness. 
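+  // the target is found by a substring match on each Linear layer's name (e.g. "down_proj" matches "layers.0.mlp.down_proj")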
+ // if the target layer already has a LoRA layer, no need to add it again (keep track of layers with lora) + Layer *target_module = nullptr; + for (Layer *it : layers) { + if (it->op_type == OP_LINEAR && it->name != nullptr && strlen(it->name) > 0) { + std::string s(it->name); + if (s.find(target_module_name) != string::npos) { + // Check that this is the only layer with target name + if (target_module != nullptr) { + fprintf(stderr, "Error, found two layers containing LoRA target module name '%s'. Layer 1: %s, Layer 2: %s\n", + target_module_name.c_str(), target_module->name, it->name); + } + target_module = it; + } + } + } + lora_linear(target_module->inputs[0], target_module->outputs[0], name); +} + void FFModel::lora_linear(Tensor const input, Tensor const output, - OperatorType op_type, char const *name) { assert(input->data_type == output->data_type); Layer *lora = nullptr; lora = new Layer(this, - op_type, + OP_LORA, output->data_type, name, 2 /*inputs*/, diff --git a/src/ops/lora_linear_params.cc b/src/ops/lora_linear_params.cc index 9d797aaed2..0edeb03d2f 100644 --- a/src/ops/lora_linear_params.cc +++ b/src/ops/lora_linear_params.cc @@ -5,24 +5,24 @@ using json = nlohmann::json; namespace FlexFlow { -const LoraLinearConfig LoraLinearConfig::DefaultConfig = LoraLinearConfig(); +const LoraLinearConfig LoraLinearConfig::EmptyConfig = LoraLinearConfig(); LoraLinearConfig::LoraLinearConfig() : rank(0), optimizer_type(OPTIMIZER_TYPE_NONE), learning_rate(0.0f), - cache_folder(""), peft_model_id(""), lora_alpha(0), lora_dropout(0.0f), + config_folder(""), peft_model_id(""), lora_alpha(0), lora_dropout(0.0f), load_weights_from_file(false) {} LoraLinearConfig::LoraLinearConfig(int _rank, OptimizerType _type, float _lr) - : rank(_rank), optimizer_type(_type), learning_rate(_lr), cache_folder(""), + : rank(_rank), optimizer_type(_type), learning_rate(_lr), config_folder(""), peft_model_id(""), lora_alpha(0), lora_dropout(0.0f), load_weights_from_file(false) {} -LoraLinearConfig::LoraLinearConfig(std::string const &cache_folder_, +LoraLinearConfig::LoraLinearConfig(std::string const &config_folder_, std::string const &peft_model_id_) { - cache_folder = cache_folder_; + config_folder = config_folder_; peft_model_id = peft_model_id_; std::string peft_inference_config_file_path = - join_path({cache_folder, "configs", peft_model_id, "config.json"}); + join_path({config_folder, peft_model_id, "config.json"}); std::ifstream config_file(peft_inference_config_file_path); if (config_file.is_open()) { try { @@ -31,6 +31,9 @@ LoraLinearConfig::LoraLinearConfig(std::string const &cache_folder_, rank = model_config["r"]; lora_alpha = model_config["lora_alpha"]; lora_dropout = model_config["lora_dropout"]; + for (auto &s : model_config["target_modules"]) { + target_modules.push_back(s); + } } catch (json::exception const &e) { std::cerr << "Error parsing PEFT config from JSON file: " << e.what() << std::endl; @@ -48,21 +51,37 @@ LoraLinearConfig::LoraLinearConfig(std::string const &cache_folder_, bool operator==(LoraLinearConfig const &lhs, LoraLinearConfig const &rhs) { if (lhs.rank == rhs.rank && lhs.optimizer_type == rhs.optimizer_type && - lhs.learning_rate == rhs.learning_rate) { + lhs.learning_rate == rhs.learning_rate && lhs.config_folder == rhs.config_folder && + lhs.peft_model_id == rhs.peft_model_id && lhs.lora_alpha == rhs.lora_alpha && + lhs.lora_dropout == rhs.lora_dropout && lhs.target_modules.size() == rhs.target_modules.size() && + lhs.load_weights_from_file == rhs.load_weights_from_file) { 
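+    // the target_modules lists must also match element by element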
+ for (int i=0; iop_type == OP_LORA_MLP_FIRST || l->op_type == OP_LORA_MLP_SECOND) { + if (l->op_type == OP_LORA) { continue; } switch (weight->data_type) { diff --git a/src/runtime/graph.cc b/src/runtime/graph.cc index 31cf3bb6a7..dae0021bb6 100644 --- a/src/runtime/graph.cc +++ b/src/runtime/graph.cc @@ -2764,8 +2764,7 @@ void FFModel::deserialize_graph_optimal_view( node = Linear::deserialize(*this, dez, inputs, num_inputs); break; } - case OP_LORA_MLP_FIRST: - case OP_LORA_MLP_SECOND: { + case OP_LORA: { node = LoraLinear::deserialize(*this, dez, inputs, num_inputs); break; } diff --git a/src/runtime/model.cc b/src/runtime/model.cc index a64fb8ec9c..92340a92db 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -3308,10 +3308,10 @@ Op *FFModel::create_operator_from_layer( return op; } // PEFT layers - case OP_LORA_MLP_FIRST: - case OP_LORA_MLP_SECOND: { + case OP_LORA: { Op *op = LoraLinear::create_operator_from_layer(*this, layer, inputs); operators.push_back(op); + peft_operators.push_back(op); return op; } default: diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 41c371d4e2..3d71fa1e6b 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -2488,25 +2488,25 @@ std::string find_layer_name_from_guid(FFModel *model, LayerID guid) { bool is_peft_operator_type(OperatorType type) { switch (type) { - case OP_LORA_MLP_FIRST: - case OP_LORA_MLP_SECOND: + case OP_LORA: return true; default: return false; } } -PEFTModelID FFModel::register_peft_model(LoraLinearConfig const mlp_first, - LoraLinearConfig const mlp_second) { - if (!(mlp_first == LoraLinearConfig::DefaultConfig && - mlp_second == LoraLinearConfig::DefaultConfig)) { - if (!config.enable_peft) { - fprintf(stderr, - "Error: trying to register PEFT model, but peft mode is not " - "enabled.\n"); - assert(false); - } +PEFTModelID FFModel::register_peft_model(LoraLinearConfig const peft_config) { + if (peft_config == LoraLinearConfig::EmptyConfig) { + fprintf(stderr, "Error: trying to register empty PEFT model\n"); + assert(false); + } + if (!config.enable_peft) { + fprintf(stderr, + "Error: trying to register PEFT model, but peft mode is not " + "enabled.\n"); + assert(false); } + PEFTModelID peft_model_id(peft_model_global_guid++); InferenceManager *im = InferenceManager::get_inference_manager(); std::vector peft_operators; @@ -2526,7 +2526,7 @@ PEFTModelID FFModel::register_peft_model(LoraLinearConfig const mlp_first, std::string layer_name = find_layer_name_from_guid(this, peft_operators[op]->layer_guid); switch (peft_operators[op]->op_type) { - case OP_LORA_MLP_FIRST: { + case OP_LORA: { if (mlp_first == LoraLinearConfig::DefaultConfig) { // Do nothing for the default configuration continue; From 50bee5418f2b5c5f52fd633e6703d6ca54a61154 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 23 Feb 2024 22:04:28 +0000 Subject: [PATCH 08/32] update --- include/flexflow/model.h | 12 ++- include/flexflow/ops/lora_linear.h | 3 + include/flexflow/ops/lora_linear_params.h | 1 + inference/incr_decoding/incr_decoding.cc | 2 +- src/ops/lora_linear.cc | 119 ++++++++++++++++------ 5 files changed, 99 insertions(+), 38 deletions(-) diff --git a/include/flexflow/model.h b/include/flexflow/model.h index 2dd0dbf686..85b72505f6 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -837,10 +837,7 @@ class FFModel { // ======================================== // PEFT Layers // ======================================== - void add_lora_layer(std::string 
target_module_name, char const *name); - void lora_linear(Tensor const input, - Tensor const output, - char const *name = nullptr); + PEFTModelID FFModel::add_lora_layer(LoraLinearConfig const peft_config); // ======================================== // Inference APIs // ======================================== @@ -1171,8 +1168,13 @@ class FFModel { std::vector layers; std::vector operators; - std::vector peft_operators; std::vector parameters; + // PEFT related + std::unordered_map base_layer_to_peft_layer; + std::unordered_map> peft_layer_to_peft_id; + std::unordered_map peft_configs; +// std::vector peft_operators; + FFHandler handlers[MAX_NUM_WORKERS]; Legion::Future current_metrics; // Cached operators: key: operator hash, value: operator pointer diff --git a/include/flexflow/ops/lora_linear.h b/include/flexflow/ops/lora_linear.h index b9aabdd1aa..0aa14f9d39 100644 --- a/include/flexflow/ops/lora_linear.h +++ b/include/flexflow/ops/lora_linear.h @@ -22,6 +22,7 @@ class LoraLinear : public Op { OperatorType type, ParallelTensor const input, ParallelTensor const output, + std::unordered_map const &_peft_configs, char const *name = nullptr); LoraLinear(FFModel &model, LoraLinear const &other, @@ -98,6 +99,8 @@ class LoraLinear : public Op { int num_inputs) const override; // size_t get_params_hash() const override; LoraLinearParams get_params() const; + + std::unordered_map peft_configs; }; }; // namespace FlexFlow diff --git a/include/flexflow/ops/lora_linear_params.h b/include/flexflow/ops/lora_linear_params.h index acbd9c3c67..dfc78d0683 100644 --- a/include/flexflow/ops/lora_linear_params.h +++ b/include/flexflow/ops/lora_linear_params.h @@ -42,6 +42,7 @@ class LoraLinearParams { public: LayerID layer_guid; OperatorType type; + std::unordered_map peft_configs; char name[MAX_OPNAME]; bool is_valid(std::pair const diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc index 502aa7fc6c..6d1af3c17c 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -279,7 +279,7 @@ void FlexFlow::top_level_task(Task const *task, // Add PEFT layer PEFTModelID peft_model_id = PEFTModelID::NO_ID; if (!peft_model_name.empty()) { - peft_model_id = model.register_peft_model(peft_config); + peft_model_id = model.add_lora_layer(peft_config); } // Start background server diff --git a/src/ops/lora_linear.cc b/src/ops/lora_linear.cc index 906bb91b6c..5281e0df65 100644 --- a/src/ops/lora_linear.cc +++ b/src/ops/lora_linear.cc @@ -38,13 +38,14 @@ using Legion::TaskLauncher; using namespace FlexFlow::Kernels::LoraLinear; -void FFModel::add_lora_layer(std::string target_module_name, - char const *name) { +PEFTModelID FFModel::add_lora_layer(LoraLinearConfig const peft_config) { + assert(config.enable_peft && "Cannot add a LoRA layer if PEFT mode is not enabled"); assert(target_module_name.length() > 0 && "LoRA target module name is empty"); - + // find target layer, and ensure uniqueness. // if the target layer already has a LoRA layer, no need to add it again (keep track of layers with lora) Layer *target_module = nullptr; + int idx; for (Layer *it : layers) { if (it->op_type == OP_LINEAR && it->name != nullptr && strlen(it->name) > 0) { std::string s(it->name); @@ -53,50 +54,69 @@ void FFModel::add_lora_layer(std::string target_module_name, if (target_module != nullptr) { fprintf(stderr, "Error, found two layers containing LoRA target module name '%s'. 
Layer 1: %s, Layer 2: %s\n", target_module_name.c_str(), target_module->name, it->name); + assert(false); } target_module = it; } } + idx++; } - lora_linear(target_module->inputs[0], target_module->outputs[0], name); -} - -void FFModel::lora_linear(Tensor const input, - Tensor const output, - char const *name) { - assert(input->data_type == output->data_type); - Layer *lora = nullptr; - lora = new Layer(this, - OP_LORA, - output->data_type, - name, - 2 /*inputs*/, - 0 /*weights*/, - 1 /*outputs*/, - input, - output); - { - int numdims = output->num_dims; - int dims[MAX_TENSOR_DIM]; - for (int i = 0; i < numdims; i++) { - dims[i] = output->dims[i]; + PEFTModelID peft_model_id(peft_model_global_guid++); + peft_configs[peft_model_id] = peft_config; + + Layer *peft_layer = nullptr; + if (base_layer_to_peft_layer.find(target_module) != base_layer_to_peft_layer.end()) { + // lora linear layer already added, no need to add again + peft_layer = base_layer_to_peft_layer[target_module]; + peft_layer_to_peft_id[peft_layer].push_back(peft_model_id); + } else { + Tensor const input = target_module->inputs[0]; + Tensor const output = target_module->outputs[0]; + assert(input->data_type == output->data_type); + std::string name_ = target_module->name + ".lora"; + Layer *peft_layer = new Layer(this, + OP_LORA, + output->data_type, + name.c_str(), + 2 /*inputs*/, + 0 /*weights*/, + 1 /*outputs*/, + input, + output); + { + int numdims = output->num_dims; + int dims[MAX_TENSOR_DIM]; + for (int i = 0; i < numdims; i++) { + dims[i] = output->dims[i]; + } + peft_layer->outputs[0] = create_tensor_legion_ordering( + numdims, dims, output->data_type, peft_layer, 0, true /*create_grad*/); } - lora->outputs[0] = create_tensor_legion_ordering( - numdims, dims, output->data_type, lora, 0, true /*create_grad*/); + layers.insert(layers.begin() + idx + 1, peft_layer); + + base_layer_to_peft_layer[target_module] = peft_layer; + peft_layer_to_peft_id[peft_layer] = std::vector(); + peft_layer_to_peft_id[peft_layer].push_back(peft_model_id); } - layers.push_back(lora); + return peft_model_id; } Op *LoraLinear::create_operator_from_layer( FFModel &model, Layer const *layer, std::vector const &inputs) { + std::unordered_map _peft_configs, + std::vector const &peft_ids = model.peft_layer_to_peft_id[layer]; + for (int i=0; ilayer_guid, layer->op_type, inputs[0], inputs[1], - layer->name); + _peft_configs, + layer->name);; } LoraLinear::LoraLinear(FFModel &model, @@ -104,7 +124,7 @@ LoraLinear::LoraLinear(FFModel &model, ParallelTensor const input, ParallelTensor const output) : LoraLinear( - model, other.layer_guid, other.op_type, input, output, other.name) {} + model, other.layer_guid, other.op_type, input, output, other.peft_configs, other.name) {} LoraLinear::LoraLinear(FFModel &model, Params const ¶ms, @@ -115,6 +135,7 @@ LoraLinear::LoraLinear(FFModel &model, params.type, inputs.first, inputs.second, + params.peft_configs, params.name) {} LoraLinear::LoraLinear(FFModel &model, @@ -122,6 +143,7 @@ LoraLinear::LoraLinear(FFModel &model, OperatorType _op_type, ParallelTensor const _input, ParallelTensor const _output, + std::unordered_map _peft_configs, char const *name) : Op(model, _op_type, @@ -151,6 +173,9 @@ LoraLinear::LoraLinear(FFModel &model, outputs[0] = model.create_parallel_tensor_legion_ordering( numdim, dims, inputs[1]->data_type, this); } + for (const auto& kv : _peft_configs) { + peft_configs.insert(kv); + } // assert(check_output_input_weight_parallel_dims(allocate_weights)); } @@ -783,7 +808,16 @@ bool 
LoraLinear::measure_operator_cost(Simulator *sim, } bool operator==(LoraLinearParams const &lhs, LoraLinearParams const &rhs) { - return lhs.layer_guid == rhs.layer_guid && lhs.type == rhs.type; + if (lhs.layer_guid == rhs.layer_guid && lhs.type == rhs.type && lhs.peft_configs.size() == rhs.peft_configs.size()) { + for (const auto& kv : lhs.peft_configs) { + auto it = rhs.peft_configs.find(kv.first); + if (it == rhs.peft_configs.end() || it->second != kv.second) { + return false; + } + } + return true; + } + return false; } void LoraLinear::serialize(Legion::Serializer &sez) const { @@ -791,6 +825,11 @@ void LoraLinear::serialize(Legion::Serializer &sez) const { sez.serialize(this->layer_guid.transformer_layer_id); sez.serialize(this->layer_guid.model_id); sez.serialize(this->op_type); + sez.serialize(this->peft_configs.size()); + for (const auto& kv : this->peft_configs) { + sez.serialize(kv.first); + sez.serialize(kv.second); + } sez.serialize(strlen(this->name)); sez.serialize(this->name, strlen(this->name)); } @@ -804,17 +843,28 @@ Node LoraLinear::deserialize(FFModel &ff, assert(num_inputs == 2); size_t id, transformer_layer_id, deserialized_model_id; OperatorType op_type; + size_t num_pefts; + PEFTModelID peft_model_id; + LoraLinearConfig peft_config; size_t name_len; char name[MAX_OPNAME] = {0}; + + LoraLinearParams params; + dez.deserialize(id); dez.deserialize(transformer_layer_id); dez.deserialize(deserialized_model_id); dez.deserialize(op_type); + dez.deserialize(num_pefts); + for (int i=0; iname != nullptr) { strcpy(params.name, this->name); } + params.peft_configs = this->peft_configs; return params; } @@ -853,6 +904,10 @@ size_t hash::operator()( hash_combine(key, params.layer_guid.id); hash_combine(key, params.layer_guid.transformer_layer_id); hash_combine(key, params.layer_guid.model_id); + for (const auto& kv : params.peft_configs) { + hash_combine(key, kv.first); + hash_combine(key, kv.second); + } return key; } }; // namespace std From d2ad61a8e43d9f9f27e1263db2ec060a2a8b4817 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 23 Feb 2024 22:32:31 +0000 Subject: [PATCH 09/32] . 
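
Fold PEFT weight loading into LoraLinear::init_task: the weights for every
LoraLinearConfig attached to a LoraLinear layer are now loaded when the
operator is initialized, so FFModel::register_peft_model,
LoraLinear::register_peft_model, the register_model_task handler and the
LORA_LINEAR_REG_TASK_ID task variant can all be removed.

After this change an adapter is attached through FFModel::add_lora_layer
alone. A minimal usage sketch in the spirit of
inference/incr_decoding/incr_decoding.cc (the cache folder path and adapter
name are placeholders supplied on the command line):

    // hypothetical driver snippet; peft_model_name comes from -peft-model
    LoraLinearConfig peft_config =
        peft_model_name.empty()
            ? LoraLinearConfig::EmptyConfig
            : LoraLinearConfig(file_paths.cache_folder_path, peft_model_name);
    PEFTModelID peft_model_id = PEFTModelID::NO_ID;
    if (!peft_model_name.empty()) {
      // inserts a LoRA module after every target linear layer of the base model
      peft_model_id = model.add_lora_layer(peft_config);
    }
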
--- include/flexflow/model.h | 2 - include/flexflow/ops/lora_linear.h | 10 -- src/ops/lora_linear.cc | 244 ++++++++++++----------------- src/runtime/model.cc | 16 -- src/runtime/request_manager.cc | 91 ----------- 5 files changed, 97 insertions(+), 266 deletions(-) diff --git a/include/flexflow/model.h b/include/flexflow/model.h index 85b72505f6..cae888784c 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -843,8 +843,6 @@ class FFModel { // ======================================== std::vector generate(std::vector const &requests); - PEFTModelID register_peft_model(LoraLinearConfig const peft_config); - Tensor create_tensor_legion_ordering(int num_dim, int const dims[], DataType data_type, diff --git a/include/flexflow/ops/lora_linear.h b/include/flexflow/ops/lora_linear.h index 0aa14f9d39..579d6f06a8 100644 --- a/include/flexflow/ops/lora_linear.h +++ b/include/flexflow/ops/lora_linear.h @@ -40,11 +40,6 @@ class LoraLinear : public Op { MachineView const *mv = nullptr) override; void forward(FFModel const &) override; void backward(FFModel const &) override; - void register_peft_model(FFModel const &ff, - std::vector const &batch_inputs, - std::vector const &batch_outputs, - PEFTModelID const &model_id, - LoraLinearConfig const lora_config); Legion::FutureMap inference(FFModel const &, BatchConfigFuture const &, std::vector const &, @@ -65,11 +60,6 @@ class LoraLinear : public Op { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); - static void - register_model_task(Legion::Task const *task, - std::vector const ®ions, - Legion::Context ctx, - Legion::Runtime *runtime); static void inference_task(Legion::Task const *task, std::vector const ®ions, Legion::Context ctx, diff --git a/src/ops/lora_linear.cc b/src/ops/lora_linear.cc index 5281e0df65..2a9f83e11d 100644 --- a/src/ops/lora_linear.cc +++ b/src/ops/lora_linear.cc @@ -266,6 +266,103 @@ OpMeta *LoraLinear::init_task(Task const *task, std::strcpy(m->op_name, lora->name); m->layer_guid = lora->layer_guid; + int shard_id = task->index_point.point_data[0]; + int num_dims = lora->inputs[0]->num_dims; + int in_dim = lora->inputs[0]->dims[0].size / lora->inputs[0]->dims[0].degree; + int out_dim = lora->inputs[1]->dims[0].size / lora->inputs[1]->dims[0].degree; + + DataType dt = m->input_type[0]; + assert(dt == m->input_type[1]); + assert(dt == m->output_type[0]); + assert(dt == lora->inputs[0]->data_type); + assert(dt == lora->inputs[1]->data_type); + assert(dt == lora->outputs[0]->data_type); + + // get layer name + assert(lora->name != nullptr && "Layer name is not set, cannot determine weights location"); + std::string lora_layername = std::string(lora->name); + std::string searchString = "lora"; + size_t found = lora_layername.find(searchString); + if (found == std::string::npos) { + std::cout << "LoraLinear layer name not in the right format (does not " + "contain word 'lora')" + << std::endl; + assert(false); + } + std::string lora_layername_substr = + lora_layername.substr(0, found + searchString.length()); + + for (const auto& kv : lora->peft_configs) { + PEFTModelID &model_id = kv.first; + LoraLinearConfig &lora_config = kv.second; + + int rank = lora_config.rank; + + int w0_num_elements = rank * in_dim; + int w1_num_elements = rank * out_dim; + + LoraLinearWeight weight; + weight.in_dim = in_dim; + weight.out_dim = out_dim; + weight.rank = rank; + PEFTWeightAllocator *allocator = m->handle.peft_weight_allocator; + weight.w0_ptr = allocator->allocate_local_weights_untyped(model_id, 
w0_num_elements * data_type_size(dt)); + weight.w1_ptr = allocator->allocate_local_weights_untyped(model_id, w1_num_elements * data_type_size(dt)); + + // load weights from file + std::string weights_folder_filepath = join_path({ + lora_config.cache_folder, + "weights", + lora_config.peft_model_id, + dt == DT_FLOAT ? "full-precision" : "half-precision", + }); + std::string w0_filepath = + join_path({weights_folder_filepath, lora_layername_substr + "_A_weight"}); + std::string w1_filepath = + join_path({weights_folder_filepath, lora_layername_substr + "_B_weight"}); + if (dt == DT_FLOAT) { + std::cout << "Loading LORA weight " << lora_layername_substr + "_A_weight" + << ", size: " << w0_num_elements << ", shard: " << shard_id + << std::endl; + load_peft_from_file( + (float *)weight.w0_ptr, w0_num_elements, true, shard_id, w0_filepath); + std::cout << "Loading LORA weight " << lora_layername_substr + "_B_weight" + << ", size: " << w1_num_elements << ", shard: " << shard_id + << std::endl; + load_peft_from_file( + (float *)weight.w1_ptr, w1_num_elements, false, shard_id, w1_filepath); + } else if (dt == DT_HALF) { + std::cout << "Loading LORA weight " << lora_layername_substr + "_A_weight" + << ", size: " << w0_num_elements << ", shard: " << shard_id + << std::endl; + load_peft_from_file( + (half *)weight.w0_ptr, w0_num_elements, true, shard_id, w0_filepath); + std::cout << "Loading LORA weight " << lora_layername_substr + "_B_weight" + << ", size: " << w1_num_elements << ", shard: " << shard_id + << std::endl; + load_peft_from_file( + (half *)weight.w1_ptr, w1_num_elements, false, shard_id, w1_filepath); + } else { + assert(false && "Data type not supported"); + } + + if (lora->inputs[0]->dims[num_dims - 1].degree == 1) { + // Input is partitioned (no replication) + // w0_grad is local weight gradients + weight.w0_grad_ptr = allocator->allocate_local_weights_untyped(model_id, w0_num_elements * data_type_size(dt)); + // w1_grad is sync weight gradients + weight.w1_grad_ptr = allocator->allocate_sync_weights_untyped(model_id, w1_num_elements * data_type_size(dt)); + } else { + // Input is replicated + // w0_grad is sync weight gradients + weight.w0_grad_ptr = allocator->allocate_sync_weights_untyped(model_id, w0_num_elements * data_type_size(dt)); + // w1_grad is local weight gradients + weight.w1_grad_ptr = allocator->allocate_local_weights_untyped(model_id, w1_num_elements * data_type_size(dt)); + } + assert(m->model_weights.find(model_id) == m->model_weights.end()); + m->model_weights[model_id] = weight; + } + return m; } @@ -275,45 +372,6 @@ struct LoraLinearRegisterInfo { LoraLinearConfig lora_config; }; -void LoraLinear::register_peft_model( - FFModel const &ff, - std::vector const &batch_inputs, - std::vector const &batch_outputs, - PEFTModelID const &model_id, - LoraLinearConfig const lora_config) { - assert(check_output_input_weight_same_parallel_is()); - assert(batch_inputs.size() == 2); - assert(batch_outputs.size() == 1); - // Assert that the output and the second input are mapped to the same - // region/part - assert(batch_outputs[0]->region == batch_inputs[1]->region); - assert(batch_outputs[0]->part == batch_inputs[1]->part); - // assert(check_output_input_weight_same_machine_view()); - // output is considered as an input to allow in-place optimization - ParallelTensor output_tensor = batch_outputs[0]; - parallel_is = output_tensor->parallel_is; - ArgumentMap argmap; - Context ctx = ff.config.lg_ctx; - Runtime *runtime = ff.config.lg_hlr; - MachineView const *view = 
&output_tensor->machine_view; - size_t machine_view_hash = view->hash(); - set_argumentmap_for_inference(ff, argmap, output_tensor); - LoraLinearRegisterInfo info; - info.lora = this; - info.model_id = model_id; - info.lora_config = lora_config; - IndexLauncher launcher(LORA_LINEAR_REG_TASK_ID, - parallel_is, - TaskArgument(&info, sizeof(LoraLinearRegisterInfo)), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, - machine_view_hash); - FutureMap fm = runtime->execute_index_space(ctx, launcher); - fm.wait_all_results(); -} - template void load_peft_from_file( DT *ptr, size_t size, bool sharded, int shard_id, std::string filepath) { @@ -340,114 +398,6 @@ void load_peft_from_file( in.close(); } -void LoraLinear::register_model_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - LoraLinearRegisterInfo const *info = - static_cast(task->args); - LoraLinearMeta *m = *((LoraLinearMeta **)task->local_args); - LoraLinear const *lora = info->lora; - - int shard_id = task->index_point.point_data[0]; - - int rank = info->lora_config.rank; - int num_dims = lora->inputs[0]->num_dims; - int in_dim = lora->inputs[0]->dims[0].size / lora->inputs[0]->dims[0].degree; - int out_dim = lora->inputs[1]->dims[0].size / lora->inputs[1]->dims[0].degree; - int w0_num_elements = rank * in_dim; - int w1_num_elements = rank * out_dim; - - DataType dt = m->input_type[0]; - assert(dt == m->input_type[1]); - assert(dt == m->output_type[0]); - assert(dt == lora->inputs[0]->data_type); - assert(dt == lora->inputs[1]->data_type); - assert(dt == lora->outputs[0]->data_type); - assert(m->model_weights.find(info->model_id) == m->model_weights.end()); - - LoraLinearWeight weight; - weight.in_dim = in_dim; - weight.out_dim = out_dim; - weight.rank = rank; - PEFTWeightAllocator *allocator = m->handle.peft_weight_allocator; - weight.w0_ptr = allocator->allocate_local_weights_untyped( - info->model_id, w0_num_elements * data_type_size(dt)); - weight.w1_ptr = allocator->allocate_local_weights_untyped( - info->model_id, w1_num_elements * data_type_size(dt)); - - // get layer name - assert(lora->name != nullptr && - "Layer name is not set, cannot determine weights location"); - std::string lora_layername = std::string(lora->name); - std::string searchString = "lora"; - size_t found = lora_layername.find(searchString); - if (found == std::string::npos) { - std::cout << "LoraLinear layer name not in the right format (does not " - "contain word 'lora')" - << std::endl; - assert(false); - } - std::string lora_layername_substr = - lora_layername.substr(0, found + searchString.length()); - - // load weights from file - std::string weights_folder_filepath = join_path({ - info->lora_config.cache_folder, - "weights", - info->lora_config.peft_model_id, - dt == DT_FLOAT ? 
"full-precision" : "half-precision", - }); - std::string w0_filepath = - join_path({weights_folder_filepath, lora_layername_substr + "_A_weight"}); - std::string w1_filepath = - join_path({weights_folder_filepath, lora_layername_substr + "_B_weight"}); - if (dt == DT_FLOAT) { - std::cout << "Loading LORA weight " << lora_layername_substr + "_A_weight" - << ", size: " << w0_num_elements << ", shard: " << shard_id - << std::endl; - load_peft_from_file( - (float *)weight.w0_ptr, w0_num_elements, true, shard_id, w0_filepath); - std::cout << "Loading LORA weight " << lora_layername_substr + "_B_weight" - << ", size: " << w1_num_elements << ", shard: " << shard_id - << std::endl; - load_peft_from_file( - (float *)weight.w1_ptr, w1_num_elements, false, shard_id, w1_filepath); - } else if (dt == DT_HALF) { - std::cout << "Loading LORA weight " << lora_layername_substr + "_A_weight" - << ", size: " << w0_num_elements << ", shard: " << shard_id - << std::endl; - load_peft_from_file( - (half *)weight.w0_ptr, w0_num_elements, true, shard_id, w0_filepath); - std::cout << "Loading LORA weight " << lora_layername_substr + "_B_weight" - << ", size: " << w1_num_elements << ", shard: " << shard_id - << std::endl; - load_peft_from_file( - (half *)weight.w1_ptr, w1_num_elements, false, shard_id, w1_filepath); - } else { - assert(false && "Data type not supported"); - } - - if (lora->inputs[0]->dims[num_dims - 1].degree == 1) { - // Input is partitioned (no replication) - // w0_grad is local weight gradients - weight.w0_grad_ptr = allocator->allocate_local_weights_untyped( - info->model_id, w0_num_elements * data_type_size(dt)); - // w1_grad is sync weight gradients - weight.w1_grad_ptr = allocator->allocate_sync_weights_untyped( - info->model_id, w1_num_elements * data_type_size(dt)); - } else { - // Input is replicated - // w0_grad is sync weight gradients - weight.w0_grad_ptr = allocator->allocate_sync_weights_untyped( - info->model_id, w0_num_elements * data_type_size(dt)); - // w1_grad is local weight gradients - weight.w1_grad_ptr = allocator->allocate_local_weights_untyped( - info->model_id, w1_num_elements * data_type_size(dt)); - } - m->model_weights[info->model_id] = weight; -} - void LoraLinear::forward(FFModel const &ff) { assert(false && "LoraLinear does not support normal init"); } diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 92340a92db..ed5581ddd1 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -6697,22 +6697,6 @@ void register_flexflow_internal_tasks(Runtime *runtime, registrar); } } - { - TaskVariantRegistrar registrar(LORA_LINEAR_REG_TASK_ID, - "LoraLinear Model Registration"); - registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); - registrar.set_leaf(); - if (pre_register) { - Runtime::preregister_task_variant( - registrar, "LoraLinear Model Registration Task"); - } else { - if (enable_control_replication) { - registrar.global_registration = false; - } - runtime->register_task_variant( - registrar); - } - } { TaskVariantRegistrar registrar(LORA_LINEAR_INF_TASK_ID, "LoraLinear Inference"); diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 3d71fa1e6b..e9e21df52a 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -2495,97 +2495,6 @@ bool is_peft_operator_type(OperatorType type) { } } -PEFTModelID FFModel::register_peft_model(LoraLinearConfig const peft_config) { - if (peft_config == LoraLinearConfig::EmptyConfig) { - fprintf(stderr, "Error: trying to register empty PEFT model\n"); 
- assert(false); - } - if (!config.enable_peft) { - fprintf(stderr, - "Error: trying to register PEFT model, but peft mode is not " - "enabled.\n"); - assert(false); - } - - PEFTModelID peft_model_id(peft_model_global_guid++); - InferenceManager *im = InferenceManager::get_inference_manager(); - std::vector peft_operators; - for (size_t op = 0; op < operators.size(); op++) { - if (is_peft_operator_type(operators[op]->op_type)) { - peft_operators.push_back(operators[op]); - } else if (operators[op]->op_type == OP_FUSED) { - FusedOp *fused = static_cast(operators[op]); - for (size_t op2 = 0; op2 < fused->numOperators; op2++) { - if (is_peft_operator_type(fused->operators[op2]->op_type)) { - peft_operators.push_back(fused->operators[op2]); - } - } - } - } - for (size_t op = 0; op < peft_operators.size(); op++) { - std::string layer_name = - find_layer_name_from_guid(this, peft_operators[op]->layer_guid); - switch (peft_operators[op]->op_type) { - case OP_LORA: { - if (mlp_first == LoraLinearConfig::DefaultConfig) { - // Do nothing for the default configuration - continue; - } - LoraLinear *lora = static_cast(peft_operators[op]); - // Currently assume only a single data pipeline - assert(config.data_parallelism_degree == 1); - std::vector inputs(lora->numInputs); - std::vector outputs(lora->numOutputs); - - for (int i = 0; i < lora->numInputs; i++) { - assert(im->tensor_buffer.find(lora->inputs[i]) != - im->tensor_buffer.end()); - assert(lora->inputs[i] != nullptr); - assert(lora->inputs[i]->parallel_is != IndexSpace::NO_SPACE); - assert(im->tensor_buffer[lora->inputs[i]].size() == 1); - inputs[i] = im->tensor_buffer[lora->inputs[i]][0]; - assert(inputs[i]->parallel_is != IndexSpace::NO_SPACE); - } - assert(lora->numOutputs == 1); - outputs[0] = inputs[1]; - lora->register_peft_model( - *this, inputs, outputs, peft_model_id, mlp_first); - break; - } - case OP_LORA_MLP_SECOND: { - if (mlp_second == LoraLinearConfig::DefaultConfig) { - // Do nothing for the default configuration - continue; - } - LoraLinear *lora = static_cast(peft_operators[op]); - // Currently assume only a single data pipeline - assert(config.data_parallelism_degree == 1); - std::vector inputs(lora->numInputs); - std::vector outputs(lora->numOutputs); - - for (int i = 0; i < lora->numInputs; i++) { - assert(im->tensor_buffer.find(lora->inputs[i]) != - im->tensor_buffer.end()); - assert(lora->inputs[i] != nullptr); - assert(lora->inputs[i]->parallel_is != IndexSpace::NO_SPACE); - assert(im->tensor_buffer[lora->inputs[i]].size() == 1); - inputs[i] = im->tensor_buffer[lora->inputs[i]][0]; - assert(inputs[i]->parallel_is != IndexSpace::NO_SPACE); - } - assert(lora->numOutputs == 1); - outputs[0] = inputs[1]; - lora->register_peft_model( - *this, inputs, outputs, peft_model_id, mlp_second); - break; - } - default: { - assert(false && "Unsupported PEFT Operator type"); - } - } - } - return peft_model_id; -} - /*static*/ void RequestManager::serve_incr_decoding(FFModel *llm) { Context ctx = llm->config.lg_ctx; From b6539aaa6f5d50ff7f55c78d4abceb0a4af0d129 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 23 Feb 2024 23:17:26 +0000 Subject: [PATCH 10/32] fixes --- include/flexflow/model.h | 2 +- src/ops/lora_linear.cc | 185 +++++++++++++++++----------------- src/ops/lora_linear_params.cc | 6 +- 3 files changed, 96 insertions(+), 97 deletions(-) diff --git a/include/flexflow/model.h b/include/flexflow/model.h index cae888784c..36aaec30bc 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -837,7 +837,7 @@ 
class FFModel { // ======================================== // PEFT Layers // ======================================== - PEFTModelID FFModel::add_lora_layer(LoraLinearConfig const peft_config); + PEFTModelID add_lora_layer(LoraLinearConfig const peft_config); // ======================================== // Inference APIs // ======================================== diff --git a/src/ops/lora_linear.cc b/src/ops/lora_linear.cc index 2a9f83e11d..3f8bdb98ba 100644 --- a/src/ops/lora_linear.cc +++ b/src/ops/lora_linear.cc @@ -40,64 +40,70 @@ using namespace FlexFlow::Kernels::LoraLinear; PEFTModelID FFModel::add_lora_layer(LoraLinearConfig const peft_config) { assert(config.enable_peft && "Cannot add a LoRA layer if PEFT mode is not enabled"); - assert(target_module_name.length() > 0 && "LoRA target module name is empty"); - - // find target layer, and ensure uniqueness. - // if the target layer already has a LoRA layer, no need to add it again (keep track of layers with lora) - Layer *target_module = nullptr; - int idx; - for (Layer *it : layers) { - if (it->op_type == OP_LINEAR && it->name != nullptr && strlen(it->name) > 0) { - std::string s(it->name); - if (s.find(target_module_name) != string::npos) { - // Check that this is the only layer with target name - if (target_module != nullptr) { - fprintf(stderr, "Error, found two layers containing LoRA target module name '%s'. Layer 1: %s, Layer 2: %s\n", - target_module_name.c_str(), target_module->name, it->name); - assert(false); - } - target_module = it; - } - } - idx++; + if (peft_config.target_modules.size() == 0) { + printf("PEFT config does not contain any target module\n"); + return PEFTModelID::NO_ID; } PEFTModelID peft_model_id(peft_model_global_guid++); peft_configs[peft_model_id] = peft_config; - Layer *peft_layer = nullptr; - if (base_layer_to_peft_layer.find(target_module) != base_layer_to_peft_layer.end()) { - // lora linear layer already added, no need to add again - peft_layer = base_layer_to_peft_layer[target_module]; - peft_layer_to_peft_id[peft_layer].push_back(peft_model_id); - } else { - Tensor const input = target_module->inputs[0]; - Tensor const output = target_module->outputs[0]; - assert(input->data_type == output->data_type); - std::string name_ = target_module->name + ".lora"; - Layer *peft_layer = new Layer(this, - OP_LORA, - output->data_type, - name.c_str(), - 2 /*inputs*/, - 0 /*weights*/, - 1 /*outputs*/, - input, - output); - { - int numdims = output->num_dims; - int dims[MAX_TENSOR_DIM]; - for (int i = 0; i < numdims; i++) { - dims[i] = output->dims[i]; + for (std::string target_module_name : peft_config.target_modules) { + assert(target_module_name.length() > 0 && "LoRA target module name is empty"); + // find target layer, and ensure uniqueness. + // if the target layer already has a LoRA layer, no need to add it again (keep track of layers with lora) + Layer *target_module = nullptr; + int idx; + for (Layer *it : layers) { + if (it->op_type == OP_LINEAR && it->name != nullptr && strlen(it->name) > 0) { + std::string s(it->name); + if (s.find(target_module_name) != std::string::npos) { + // Check that this is the only layer with target name + if (target_module != nullptr) { + fprintf(stderr, "Error, found two layers containing LoRA target module name '%s'. 
Layer 1: %s, Layer 2: %s\n", + target_module_name.c_str(), target_module->name, it->name); + assert(false); + } + target_module = it; + } } - peft_layer->outputs[0] = create_tensor_legion_ordering( - numdims, dims, output->data_type, peft_layer, 0, true /*create_grad*/); + idx++; + } + Layer *peft_layer = nullptr; + if (base_layer_to_peft_layer.find(target_module) != base_layer_to_peft_layer.end()) { + // lora linear layer already added, no need to add again + peft_layer = base_layer_to_peft_layer[target_module]; + peft_layer_to_peft_id[peft_layer].push_back(peft_model_id); + } else { + Tensor const input = target_module->inputs[0]; + Tensor const output = target_module->outputs[0]; + assert(input->data_type == output->data_type); + std::string name_ = target_module->name ? std::string(target_module->name) : std::string(""); + name_ += ".lora"; + Layer *peft_layer = new Layer(this, + OP_LORA, + output->data_type, + name_.c_str(), + 2 /*inputs*/, + 0 /*weights*/, + 1 /*outputs*/, + input, + output); + { + int numdims = output->num_dims; + int dims[MAX_TENSOR_DIM]; + for (int i = 0; i < numdims; i++) { + dims[i] = output->dims[i]; + } + peft_layer->outputs[0] = create_tensor_legion_ordering( + numdims, dims, output->data_type, peft_layer, 0, true /*create_grad*/); + } + layers.insert(layers.begin() + idx + 1, peft_layer); + base_layer_to_peft_layer[target_module] = peft_layer; + peft_layer_to_peft_id[peft_layer] = std::vector(); + peft_layer_to_peft_id[peft_layer].push_back(peft_model_id); } - layers.insert(layers.begin() + idx + 1, peft_layer); - - base_layer_to_peft_layer[target_module] = peft_layer; - peft_layer_to_peft_id[peft_layer] = std::vector(); - peft_layer_to_peft_id[peft_layer].push_back(peft_model_id); } + return peft_model_id; } @@ -105,8 +111,8 @@ Op *LoraLinear::create_operator_from_layer( FFModel &model, Layer const *layer, std::vector const &inputs) { - std::unordered_map _peft_configs, - std::vector const &peft_ids = model.peft_layer_to_peft_id[layer]; + std::unordered_map _peft_configs; + std::vector const &peft_ids = model.peft_layer_to_peft_id[(Layer*)layer]; for (int i=0; i _peft_configs, + std::unordered_map const &_peft_configs, char const *name) : Op(model, _op_type, @@ -230,6 +236,32 @@ void LoraLinear::init_inference( set_opmeta_from_futuremap_inference(ff, fm, output_tensor); } +template +void load_peft_from_file( + DT *ptr, size_t size, bool sharded, int shard_id, std::string filepath) { + std::ifstream in(filepath, std::ios::in | std::ios::binary); + if (!in.good()) { + printf("Could not open file: %s\n", filepath.c_str()); + } + assert(in.good() && "incorrect weight file path"); + std::vector
host_array(size); + size_t target_data_size = sizeof(DT) * size; + in.seekg(sharded * shard_id * target_data_size, in.beg); + in.read((char *)host_array.data(), target_data_size); + + size_t in_get_size = in.gcount(); + if (in_get_size != target_data_size) { + printf("load weight data error: %lu, %lu, %lu\n", + in_get_size, + target_data_size, + sizeof(DT)); + assert(false); + } + assert(size == host_array.size()); + copy_tensor_host_to_dev(ptr, host_array.data(), size); + in.close(); +} + /* regions[0](O): output regions[1](I): kernel @@ -268,8 +300,8 @@ OpMeta *LoraLinear::init_task(Task const *task, int shard_id = task->index_point.point_data[0]; int num_dims = lora->inputs[0]->num_dims; - int in_dim = lora->inputs[0]->dims[0].size / lora->inputs[0]->dims[0].degree; - int out_dim = lora->inputs[1]->dims[0].size / lora->inputs[1]->dims[0].degree; + assert(in_dim == lora->inputs[0]->dims[0].size / lora->inputs[0]->dims[0].degree); + assert(out_dim == lora->inputs[1]->dims[0].size / lora->inputs[1]->dims[0].degree); DataType dt = m->input_type[0]; assert(dt == m->input_type[1]); @@ -293,8 +325,8 @@ OpMeta *LoraLinear::init_task(Task const *task, lora_layername.substr(0, found + searchString.length()); for (const auto& kv : lora->peft_configs) { - PEFTModelID &model_id = kv.first; - LoraLinearConfig &lora_config = kv.second; + PEFTModelID const &model_id = kv.first; + LoraLinearConfig const &lora_config = kv.second; int rank = lora_config.rank; @@ -311,8 +343,7 @@ OpMeta *LoraLinear::init_task(Task const *task, // load weights from file std::string weights_folder_filepath = join_path({ - lora_config.cache_folder, - "weights", + lora_config.config_folder, lora_config.peft_model_id, dt == DT_FLOAT ? "full-precision" : "half-precision", }); @@ -366,38 +397,6 @@ OpMeta *LoraLinear::init_task(Task const *task, return m; } -struct LoraLinearRegisterInfo { - LoraLinear const *lora; - PEFTModelID model_id; - LoraLinearConfig lora_config; -}; - -template -void load_peft_from_file( - DT *ptr, size_t size, bool sharded, int shard_id, std::string filepath) { - std::ifstream in(filepath, std::ios::in | std::ios::binary); - if (!in.good()) { - printf("Could not open file: %s\n", filepath.c_str()); - } - assert(in.good() && "incorrect weight file path"); - std::vector
host_array(size); - size_t target_data_size = sizeof(DT) * size; - in.seekg(sharded * shard_id * target_data_size, in.beg); - in.read((char *)host_array.data(), target_data_size); - - size_t in_get_size = in.gcount(); - if (in_get_size != target_data_size) { - printf("load weight data error: %lu, %lu, %lu\n", - in_get_size, - target_data_size, - sizeof(DT)); - assert(false); - } - assert(size == host_array.size()); - copy_tensor_host_to_dev(ptr, host_array.data(), size); - in.close(); -} - void LoraLinear::forward(FFModel const &ff) { assert(false && "LoraLinear does not support normal init"); } @@ -761,7 +760,7 @@ bool operator==(LoraLinearParams const &lhs, LoraLinearParams const &rhs) { if (lhs.layer_guid == rhs.layer_guid && lhs.type == rhs.type && lhs.peft_configs.size() == rhs.peft_configs.size()) { for (const auto& kv : lhs.peft_configs) { auto it = rhs.peft_configs.find(kv.first); - if (it == rhs.peft_configs.end() || it->second != kv.second) { + if (it == rhs.peft_configs.end() || !(it->second == kv.second)) { return false; } } diff --git a/src/ops/lora_linear_params.cc b/src/ops/lora_linear_params.cc index 0edeb03d2f..595743ac33 100644 --- a/src/ops/lora_linear_params.cc +++ b/src/ops/lora_linear_params.cc @@ -75,9 +75,9 @@ std::ostream &operator<<(std::ostream &os, LoraLinearConfig const &llc) { os << "lora_alpha: " << llc.lora_alpha << ", "; os << "lora_dropout: " << llc.lora_dropout << ", "; os << "target_modules: ["; - for (int i=0; i Date: Fri, 23 Feb 2024 23:25:14 +0000 Subject: [PATCH 11/32] fix --- src/runtime/model.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/src/runtime/model.cc b/src/runtime/model.cc index ed5581ddd1..63016d0c8b 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -3311,7 +3311,6 @@ Op *FFModel::create_operator_from_layer( case OP_LORA: { Op *op = LoraLinear::create_operator_from_layer(*this, layer, inputs); operators.push_back(op); - peft_operators.push_back(op); return op; } default: From 484a3cb2a08caa91c8ac110b75f92f92c91a11a9 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sun, 3 Mar 2024 02:04:12 +0000 Subject: [PATCH 12/32] fix build --- include/flexflow/model.h | 8 +- include/flexflow/ops/lora_linear.h | 15 +- inference/incr_decoding/incr_decoding.cc | 8 +- inference/models/llama.cc | 3 +- inference/models/opt.cc | 3 +- src/ops/lora_linear.cc | 241 ++++++++++++++++------- src/ops/lora_linear_params.cc | 15 +- 7 files changed, 203 insertions(+), 90 deletions(-) diff --git a/include/flexflow/model.h b/include/flexflow/model.h index 36aaec30bc..74421ffc92 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -1168,11 +1168,11 @@ class FFModel { std::vector operators; std::vector parameters; // PEFT related - std::unordered_map base_layer_to_peft_layer; - std::unordered_map> peft_layer_to_peft_id; + std::unordered_map base_layer_to_peft_layer; + std::unordered_map> peft_layer_to_peft_id; std::unordered_map peft_configs; -// std::vector peft_operators; - + // std::vector peft_operators; + FFHandler handlers[MAX_NUM_WORKERS]; Legion::Future current_metrics; // Cached operators: key: operator hash, value: operator pointer diff --git a/include/flexflow/ops/lora_linear.h b/include/flexflow/ops/lora_linear.h index 579d6f06a8..9e83c3f90e 100644 --- a/include/flexflow/ops/lora_linear.h +++ b/include/flexflow/ops/lora_linear.h @@ -17,13 +17,14 @@ class LoraLinear : public Op { using Params = LoraLinearParams; using Input = std::pair; - LoraLinear(FFModel &model, - LayerID const &layer_guid, - OperatorType type, - 
ParallelTensor const input, - ParallelTensor const output, - std::unordered_map const &_peft_configs, - char const *name = nullptr); + LoraLinear( + FFModel &model, + LayerID const &layer_guid, + OperatorType type, + ParallelTensor const input, + ParallelTensor const output, + std::unordered_map const &_peft_configs, + char const *name = nullptr); LoraLinear(FFModel &model, LoraLinear const &other, ParallelTensor const input, diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc index 6d1af3c17c..e7d4cf16fb 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -160,6 +160,7 @@ void FlexFlow::top_level_task(Task const *task, use_full_precision, verbose, do_sample, + enable_peft, temperature, topp, max_requests_per_batch, @@ -191,7 +192,7 @@ void FlexFlow::top_level_task(Task const *task, std::cout << "PEFT model id passed, but PEFT is not enabled" << std::endl; assert(false); } - + json model_config = json::parse(config_file_handle, /*parser_callback_t */ nullptr, /*allow_exceptions */ true, @@ -227,7 +228,10 @@ void FlexFlow::top_level_task(Task const *task, "Invalid LLM model type passed (or no type was passed)."); // load PEFT config - LoraLinearConfig peft_config = peft_model_name.empty() ? LoraLinearConfig::EmptyConfig : LoraLinearConfig(file_paths.cache_folder_path, peft_model_name); + LoraLinearConfig peft_config = + peft_model_name.empty() + ? LoraLinearConfig::EmptyConfig + : LoraLinearConfig(file_paths.cache_folder_path, peft_model_name); GenerationConfig generationConfig(do_sample, temperature, topp); RequestManager *rm = RequestManager::get_request_manager(); diff --git a/inference/models/llama.cc b/inference/models/llama.cc index fd788fa904..4be232e81b 100644 --- a/inference/models/llama.cc +++ b/inference/models/llama.cc @@ -224,7 +224,8 @@ void LLAMA::create_llama_model(FFModel &ff, 0.0f, std::string("layers." + std::to_string(i) + ".mlp.down_proj").c_str()); // Low-Rank Adapter (LoRA) for the second linear layer - // ff.lora_linear(std::string("down_proj"), std::string("layers." + std::to_string(i) + ".mlp.down_proj.lora").c_str()); + // ff.lora_linear(std::string("down_proj"), std::string("layers." + + // std::to_string(i) + ".mlp.down_proj.lora").c_str()); } // final normalization and linear Tensor final_rms_norm_output[2] = {nullptr, nullptr}; diff --git a/inference/models/opt.cc b/inference/models/opt.cc index bc22e1a8b7..b3f2ef4e17 100644 --- a/inference/models/opt.cc +++ b/inference/models/opt.cc @@ -220,7 +220,8 @@ void OPT::create_opt_model(FFModel &ff, 0.0f, std::string("layers." + std::to_string(i) + ".fc2").c_str()); // Low-Rank Adapter (LoRA) for the second linear layer - // ff.lora_linear(std::string("fc2"), std::string("layers." + std::to_string(i) + ".fc2.lora").c_str()); + // ff.lora_linear(std::string("fc2"), std::string("layers." 
+ + // std::to_string(i) + ".fc2.lora").c_str()); } // final diff --git a/src/ops/lora_linear.cc b/src/ops/lora_linear.cc index 3f8bdb98ba..8a54709df6 100644 --- a/src/ops/lora_linear.cc +++ b/src/ops/lora_linear.cc @@ -39,7 +39,8 @@ using Legion::TaskLauncher; using namespace FlexFlow::Kernels::LoraLinear; PEFTModelID FFModel::add_lora_layer(LoraLinearConfig const peft_config) { - assert(config.enable_peft && "Cannot add a LoRA layer if PEFT mode is not enabled"); + assert(config.enable_peft && + "Cannot add a LoRA layer if PEFT mode is not enabled"); if (peft_config.target_modules.size() == 0) { printf("PEFT config does not contain any target module\n"); return PEFTModelID::NO_ID; @@ -48,19 +49,26 @@ PEFTModelID FFModel::add_lora_layer(LoraLinearConfig const peft_config) { peft_configs[peft_model_id] = peft_config; for (std::string target_module_name : peft_config.target_modules) { - assert(target_module_name.length() > 0 && "LoRA target module name is empty"); + assert(target_module_name.length() > 0 && + "LoRA target module name is empty"); // find target layer, and ensure uniqueness. - // if the target layer already has a LoRA layer, no need to add it again (keep track of layers with lora) + // if the target layer already has a LoRA layer, no need to add it again + // (keep track of layers with lora) Layer *target_module = nullptr; int idx; for (Layer *it : layers) { - if (it->op_type == OP_LINEAR && it->name != nullptr && strlen(it->name) > 0) { + if (it->op_type == OP_LINEAR && it->name != nullptr && + strlen(it->name) > 0) { std::string s(it->name); if (s.find(target_module_name) != std::string::npos) { // Check that this is the only layer with target name if (target_module != nullptr) { - fprintf(stderr, "Error, found two layers containing LoRA target module name '%s'. Layer 1: %s, Layer 2: %s\n", - target_module_name.c_str(), target_module->name, it->name); + fprintf(stderr, + "Error, found two layers containing LoRA target module " + "name '%s'. Layer 1: %s, Layer 2: %s\n", + target_module_name.c_str(), + target_module->name, + it->name); assert(false); } target_module = it; @@ -69,7 +77,8 @@ PEFTModelID FFModel::add_lora_layer(LoraLinearConfig const peft_config) { idx++; } Layer *peft_layer = nullptr; - if (base_layer_to_peft_layer.find(target_module) != base_layer_to_peft_layer.end()) { + if (base_layer_to_peft_layer.find(target_module) != + base_layer_to_peft_layer.end()) { // lora linear layer already added, no need to add again peft_layer = base_layer_to_peft_layer[target_module]; peft_layer_to_peft_id[peft_layer].push_back(peft_model_id); @@ -77,25 +86,31 @@ PEFTModelID FFModel::add_lora_layer(LoraLinearConfig const peft_config) { Tensor const input = target_module->inputs[0]; Tensor const output = target_module->outputs[0]; assert(input->data_type == output->data_type); - std::string name_ = target_module->name ? std::string(target_module->name) : std::string(""); + std::string name_ = target_module->name ? 
std::string(target_module->name) + : std::string(""); name_ += ".lora"; Layer *peft_layer = new Layer(this, - OP_LORA, - output->data_type, - name_.c_str(), - 2 /*inputs*/, - 0 /*weights*/, - 1 /*outputs*/, - input, - output); + OP_LORA, + output->data_type, + name_.c_str(), + 2 /*inputs*/, + 0 /*weights*/, + 1 /*outputs*/, + input, + output); { int numdims = output->num_dims; int dims[MAX_TENSOR_DIM]; for (int i = 0; i < numdims; i++) { dims[i] = output->dims[i]; } - peft_layer->outputs[0] = create_tensor_legion_ordering( - numdims, dims, output->data_type, peft_layer, 0, true /*create_grad*/); + peft_layer->outputs[0] = + create_tensor_legion_ordering(numdims, + dims, + output->data_type, + peft_layer, + 0, + true /*create_grad*/); } layers.insert(layers.begin() + idx + 1, peft_layer); base_layer_to_peft_layer[target_module] = peft_layer; @@ -103,7 +118,7 @@ PEFTModelID FFModel::add_lora_layer(LoraLinearConfig const peft_config) { peft_layer_to_peft_id[peft_layer].push_back(peft_model_id); } } - + return peft_model_id; } @@ -112,9 +127,11 @@ Op *LoraLinear::create_operator_from_layer( Layer const *layer, std::vector const &inputs) { std::unordered_map _peft_configs; - std::vector const &peft_ids = model.peft_layer_to_peft_id[(Layer*)layer]; - for (int i=0; i const &peft_ids = + model.peft_layer_to_peft_id[(Layer *)layer]; + for (int i = 0; i < peft_ids.size(); i++) { + _peft_configs.emplace( + std::make_pair(peft_ids[i], model.peft_configs[peft_ids[i]])); } return new LoraLinear(model, layer->layer_guid, @@ -122,15 +139,21 @@ Op *LoraLinear::create_operator_from_layer( inputs[0], inputs[1], _peft_configs, - layer->name);; + layer->name); + ; } LoraLinear::LoraLinear(FFModel &model, LoraLinear const &other, ParallelTensor const input, ParallelTensor const output) - : LoraLinear( - model, other.layer_guid, other.op_type, input, output, other.peft_configs, other.name) {} + : LoraLinear(model, + other.layer_guid, + other.op_type, + input, + output, + other.peft_configs, + other.name) {} LoraLinear::LoraLinear(FFModel &model, Params const ¶ms, @@ -144,13 +167,14 @@ LoraLinear::LoraLinear(FFModel &model, params.peft_configs, params.name) {} -LoraLinear::LoraLinear(FFModel &model, - LayerID const &_layer_guid, - OperatorType _op_type, - ParallelTensor const _input, - ParallelTensor const _output, - std::unordered_map const &_peft_configs, - char const *name) +LoraLinear::LoraLinear( + FFModel &model, + LayerID const &_layer_guid, + OperatorType _op_type, + ParallelTensor const _input, + ParallelTensor const _output, + std::unordered_map const &_peft_configs, + char const *name) : Op(model, _op_type, _output->data_type, @@ -179,8 +203,8 @@ LoraLinear::LoraLinear(FFModel &model, outputs[0] = model.create_parallel_tensor_legion_ordering( numdim, dims, inputs[1]->data_type, this); } - for (const auto& kv : _peft_configs) { - peft_configs.insert(kv); + for (auto const &kv : _peft_configs) { + peft_configs.insert(kv); } // assert(check_output_input_weight_parallel_dims(allocate_weights)); } @@ -300,8 +324,10 @@ OpMeta *LoraLinear::init_task(Task const *task, int shard_id = task->index_point.point_data[0]; int num_dims = lora->inputs[0]->num_dims; - assert(in_dim == lora->inputs[0]->dims[0].size / lora->inputs[0]->dims[0].degree); - assert(out_dim == lora->inputs[1]->dims[0].size / lora->inputs[1]->dims[0].degree); + assert(in_dim == + lora->inputs[0]->dims[0].size / lora->inputs[0]->dims[0].degree); + assert(out_dim == + lora->inputs[1]->dims[0].size / lora->inputs[1]->dims[0].degree); DataType dt = 
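// Caller-side sketch of the API being wired up here, as the C++ drivers use it
// (the cache path variable and the adapter name below are placeholders):
//   LoraLinearConfig peft_config(file_paths.cache_folder_path,
//                                "goliaro/llama-2-7b-lora-half");
//   PEFTModelID peft_model_id = model.add_lora_layer(peft_config);
// add_lora_layer returns PEFTModelID::NO_ID when the config lists no target
// modules; otherwise the returned id is later attached to individual
// inference or fine-tuning requests.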
m->input_type[0]; assert(dt == m->input_type[1]); @@ -311,25 +337,26 @@ OpMeta *LoraLinear::init_task(Task const *task, assert(dt == lora->outputs[0]->data_type); // get layer name - assert(lora->name != nullptr && "Layer name is not set, cannot determine weights location"); + assert(lora->name != nullptr && + "Layer name is not set, cannot determine weights location"); std::string lora_layername = std::string(lora->name); std::string searchString = "lora"; size_t found = lora_layername.find(searchString); if (found == std::string::npos) { std::cout << "LoraLinear layer name not in the right format (does not " - "contain word 'lora')" + "contain word 'lora')" << std::endl; assert(false); } std::string lora_layername_substr = lora_layername.substr(0, found + searchString.length()); - for (const auto& kv : lora->peft_configs) { + for (auto const &kv : lora->peft_configs) { PEFTModelID const &model_id = kv.first; LoraLinearConfig const &lora_config = kv.second; - + int rank = lora_config.rank; - + int w0_num_elements = rank * in_dim; int w1_num_elements = rank * out_dim; @@ -338,8 +365,10 @@ OpMeta *LoraLinear::init_task(Task const *task, weight.out_dim = out_dim; weight.rank = rank; PEFTWeightAllocator *allocator = m->handle.peft_weight_allocator; - weight.w0_ptr = allocator->allocate_local_weights_untyped(model_id, w0_num_elements * data_type_size(dt)); - weight.w1_ptr = allocator->allocate_local_weights_untyped(model_id, w1_num_elements * data_type_size(dt)); + weight.w0_ptr = allocator->allocate_local_weights_untyped( + model_id, w0_num_elements * data_type_size(dt)); + weight.w1_ptr = allocator->allocate_local_weights_untyped( + model_id, w1_num_elements * data_type_size(dt)); // load weights from file std::string weights_folder_filepath = join_path({ @@ -347,10 +376,10 @@ OpMeta *LoraLinear::init_task(Task const *task, lora_config.peft_model_id, dt == DT_FLOAT ? 
"full-precision" : "half-precision", }); - std::string w0_filepath = - join_path({weights_folder_filepath, lora_layername_substr + "_A_weight"}); - std::string w1_filepath = - join_path({weights_folder_filepath, lora_layername_substr + "_B_weight"}); + std::string w0_filepath = join_path( + {weights_folder_filepath, lora_layername_substr + "_A_weight"}); + std::string w1_filepath = join_path( + {weights_folder_filepath, lora_layername_substr + "_B_weight"}); if (dt == DT_FLOAT) { std::cout << "Loading LORA weight " << lora_layername_substr + "_A_weight" << ", size: " << w0_num_elements << ", shard: " << shard_id @@ -360,8 +389,11 @@ OpMeta *LoraLinear::init_task(Task const *task, std::cout << "Loading LORA weight " << lora_layername_substr + "_B_weight" << ", size: " << w1_num_elements << ", shard: " << shard_id << std::endl; - load_peft_from_file( - (float *)weight.w1_ptr, w1_num_elements, false, shard_id, w1_filepath); + load_peft_from_file((float *)weight.w1_ptr, + w1_num_elements, + false, + shard_id, + w1_filepath); } else if (dt == DT_HALF) { std::cout << "Loading LORA weight " << lora_layername_substr + "_A_weight" << ", size: " << w0_num_elements << ", shard: " << shard_id @@ -380,15 +412,19 @@ OpMeta *LoraLinear::init_task(Task const *task, if (lora->inputs[0]->dims[num_dims - 1].degree == 1) { // Input is partitioned (no replication) // w0_grad is local weight gradients - weight.w0_grad_ptr = allocator->allocate_local_weights_untyped(model_id, w0_num_elements * data_type_size(dt)); + weight.w0_grad_ptr = allocator->allocate_local_weights_untyped( + model_id, w0_num_elements * data_type_size(dt)); // w1_grad is sync weight gradients - weight.w1_grad_ptr = allocator->allocate_sync_weights_untyped(model_id, w1_num_elements * data_type_size(dt)); + weight.w1_grad_ptr = allocator->allocate_sync_weights_untyped( + model_id, w1_num_elements * data_type_size(dt)); } else { // Input is replicated // w0_grad is sync weight gradients - weight.w0_grad_ptr = allocator->allocate_sync_weights_untyped(model_id, w0_num_elements * data_type_size(dt)); + weight.w0_grad_ptr = allocator->allocate_sync_weights_untyped( + model_id, w0_num_elements * data_type_size(dt)); // w1_grad is local weight gradients - weight.w1_grad_ptr = allocator->allocate_local_weights_untyped(model_id, w1_num_elements * data_type_size(dt)); + weight.w1_grad_ptr = allocator->allocate_local_weights_untyped( + model_id, w1_num_elements * data_type_size(dt)); } assert(m->model_weights.find(model_id) == m->model_weights.end()); m->model_weights[model_id] = weight; @@ -757,8 +793,9 @@ bool LoraLinear::measure_operator_cost(Simulator *sim, } bool operator==(LoraLinearParams const &lhs, LoraLinearParams const &rhs) { - if (lhs.layer_guid == rhs.layer_guid && lhs.type == rhs.type && lhs.peft_configs.size() == rhs.peft_configs.size()) { - for (const auto& kv : lhs.peft_configs) { + if (lhs.layer_guid == rhs.layer_guid && lhs.type == rhs.type && + lhs.peft_configs.size() == rhs.peft_configs.size()) { + for (auto const &kv : lhs.peft_configs) { auto it = rhs.peft_configs.find(kv.first); if (it == rhs.peft_configs.end() || !(it->second == kv.second)) { return false; @@ -775,9 +812,28 @@ void LoraLinear::serialize(Legion::Serializer &sez) const { sez.serialize(this->layer_guid.model_id); sez.serialize(this->op_type); sez.serialize(this->peft_configs.size()); - for (const auto& kv : this->peft_configs) { - sez.serialize(kv.first); - sez.serialize(kv.second); + for (auto const &kv : this->peft_configs) { + // Serialize PEFTModelID + 
sez.serialize(kv.first.id); + // Serialize LoraLinearConfig + sez.serialize(kv.second.rank); + sez.serialize(kv.second.optimizer_type); + sez.serialize(kv.second.learning_rate); + sez.serialize(kv.second.config_folder.length()); + sez.serialize(kv.second.config_folder.c_str(), + kv.second.config_folder.length()); + sez.serialize(kv.second.peft_model_id.length()); + sez.serialize(kv.second.peft_model_id.c_str(), + kv.second.peft_model_id.length()); + sez.serialize(kv.second.lora_alpha); + sez.serialize(kv.second.lora_dropout); + sez.serialize(kv.second.target_modules.size()); + sez.serialize(kv.second.load_weights_from_file); + for (int i = 0; i < kv.second.target_modules.size(); i++) { + sez.serialize(kv.second.target_modules[i].length()); + sez.serialize(kv.second.target_modules[i].c_str(), + kv.second.target_modules[i].length()); + } } sez.serialize(strlen(this->name)); sez.serialize(this->name, strlen(this->name)); @@ -793,22 +849,61 @@ Node LoraLinear::deserialize(FFModel &ff, size_t id, transformer_layer_id, deserialized_model_id; OperatorType op_type; size_t num_pefts; - PEFTModelID peft_model_id; - LoraLinearConfig peft_config; size_t name_len; char name[MAX_OPNAME] = {0}; - + LoraLinearParams params; - + dez.deserialize(id); dez.deserialize(transformer_layer_id); dez.deserialize(deserialized_model_id); dez.deserialize(op_type); dez.deserialize(num_pefts); - for (int i=0; i::operator()( hash_combine(key, params.layer_guid.id); hash_combine(key, params.layer_guid.transformer_layer_id); hash_combine(key, params.layer_guid.model_id); - for (const auto& kv : params.peft_configs) { - hash_combine(key, kv.first); - hash_combine(key, kv.second); + for (auto const &kv : params.peft_configs) { + hash_combine(key, kv.first.id); + hash_combine(key, kv.second.rank); + hash_combine(key, kv.second.optimizer_type); + hash_combine(key, kv.second.learning_rate); + hash_combine(key, kv.second.config_folder); + hash_combine(key, kv.second.peft_model_id); + hash_combine(key, kv.second.lora_alpha); + hash_combine(key, kv.second.lora_dropout); + hash_combine(key, kv.second.target_modules); + hash_combine(key, kv.second.load_weights_from_file); } return key; } diff --git a/src/ops/lora_linear_params.cc b/src/ops/lora_linear_params.cc index 595743ac33..771cf94906 100644 --- a/src/ops/lora_linear_params.cc +++ b/src/ops/lora_linear_params.cc @@ -32,7 +32,7 @@ LoraLinearConfig::LoraLinearConfig(std::string const &config_folder_, lora_alpha = model_config["lora_alpha"]; lora_dropout = model_config["lora_dropout"]; for (auto &s : model_config["target_modules"]) { - target_modules.push_back(s); + target_modules.push_back(s); } } catch (json::exception const &e) { std::cerr << "Error parsing PEFT config from JSON file: " << e.what() @@ -51,11 +51,14 @@ LoraLinearConfig::LoraLinearConfig(std::string const &config_folder_, bool operator==(LoraLinearConfig const &lhs, LoraLinearConfig const &rhs) { if (lhs.rank == rhs.rank && lhs.optimizer_type == rhs.optimizer_type && - lhs.learning_rate == rhs.learning_rate && lhs.config_folder == rhs.config_folder && - lhs.peft_model_id == rhs.peft_model_id && lhs.lora_alpha == rhs.lora_alpha && - lhs.lora_dropout == rhs.lora_dropout && lhs.target_modules.size() == rhs.target_modules.size() && + lhs.learning_rate == rhs.learning_rate && + lhs.config_folder == rhs.config_folder && + lhs.peft_model_id == rhs.peft_model_id && + lhs.lora_alpha == rhs.lora_alpha && + lhs.lora_dropout == rhs.lora_dropout && + lhs.target_modules.size() == rhs.target_modules.size() && 
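// The strings above travel through the Legion serializer as a length followed
// by the raw characters, and are read back into a fixed-size buffer on the
// other end. Two hypothetical helpers (not part of FlexFlow) that capture the
// convention used by serialize()/deserialize() in this file:
void serialize_string(Legion::Serializer &sez, std::string const &s) {
  sez.serialize(s.length());
  sez.serialize(s.c_str(), s.length());
}

std::string deserialize_string(Legion::Deserializer &dez) {
  size_t len = 0;
  char buffer[4096] = {0}; // same fixed bound as the deserializer in this file
  dez.deserialize(len);
  dez.deserialize(buffer, len);
  return std::string(buffer, len);
}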
lhs.load_weights_from_file == rhs.load_weights_from_file) { - for (int i=0; i Date: Sun, 3 Mar 2024 02:14:31 +0000 Subject: [PATCH 13/32] fix --- inference/incr_decoding/incr_decoding.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc index e7d4cf16fb..61d56e62e1 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -231,7 +231,7 @@ void FlexFlow::top_level_task(Task const *task, LoraLinearConfig peft_config = peft_model_name.empty() ? LoraLinearConfig::EmptyConfig - : LoraLinearConfig(file_paths.cache_folder_path, peft_model_name); + : LoraLinearConfig(join_path({file_paths.cache_folder_path, "configs"}), peft_model_name); GenerationConfig generationConfig(do_sample, temperature, topp); RequestManager *rm = RequestManager::get_request_manager(); From 6d41b29d99b112b3251dceb22c1700c9b85f415c Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sun, 3 Mar 2024 03:10:32 +0000 Subject: [PATCH 14/32] fix --- config/config.linux | 2 +- src/ops/lora_linear.cc | 125 +++++++++++++++++++++-------------------- 2 files changed, 65 insertions(+), 62 deletions(-) diff --git a/config/config.linux b/config/config.linux index 30edfa7dfe..4c70f95d3f 100755 --- a/config/config.linux +++ b/config/config.linux @@ -83,7 +83,7 @@ FF_MAX_DIM=${FF_MAX_DIM:-5} BUILD_LEGION_ONLY=${BUILD_LEGION_ONLY:-OFF} # set LEGION_MAX_RETURN_SIZE -LEGION_MAX_RETURN_SIZE=${LEGION_MAX_RETURN_SIZE:-262144} +LEGION_MAX_RETURN_SIZE=${LEGION_MAX_RETURN_SIZE:-2097152} # set ROCM path ROCM_PATH=${ROCM_PATH:-"/opt/rocm"} diff --git a/src/ops/lora_linear.cc b/src/ops/lora_linear.cc index 8a54709df6..3a597ad540 100644 --- a/src/ops/lora_linear.cc +++ b/src/ops/lora_linear.cc @@ -38,6 +38,16 @@ using Legion::TaskLauncher; using namespace FlexFlow::Kernels::LoraLinear; +bool check_lora_layer_match(Layer *potential_target, std::string target_module_name) { + if (potential_target->op_type == OP_LINEAR && potential_target->name != nullptr && strlen(potential_target->name) > 0) { + std::string s(potential_target->name); + if (s.find(target_module_name) != std::string::npos && s.find("lora") == std::string::npos) { + return true; + } + } + return false; +} + PEFTModelID FFModel::add_lora_layer(LoraLinearConfig const peft_config) { assert(config.enable_peft && "Cannot add a LoRA layer if PEFT mode is not enabled"); @@ -51,71 +61,64 @@ PEFTModelID FFModel::add_lora_layer(LoraLinearConfig const peft_config) { for (std::string target_module_name : peft_config.target_modules) { assert(target_module_name.length() > 0 && "LoRA target module name is empty"); - // find target layer, and ensure uniqueness. - // if the target layer already has a LoRA layer, no need to add it again - // (keep track of layers with lora) - Layer *target_module = nullptr; - int idx; - for (Layer *it : layers) { - if (it->op_type == OP_LINEAR && it->name != nullptr && - strlen(it->name) > 0) { - std::string s(it->name); - if (s.find(target_module_name) != std::string::npos) { - // Check that this is the only layer with target name - if (target_module != nullptr) { - fprintf(stderr, - "Error, found two layers containing LoRA target module " - "name '%s'. 
Layer 1: %s, Layer 2: %s\n", - target_module_name.c_str(), - target_module->name, - it->name); - assert(false); + // find target layer + for (auto it = layers.begin(); it != layers.end(); ++it) { + Layer *target_module = *it; + bool match = check_lora_layer_match(target_module, target_module_name); + if (!match) continue; + + if (base_layer_to_peft_layer.find(target_module) != + base_layer_to_peft_layer.end()) { + // lora linear layer already added, no need to add again + Layer *peft_layer = base_layer_to_peft_layer[target_module]; + peft_layer_to_peft_id[peft_layer].push_back(peft_model_id); + } else { + Tensor const input = target_module->inputs[0]; + Tensor const output = target_module->outputs[0]; + assert(input->data_type == output->data_type); + std::string name_ = target_module->name ? std::string(target_module->name) + : std::string(""); + size_t last_underscore = name_.length() - 1; + for (int i = name_.length() - 1; i > 0; i--) { + if (!(std::isdigit(target_module->name[i]) || target_module->name[i] == '_')) { + break; + } else if (target_module->name[i] == '_') { + last_underscore = i; } - target_module = it; } - } - idx++; - } - Layer *peft_layer = nullptr; - if (base_layer_to_peft_layer.find(target_module) != - base_layer_to_peft_layer.end()) { - // lora linear layer already added, no need to add again - peft_layer = base_layer_to_peft_layer[target_module]; - peft_layer_to_peft_id[peft_layer].push_back(peft_model_id); - } else { - Tensor const input = target_module->inputs[0]; - Tensor const output = target_module->outputs[0]; - assert(input->data_type == output->data_type); - std::string name_ = target_module->name ? std::string(target_module->name) - : std::string(""); - name_ += ".lora"; - Layer *peft_layer = new Layer(this, - OP_LORA, - output->data_type, - name_.c_str(), - 2 /*inputs*/, - 0 /*weights*/, - 1 /*outputs*/, - input, - output); - { - int numdims = output->num_dims; - int dims[MAX_TENSOR_DIM]; - for (int i = 0; i < numdims; i++) { - dims[i] = output->dims[i]; + name_.erase(last_underscore); + + name_ += ".lora"; + std::cout << "Adding layer " << name_ << std::endl; + Layer *peft_layer = new Layer(this, + OP_LORA, + output->data_type, + name_.c_str(), + 2 /*inputs*/, + 0 /*weights*/, + 1 /*outputs*/, + input, + output); + { + int numdims = output->num_dims; + int dims[MAX_TENSOR_DIM]; + for (int i = 0; i < numdims; i++) { + dims[i] = output->dims[i]; + } + peft_layer->outputs[0] = + create_tensor_legion_ordering(numdims, + dims, + output->data_type, + peft_layer, + 0, + true /*create_grad*/); } - peft_layer->outputs[0] = - create_tensor_legion_ordering(numdims, - dims, - output->data_type, - peft_layer, - 0, - true /*create_grad*/); + layers.insert(it + 1, peft_layer); + ++it; + base_layer_to_peft_layer[target_module] = peft_layer; + peft_layer_to_peft_id[peft_layer] = std::vector(); + peft_layer_to_peft_id[peft_layer].push_back(peft_model_id); } - layers.insert(layers.begin() + idx + 1, peft_layer); - base_layer_to_peft_layer[target_module] = peft_layer; - peft_layer_to_peft_id[peft_layer] = std::vector(); - peft_layer_to_peft_id[peft_layer].push_back(peft_model_id); } } From c5d735fe56efc556da33d8882c817bb6858894d3 Mon Sep 17 00:00:00 2001 From: april-yyt Date: Tue, 12 Mar 2024 19:30:16 +0000 Subject: [PATCH 15/32] fix issues for downloading peft model --- inference/utils/download_peft_model.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/inference/utils/download_peft_model.py b/inference/utils/download_peft_model.py index 
5c7704b6f0..ea4c96a05f 100644 --- a/inference/utils/download_peft_model.py +++ b/inference/utils/download_peft_model.py @@ -5,6 +5,9 @@ def parse_args(): parser = argparse.ArgumentParser() + parser.add_argument( + "--base_model_name", type=str, help="Name of the model to download" + ) parser.add_argument( "peft_model_ids", type=str, nargs="+", help="Name of the model(s) to download" ) @@ -44,7 +47,14 @@ def main(args): for peft_model_id in args.peft_model_ids: for data_type in data_types: + llm = ff.LLM( + args.base_model_name, + data_type=data_type, + cache_path=args.cache_folder, + refresh_cache=args.refresh_cache, + ) peft = ff.PEFT( + llm, peft_model_id, data_type=data_type, cache_path=args.cache_folder, From 5883eb635736a5f3684822aa221e15d84f6b34e3 Mon Sep 17 00:00:00 2001 From: april-yyt Date: Fri, 15 Mar 2024 04:35:25 +0000 Subject: [PATCH 16/32] solved issues for download peft model --- python/flexflow/serve/serve.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py index 7e63b5055c..4c0502a2e7 100644 --- a/python/flexflow/serve/serve.py +++ b/python/flexflow/serve/serve.py @@ -154,7 +154,7 @@ def download_hf_config(self): print(f"Saving {self.model_name} configs to file {self.config_path}...") self.hf_config.to_json_file(self.config_path) - def __get_revision_hashes(self, model_name: str, weights: bool): + def _get_revision_hashes(self, model_name: str, weights: bool): ff_revision = None ff_revision_file = ( os.path.join(self.weights_path, "rev_sha.txt") @@ -201,7 +201,7 @@ def download_hf_weights_if_needed(self): os.makedirs(self.weights_path, exist_ok=True) #print(f"Creating directory {self.weights_path} (if it doesn't exist)...") - ff_revision, ff_revision_file, latest_revision = self.__get_revision_hashes( + ff_revision, ff_revision_file, latest_revision = self._get_revision_hashes( self.model_name, weights=True ) @@ -541,6 +541,7 @@ def __init__( refresh_cache: bool = False, ): self.peft_model_id = peft_model_id + self.model_name = peft_model_id self.hf_config = config if config is not None else PeftConfig.from_pretrained(peft_model_id) self.peft_type = self.hf_config.peft_type if self.peft_type != "LORA": @@ -581,8 +582,9 @@ def default(self, obj): self.base_model.download_hf_config() - def __get_revision_hashes(self, peft_model_id: str): - return super().__get_revision_hashes(peft_model_id, weights=True) + def _get_revision_hashes(self, peft_model_id: str, weights: bool): + model_name = self.peft_model_id + return super()._get_revision_hashes(model_name, weights) def convert_peft_model(self, hf_peft_model, weights_path): for name, params in hf_peft_model.named_parameters(): @@ -619,8 +621,9 @@ def download_hf_weights_if_needed(self): os.makedirs(self.weights_path, exist_ok=True) #print(f"Creating directory {self.weights_path} (if it doesn't exist)...") - ff_revision, ff_revision_file, latest_revision = self.__get_revision_hashes( - self.peft_model_id + ff_revision, ff_revision_file, latest_revision = self._get_revision_hashes( + self.peft_model_id, + True ) # Download if needed From ce44ff95bde86606731dfcd07b2eea5ca7ac44fb Mon Sep 17 00:00:00 2001 From: april-yyt Date: Wed, 20 Mar 2024 22:40:32 +0000 Subject: [PATCH 17/32] added printouts for debugging --- src/runtime/inference_manager.cc | 33 ++++++++++++++++++++++++++++++++ src/runtime/request_manager.cc | 28 +++++++++++++++++++++++++++ 2 files changed, 61 insertions(+) diff --git a/src/runtime/inference_manager.cc 
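# With this change a PEFT adapter is no longer downloaded in isolation: the
# script first constructs the base ff.LLM and hands it to ff.PEFT. Roughly, the
# per-adapter flow looks like this (model names below are placeholders, and the
# optional cache/refresh arguments follow the script above):
import flexflow.serve as ff

llm = ff.LLM("meta-llama/Llama-2-7b-hf", data_type=ff.DataType.DT_HALF)
peft = ff.PEFT(llm, "goliaro/llama-2-7b-lora-half", data_type=ff.DataType.DT_HALF)
peft.download_hf_weights_if_needed()
peft.download_hf_config()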
b/src/runtime/inference_manager.cc index 91a6dab9b5..638ded2823 100644 --- a/src/runtime/inference_manager.cc +++ b/src/runtime/inference_manager.cc @@ -54,10 +54,28 @@ bool parallel_tensor_list_overlaps(std::vector const &list1, } void InferenceManager::compile_model_and_allocate_buffer(FFModel *model) { + + // Check if the model object exists + if (model == nullptr) { + std::cout << "###PEFT DEBUGGING### Model object does not exist." << std::endl; + return; // Early return to prevent further operations on a nullptr + } else { + std::cout << "###PEFT DEBUGGING### Model object exists." << std::endl; + } + // TODO: currently assume there is a single data-parallel pipeline // (i.e., data-parallel-degree == 1) assert(model->config.data_parallelism_degree == 1); model->config.batchSize = BatchConfig::max_tokens_per_batch(); + + // Check if the model object exists after importing config + if (model == nullptr) { + std::cout << "###PEFT DEBUGGING### Model object does not exist after setting config and batch size." << std::endl; + return; // Early return to prevent further operations on a nullptr + } else { + std::cout << "###PEFT DEBUGGING### Model object still exists." << std::endl; + } + model->compile_inference(); Context ctx = model->config.lg_ctx; Runtime *runtime = model->config.lg_hlr; @@ -609,17 +627,23 @@ void FFModel::set_position_offset(int offset) { } void FFModel::compile_inference() { + std::cout << "###PEFT DEBUGGING### Entering compile_inference." << std::endl; + // Request at least four CPU processors for inference runs assert( config.cpusPerNode >= 4 && "FlexFlow Serve requires at least four CPU cores per node, please add " "`-ll:cpu 4` in the command line if you are using the C++ interface or " "set `num_cpus` in `ff.init` if you are using the Python interface"); + + std::cout << "###PEFT DEBUGGING### Configuration check passed: At least four CPU cores per node." << std::endl; Context ctx = config.lg_ctx; Runtime *runtime = config.lg_hlr; config.computationMode = COMP_MODE_INFERENCE; create_operators_from_layers(); + // Launch the graph optimize task + std::cout << "###PEFT DEBUGGING### Launching graph optimization task." << std::endl; { FFModel *model = this; TaskLauncher launcher(GRAPH_OPTIMIZE_TASK_ID, @@ -670,6 +694,11 @@ void FFModel::compile_inference() { } } } + + std::cout << "###PEFT DEBUGGING### Operators reconstructed from optimized graph." << std::endl; + // Perform inplace optimizations + std::cout << "###PEFT DEBUGGING### Starting inplace optimizations." << std::endl; + loss_op = nullptr; metrics_op = nullptr; // Perform inplace optimizations @@ -709,6 +738,8 @@ void FFModel::compile_inference() { } } + // Output tensor mapping + std::cout << "###PEFT DEBUGGING### Mapping output tensors." << std::endl; for (size_t l = 0; l < operators.size(); l++) { Op *op = operators[l]; @@ -734,6 +765,7 @@ void FFModel::compile_inference() { } #ifdef FF_USE_NCCL + std::cout << "###PEFT DEBUGGING### Setting up NCCL communications." << std::endl; for (size_t l = 0; l < operators.size(); l++) { // Only create nccl for allreduce and fusedop for inference // (fusedop may include allreduces) @@ -770,6 +802,7 @@ void FFModel::compile_inference() { } } #endif + std::cout << "###PEFT DEBUGGING### compile_inference completed successfully." 
<< std::endl; } std::string join_path(std::vector const &paths) { diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index e9e21df52a..535278a3d9 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -2450,6 +2450,16 @@ void RequestManager::background_serving_task( std::vector const ®ions, Context ctx, Runtime *runtime) { + + + auto print_timestamped_message = [](const std::string& message) { + auto now = std::chrono::system_clock::to_time_t(std::chrono::system_clock::now()); + std::cout << std::put_time(std::localtime(&now), "%Y-%m-%d %X") << " - " << message << std::endl; + }; + + // Print at the start of the task + print_timestamped_message("###PEFT DEBUGGING### Starting background serving task."); + RequestManager *rm = RequestManager::get_request_manager(); FFModel *llm = *(FFModel **)task->args; { @@ -2466,6 +2476,11 @@ void RequestManager::background_serving_task( ssm->config.lg_ctx = ctx; } } + + // Checkpoint print + print_timestamped_message("###PEFT DEBUGGING### Updated models' configuration."); + + if (rm->get_num_ssms() == 0) { // No SSMs: perform incremental decoding rm->serve_incr_decoding(llm); @@ -2473,6 +2488,10 @@ void RequestManager::background_serving_task( // Registered SSMs: perform speculative inference rm->serve_spec_infer(llm); } + + // Print at the end of the task + print_timestamped_message("###PEFT DEBUGGING### Background serving task completed."); + } std::string find_layer_name_from_guid(FFModel *model, LayerID guid) { @@ -2497,6 +2516,15 @@ bool is_peft_operator_type(OperatorType type) { /*static*/ void RequestManager::serve_incr_decoding(FFModel *llm) { + + // Check if the model object exists + if (llm == nullptr) { + std::cout << "###PEFT DEBUGGING### LLM Model object does not exist." << std::endl; + return; // Early return to prevent further operations on a nullptr + } else { + std::cout << "###PEFT DEBUGGING### LLM Model object exists." 
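// A self-contained version of the timestamped-print helper introduced above,
// with the headers it relies on spelled out (the original lambda presumably
// picks them up transitively through the existing includes):
#include <chrono>
#include <ctime>
#include <iomanip>
#include <iostream>
#include <string>

void print_timestamped_message(std::string const &message) {
  auto now =
      std::chrono::system_clock::to_time_t(std::chrono::system_clock::now());
  std::cout << std::put_time(std::localtime(&now), "%Y-%m-%d %X") << " - "
            << message << std::endl;
}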
<< std::endl; + } + Context ctx = llm->config.lg_ctx; Runtime *runtime = llm->config.lg_hlr; // Compile the llm From 4adf6ea8ac6ea5ec4f96498446c88f7cbbbc0f76 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Thu, 21 Mar 2024 05:13:41 +0000 Subject: [PATCH 18/32] fix --- inference/utils/download_peft_model.py | 1 + python/flexflow/serve/serve.py | 10 +++++----- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/inference/utils/download_peft_model.py b/inference/utils/download_peft_model.py index ea4c96a05f..bc2ba59b30 100644 --- a/inference/utils/download_peft_model.py +++ b/inference/utils/download_peft_model.py @@ -62,6 +62,7 @@ def main(args): ) peft.download_hf_weights_if_needed() peft.download_hf_config() + peft.download_hf_tokenizer_if_needed() if __name__ == "__main__": diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py index 4c0502a2e7..9997527f0d 100644 --- a/python/flexflow/serve/serve.py +++ b/python/flexflow/serve/serve.py @@ -154,7 +154,7 @@ def download_hf_config(self): print(f"Saving {self.model_name} configs to file {self.config_path}...") self.hf_config.to_json_file(self.config_path) - def _get_revision_hashes(self, model_name: str, weights: bool): + def __get_revision_hashes(self, model_name: str, weights: bool): ff_revision = None ff_revision_file = ( os.path.join(self.weights_path, "rev_sha.txt") @@ -201,7 +201,7 @@ def download_hf_weights_if_needed(self): os.makedirs(self.weights_path, exist_ok=True) #print(f"Creating directory {self.weights_path} (if it doesn't exist)...") - ff_revision, ff_revision_file, latest_revision = self._get_revision_hashes( + ff_revision, ff_revision_file, latest_revision = self.__get_revision_hashes( self.model_name, weights=True ) @@ -582,9 +582,9 @@ def default(self, obj): self.base_model.download_hf_config() - def _get_revision_hashes(self, peft_model_id: str, weights: bool): + def __get_revision_hashes(self, peft_model_id: str, weights: bool): model_name = self.peft_model_id - return super()._get_revision_hashes(model_name, weights) + return self._LLM__get_revision_hashes(model_name, weights) def convert_peft_model(self, hf_peft_model, weights_path): for name, params in hf_peft_model.named_parameters(): @@ -621,7 +621,7 @@ def download_hf_weights_if_needed(self): os.makedirs(self.weights_path, exist_ok=True) #print(f"Creating directory {self.weights_path} (if it doesn't exist)...") - ff_revision, ff_revision_file, latest_revision = self._get_revision_hashes( + ff_revision, ff_revision_file, latest_revision = self.__get_revision_hashes( self.peft_model_id, True ) From 53e8919dc65b2e4735cbc609a23813d55fda3f36 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 22 Mar 2024 19:45:40 +0000 Subject: [PATCH 19/32] fix seg fault --- config/config.linux | 2 +- include/flexflow/ops/lora_linear_params.h | 4 +- inference/incr_decoding/incr_decoding.cc | 2 +- src/ops/lora_linear.cc | 97 +++++++++-------------- src/ops/lora_linear_params.cc | 14 ++-- src/runtime/inference_manager.cc | 31 +++++--- src/runtime/request_manager.cc | 23 +++--- 7 files changed, 81 insertions(+), 92 deletions(-) diff --git a/config/config.linux b/config/config.linux index 4c70f95d3f..30edfa7dfe 100755 --- a/config/config.linux +++ b/config/config.linux @@ -83,7 +83,7 @@ FF_MAX_DIM=${FF_MAX_DIM:-5} BUILD_LEGION_ONLY=${BUILD_LEGION_ONLY:-OFF} # set LEGION_MAX_RETURN_SIZE -LEGION_MAX_RETURN_SIZE=${LEGION_MAX_RETURN_SIZE:-2097152} +LEGION_MAX_RETURN_SIZE=${LEGION_MAX_RETURN_SIZE:-262144} # set ROCM path 
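# The serve.py change above works around Python name mangling: a method whose
# name starts with two underscores is stored as _<ClassName>__<method>, so the
# PEFT subclass cannot reach LLM.__get_revision_hashes through a plain
# attribute lookup and calls self._LLM__get_revision_hashes instead. A tiny
# illustration of the rule (the classes here are hypothetical):
class Base:
    def __hidden(self):
        return "base"

class Child(Base):
    def call(self):
        # Base.__hidden is mangled to _Base__hidden, which is the name to use.
        return self._Base__hidden()

assert Child().call() == "base"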
ROCM_PATH=${ROCM_PATH:-"/opt/rocm"} diff --git a/include/flexflow/ops/lora_linear_params.h b/include/flexflow/ops/lora_linear_params.h index dfc78d0683..ff041334f1 100644 --- a/include/flexflow/ops/lora_linear_params.h +++ b/include/flexflow/ops/lora_linear_params.h @@ -17,7 +17,7 @@ class LoraLinearConfig { LoraLinearConfig(int rank, OptimizerType type = OPTIMIZER_TYPE_SGD, float learning_rate = 1e-4); - LoraLinearConfig(std::string const &config_folder_, + LoraLinearConfig(std::string const &cache_folder_, std::string const &peft_model_id_); friend bool operator==(LoraLinearConfig const &lhs, LoraLinearConfig const &rhs); @@ -28,7 +28,7 @@ class LoraLinearConfig { int rank; OptimizerType optimizer_type; float learning_rate; - std::string config_folder; + std::string cache_folder; // Huggingface std::string peft_model_id; int lora_alpha; diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc index 61d56e62e1..e7d4cf16fb 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -231,7 +231,7 @@ void FlexFlow::top_level_task(Task const *task, LoraLinearConfig peft_config = peft_model_name.empty() ? LoraLinearConfig::EmptyConfig - : LoraLinearConfig(join_path({file_paths.cache_folder_path, "configs"}), peft_model_name); + : LoraLinearConfig(file_paths.cache_folder_path, peft_model_name); GenerationConfig generationConfig(do_sample, temperature, topp); RequestManager *rm = RequestManager::get_request_manager(); diff --git a/src/ops/lora_linear.cc b/src/ops/lora_linear.cc index 3a597ad540..39934f4cce 100644 --- a/src/ops/lora_linear.cc +++ b/src/ops/lora_linear.cc @@ -38,10 +38,13 @@ using Legion::TaskLauncher; using namespace FlexFlow::Kernels::LoraLinear; -bool check_lora_layer_match(Layer *potential_target, std::string target_module_name) { - if (potential_target->op_type == OP_LINEAR && potential_target->name != nullptr && strlen(potential_target->name) > 0) { +bool check_lora_layer_match(Layer *potential_target, + std::string target_module_name) { + if (potential_target->op_type == OP_LINEAR && + potential_target->name != nullptr && strlen(potential_target->name) > 0) { std::string s(potential_target->name); - if (s.find(target_module_name) != std::string::npos && s.find("lora") == std::string::npos) { + if (s.find(target_module_name) != std::string::npos && + s.find("lora") == std::string::npos) { return true; } } @@ -65,7 +68,9 @@ PEFTModelID FFModel::add_lora_layer(LoraLinearConfig const peft_config) { for (auto it = layers.begin(); it != layers.end(); ++it) { Layer *target_module = *it; bool match = check_lora_layer_match(target_module, target_module_name); - if (!match) continue; + if (!match) { + continue; + } if (base_layer_to_peft_layer.find(target_module) != base_layer_to_peft_layer.end()) { @@ -76,11 +81,13 @@ PEFTModelID FFModel::add_lora_layer(LoraLinearConfig const peft_config) { Tensor const input = target_module->inputs[0]; Tensor const output = target_module->outputs[0]; assert(input->data_type == output->data_type); - std::string name_ = target_module->name ? std::string(target_module->name) - : std::string(""); + std::string name_ = target_module->name + ? 
std::string(target_module->name) + : std::string(""); size_t last_underscore = name_.length() - 1; for (int i = name_.length() - 1; i > 0; i--) { - if (!(std::isdigit(target_module->name[i]) || target_module->name[i] == '_')) { + if (!(std::isdigit(target_module->name[i]) || + target_module->name[i] == '_')) { break; } else if (target_module->name[i] == '_') { last_underscore = i; @@ -375,21 +382,22 @@ OpMeta *LoraLinear::init_task(Task const *task, // load weights from file std::string weights_folder_filepath = join_path({ - lora_config.config_folder, + lora_config.cache_folder, + "weights", lora_config.peft_model_id, dt == DT_FLOAT ? "full-precision" : "half-precision", }); std::string w0_filepath = join_path( - {weights_folder_filepath, lora_layername_substr + "_A_weight"}); + {weights_folder_filepath, lora_layername_substr + "_A.weight"}); std::string w1_filepath = join_path( - {weights_folder_filepath, lora_layername_substr + "_B_weight"}); + {weights_folder_filepath, lora_layername_substr + "_B.weight"}); if (dt == DT_FLOAT) { - std::cout << "Loading LORA weight " << lora_layername_substr + "_A_weight" + std::cout << "Loading LORA weight " << lora_layername_substr + "_A.weight" << ", size: " << w0_num_elements << ", shard: " << shard_id << std::endl; load_peft_from_file( (float *)weight.w0_ptr, w0_num_elements, true, shard_id, w0_filepath); - std::cout << "Loading LORA weight " << lora_layername_substr + "_B_weight" + std::cout << "Loading LORA weight " << lora_layername_substr + "_B.weight" << ", size: " << w1_num_elements << ", shard: " << shard_id << std::endl; load_peft_from_file((float *)weight.w1_ptr, @@ -398,12 +406,12 @@ OpMeta *LoraLinear::init_task(Task const *task, shard_id, w1_filepath); } else if (dt == DT_HALF) { - std::cout << "Loading LORA weight " << lora_layername_substr + "_A_weight" + std::cout << "Loading LORA weight " << lora_layername_substr + "_A.weight" << ", size: " << w0_num_elements << ", shard: " << shard_id << std::endl; load_peft_from_file( (half *)weight.w0_ptr, w0_num_elements, true, shard_id, w0_filepath); - std::cout << "Loading LORA weight " << lora_layername_substr + "_B_weight" + std::cout << "Loading LORA weight " << lora_layername_substr + "_B.weight" << ", size: " << w1_num_elements << ", shard: " << shard_id << std::endl; load_peft_from_file( @@ -818,25 +826,14 @@ void LoraLinear::serialize(Legion::Serializer &sez) const { for (auto const &kv : this->peft_configs) { // Serialize PEFTModelID sez.serialize(kv.first.id); - // Serialize LoraLinearConfig - sez.serialize(kv.second.rank); - sez.serialize(kv.second.optimizer_type); - sez.serialize(kv.second.learning_rate); - sez.serialize(kv.second.config_folder.length()); - sez.serialize(kv.second.config_folder.c_str(), - kv.second.config_folder.length()); + // Serialize LoraConfig's cache folder + sez.serialize(kv.second.cache_folder.length()); + sez.serialize(kv.second.cache_folder.c_str(), + kv.second.cache_folder.length()); + // Serialize LoraConfig's peft model id sez.serialize(kv.second.peft_model_id.length()); sez.serialize(kv.second.peft_model_id.c_str(), kv.second.peft_model_id.length()); - sez.serialize(kv.second.lora_alpha); - sez.serialize(kv.second.lora_dropout); - sez.serialize(kv.second.target_modules.size()); - sez.serialize(kv.second.load_weights_from_file); - for (int i = 0; i < kv.second.target_modules.size(); i++) { - sez.serialize(kv.second.target_modules[i].length()); - sez.serialize(kv.second.target_modules[i].c_str(), - kv.second.target_modules[i].length()); - } } 
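// Note that only cache_folder and peft_model_id are serialized per adapter
// here; the deserializer below reconstructs the full LoraLinearConfig by
// re-reading config.json from the cache, so rank, lora_alpha, lora_dropout and
// target_modules never travel through Legion. The smaller payload is
// presumably what lets LEGION_MAX_RETURN_SIZE go back to its default in this
// same patch.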
sez.serialize(strlen(this->name)); sez.serialize(this->name, strlen(this->name)); @@ -867,44 +864,22 @@ Node LoraLinear::deserialize(FFModel &ff, size_t pid; dez.deserialize(pid); PEFTModelID peft_model_id(pid); - // Deserialize LoraLinearConfig - int rank; - OptimizerType optimizer_type; - float learning_rate; - dez.deserialize(rank); - dez.deserialize(optimizer_type); - dez.deserialize(learning_rate); - LoraLinearConfig lora_linear_config(rank, optimizer_type, learning_rate); + + // Deserialize LoraConfig's cache folder size_t string_size; char buffer[4096] = {0}; - // deserialize config_folder dez.deserialize(string_size); dez.deserialize(buffer, string_size); - lora_linear_config.config_folder = std::string(buffer); + std::string cache_folder = std::string(buffer); + + // Deserialize LoraConfig's peft model id string_size = 0; memset(buffer, 0, 4096); - // deserialize peft_model_id dez.deserialize(string_size); dez.deserialize(buffer, string_size); - lora_linear_config.peft_model_id = std::string(buffer); - string_size = 0; - memset(buffer, 0, 4096); - // deserialize lora_alpha and lora_dropout - dez.deserialize(lora_linear_config.lora_alpha); - dez.deserialize(lora_linear_config.lora_dropout); - // deserialize target_modules - size_t num_target_modules = 0; - dez.deserialize(num_target_modules); - for (int i = 0; i < num_target_modules; i++) { - dez.deserialize(string_size); - dez.deserialize(buffer, string_size); - lora_linear_config.target_modules.push_back(std::string(buffer)); - string_size = 0; - memset(buffer, 0, 4096); - } - // deserialize load_weights_from_file - dez.deserialize(lora_linear_config.load_weights_from_file); - // Append entry to list + std::string peft_model_name = std::string(buffer); + + LoraLinearConfig lora_linear_config(cache_folder, peft_model_name); params.peft_configs.emplace( std::make_pair(peft_model_id, lora_linear_config)); } @@ -956,7 +931,7 @@ size_t hash::operator()( hash_combine(key, kv.second.rank); hash_combine(key, kv.second.optimizer_type); hash_combine(key, kv.second.learning_rate); - hash_combine(key, kv.second.config_folder); + hash_combine(key, kv.second.cache_folder); hash_combine(key, kv.second.peft_model_id); hash_combine(key, kv.second.lora_alpha); hash_combine(key, kv.second.lora_dropout); diff --git a/src/ops/lora_linear_params.cc b/src/ops/lora_linear_params.cc index 771cf94906..1b142d5577 100644 --- a/src/ops/lora_linear_params.cc +++ b/src/ops/lora_linear_params.cc @@ -9,20 +9,20 @@ const LoraLinearConfig LoraLinearConfig::EmptyConfig = LoraLinearConfig(); LoraLinearConfig::LoraLinearConfig() : rank(0), optimizer_type(OPTIMIZER_TYPE_NONE), learning_rate(0.0f), - config_folder(""), peft_model_id(""), lora_alpha(0), lora_dropout(0.0f), + cache_folder(""), peft_model_id(""), lora_alpha(0), lora_dropout(0.0f), load_weights_from_file(false) {} LoraLinearConfig::LoraLinearConfig(int _rank, OptimizerType _type, float _lr) - : rank(_rank), optimizer_type(_type), learning_rate(_lr), config_folder(""), + : rank(_rank), optimizer_type(_type), learning_rate(_lr), cache_folder(""), peft_model_id(""), lora_alpha(0), lora_dropout(0.0f), load_weights_from_file(false) {} -LoraLinearConfig::LoraLinearConfig(std::string const &config_folder_, +LoraLinearConfig::LoraLinearConfig(std::string const &cache_folder_, std::string const &peft_model_id_) { - config_folder = config_folder_; + cache_folder = cache_folder_; peft_model_id = peft_model_id_; std::string peft_inference_config_file_path = - join_path({config_folder, peft_model_id, "config.json"}); + 
join_path({cache_folder, "configs", peft_model_id, "config.json"}); std::ifstream config_file(peft_inference_config_file_path); if (config_file.is_open()) { try { @@ -52,7 +52,7 @@ LoraLinearConfig::LoraLinearConfig(std::string const &config_folder_, bool operator==(LoraLinearConfig const &lhs, LoraLinearConfig const &rhs) { if (lhs.rank == rhs.rank && lhs.optimizer_type == rhs.optimizer_type && lhs.learning_rate == rhs.learning_rate && - lhs.config_folder == rhs.config_folder && + lhs.cache_folder == rhs.cache_folder && lhs.peft_model_id == rhs.peft_model_id && lhs.lora_alpha == rhs.lora_alpha && lhs.lora_dropout == rhs.lora_dropout && @@ -73,7 +73,7 @@ std::ostream &operator<<(std::ostream &os, LoraLinearConfig const &llc) { os << "rank: " << llc.rank << ", "; os << "optimizer_type: " << llc.optimizer_type << ", "; os << "learning_rate: " << llc.learning_rate << ", "; - os << "config_folder: " << llc.config_folder << ", "; + os << "cache_folder: " << llc.cache_folder << ", "; os << "peft_model_id: " << llc.peft_model_id << ", "; os << "lora_alpha: " << llc.lora_alpha << ", "; os << "lora_dropout: " << llc.lora_dropout << ", "; diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc index 638ded2823..212d0ebf6b 100644 --- a/src/runtime/inference_manager.cc +++ b/src/runtime/inference_manager.cc @@ -57,7 +57,8 @@ void InferenceManager::compile_model_and_allocate_buffer(FFModel *model) { // Check if the model object exists if (model == nullptr) { - std::cout << "###PEFT DEBUGGING### Model object does not exist." << std::endl; + std::cout << "###PEFT DEBUGGING### Model object does not exist." + << std::endl; return; // Early return to prevent further operations on a nullptr } else { std::cout << "###PEFT DEBUGGING### Model object exists." << std::endl; @@ -70,12 +71,14 @@ void InferenceManager::compile_model_and_allocate_buffer(FFModel *model) { // Check if the model object exists after importing config if (model == nullptr) { - std::cout << "###PEFT DEBUGGING### Model object does not exist after setting config and batch size." << std::endl; + std::cout << "###PEFT DEBUGGING### Model object does not exist after " + "setting config and batch size." + << std::endl; return; // Early return to prevent further operations on a nullptr } else { std::cout << "###PEFT DEBUGGING### Model object still exists." << std::endl; } - + model->compile_inference(); Context ctx = model->config.lg_ctx; Runtime *runtime = model->config.lg_hlr; @@ -628,7 +631,7 @@ void FFModel::set_position_offset(int offset) { void FFModel::compile_inference() { std::cout << "###PEFT DEBUGGING### Entering compile_inference." << std::endl; - + // Request at least four CPU processors for inference runs assert( config.cpusPerNode >= 4 && @@ -636,14 +639,17 @@ void FFModel::compile_inference() { "`-ll:cpu 4` in the command line if you are using the C++ interface or " "set `num_cpus` in `ff.init` if you are using the Python interface"); - std::cout << "###PEFT DEBUGGING### Configuration check passed: At least four CPU cores per node." << std::endl; + std::cout << "###PEFT DEBUGGING### Configuration check passed: At least four " + "CPU cores per node." + << std::endl; Context ctx = config.lg_ctx; Runtime *runtime = config.lg_hlr; config.computationMode = COMP_MODE_INFERENCE; create_operators_from_layers(); // Launch the graph optimize task - std::cout << "###PEFT DEBUGGING### Launching graph optimization task." << std::endl; + std::cout << "###PEFT DEBUGGING### Launching graph optimization task." 
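// Taken together with the weight paths used in lora_linear.cc, the cache
// folder passed to LoraLinearConfig is expected to be laid out roughly as
// follows (layer and model names are illustrative):
//   <cache_folder>/configs/<peft_model_id>/config.json
//   <cache_folder>/weights/<peft_model_id>/full-precision/  (or half-precision/)
//       layers.0.mlp.down_proj.lora_A.weight, layers.0.mlp.down_proj.lora_B.weight, ...
// where each file name is the matched base layer name plus the ".lora" suffix
// and the "_A"/"_B" adapter matrix tag.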
+ << std::endl; { FFModel *model = this; TaskLauncher launcher(GRAPH_OPTIMIZE_TASK_ID, @@ -695,9 +701,12 @@ void FFModel::compile_inference() { } } - std::cout << "###PEFT DEBUGGING### Operators reconstructed from optimized graph." << std::endl; + std::cout + << "###PEFT DEBUGGING### Operators reconstructed from optimized graph." + << std::endl; // Perform inplace optimizations - std::cout << "###PEFT DEBUGGING### Starting inplace optimizations." << std::endl; + std::cout << "###PEFT DEBUGGING### Starting inplace optimizations." + << std::endl; loss_op = nullptr; metrics_op = nullptr; @@ -765,7 +774,8 @@ void FFModel::compile_inference() { } #ifdef FF_USE_NCCL - std::cout << "###PEFT DEBUGGING### Setting up NCCL communications." << std::endl; + std::cout << "###PEFT DEBUGGING### Setting up NCCL communications." + << std::endl; for (size_t l = 0; l < operators.size(); l++) { // Only create nccl for allreduce and fusedop for inference // (fusedop may include allreduces) @@ -802,7 +812,8 @@ void FFModel::compile_inference() { } } #endif - std::cout << "###PEFT DEBUGGING### compile_inference completed successfully." << std::endl; + std::cout << "###PEFT DEBUGGING### compile_inference completed successfully." + << std::endl; } std::string join_path(std::vector const &paths) { diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 535278a3d9..31742bd826 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -2451,14 +2451,16 @@ void RequestManager::background_serving_task( Context ctx, Runtime *runtime) { - - auto print_timestamped_message = [](const std::string& message) { - auto now = std::chrono::system_clock::to_time_t(std::chrono::system_clock::now()); - std::cout << std::put_time(std::localtime(&now), "%Y-%m-%d %X") << " - " << message << std::endl; + auto print_timestamped_message = [](std::string const &message) { + auto now = + std::chrono::system_clock::to_time_t(std::chrono::system_clock::now()); + std::cout << std::put_time(std::localtime(&now), "%Y-%m-%d %X") << " - " + << message << std::endl; }; // Print at the start of the task - print_timestamped_message("###PEFT DEBUGGING### Starting background serving task."); + print_timestamped_message( + "###PEFT DEBUGGING### Starting background serving task."); RequestManager *rm = RequestManager::get_request_manager(); FFModel *llm = *(FFModel **)task->args; @@ -2478,8 +2480,8 @@ void RequestManager::background_serving_task( } // Checkpoint print - print_timestamped_message("###PEFT DEBUGGING### Updated models' configuration."); - + print_timestamped_message( + "###PEFT DEBUGGING### Updated models' configuration."); if (rm->get_num_ssms() == 0) { // No SSMs: perform incremental decoding @@ -2490,8 +2492,8 @@ void RequestManager::background_serving_task( } // Print at the end of the task - print_timestamped_message("###PEFT DEBUGGING### Background serving task completed."); - + print_timestamped_message( + "###PEFT DEBUGGING### Background serving task completed."); } std::string find_layer_name_from_guid(FFModel *model, LayerID guid) { @@ -2519,7 +2521,8 @@ void RequestManager::serve_incr_decoding(FFModel *llm) { // Check if the model object exists if (llm == nullptr) { - std::cout << "###PEFT DEBUGGING### LLM Model object does not exist." << std::endl; + std::cout << "###PEFT DEBUGGING### LLM Model object does not exist." + << std::endl; return; // Early return to prevent further operations on a nullptr } else { std::cout << "###PEFT DEBUGGING### LLM Model object exists." 
<< std::endl; From ebf8bd95340b224e5306ee33021406add10251ea Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 22 Mar 2024 20:30:24 +0000 Subject: [PATCH 20/32] add test, separate peft script in cpp --- CMakeLists.txt | 1 + inference/incr_decoding/incr_decoding.cc | 41 +-- inference/peft/CMakeLists.txt | 38 +++ inference/peft/Makefile | 37 +++ inference/peft/peft.cc | 325 +++++++++++++++++++++++ tests/peft_test.sh | 28 ++ 6 files changed, 435 insertions(+), 35 deletions(-) create mode 100644 inference/peft/CMakeLists.txt create mode 100644 inference/peft/Makefile create mode 100644 inference/peft/peft.cc create mode 100755 tests/peft_test.sh diff --git a/CMakeLists.txt b/CMakeLists.txt index 43ce4f7044..22770b6c28 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -558,6 +558,7 @@ if(NOT BUILD_LEGION_ONLY) if(FF_BUILD_ALL_INFERENCE_EXAMPLES OR FF_BUILD_ALL_EXAMPLES) add_subdirectory(inference/spec_infer) add_subdirectory(inference/incr_decoding) + add_subdirectory(inference/peft) endif() diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc index e7d4cf16fb..177a4dd156 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -40,7 +40,6 @@ void parse_input_args(char **argv, int argc, FilePaths &paths, std::string &llm_model_name, - std::string &peft_model_name, bool &use_full_precision, bool &verbose, bool &do_sample, @@ -59,17 +58,6 @@ void parse_input_args(char **argv, } continue; } - if (!strcmp(argv[i], "-enable-peft")) { - enable_peft = true; - continue; - } - if (!strcmp(argv[i], "-peft-model")) { - peft_model_name = std::string(argv[++i]); - for (char &c : peft_model_name) { - c = std::tolower(c); - } - continue; - } // cache folder if (!strcmp(argv[i], "-cache-folder")) { paths.cache_folder_path = std::string(argv[++i]); @@ -138,7 +126,7 @@ void FlexFlow::top_level_task(Task const *task, assert(false && "Doesn't support quantization in non-offload mode"); } FilePaths file_paths; - std::string llm_model_name, peft_model_name; + std::string llm_model_name; bool use_full_precision = false; bool verbose = false; bool do_sample = false; @@ -156,7 +144,6 @@ void FlexFlow::top_level_task(Task const *task, argc, file_paths, llm_model_name, - peft_model_name, use_full_precision, verbose, do_sample, @@ -166,6 +153,7 @@ void FlexFlow::top_level_task(Task const *task, max_requests_per_batch, max_tokens_per_batch, max_sequence_length); + assert(ffconfig.data_parallelism_degree * ffconfig.tensor_parallelism_degree * ffconfig.pipeline_parallelism_degree == ffconfig.numNodes * ffconfig.workersPerNode); @@ -280,13 +268,6 @@ void FlexFlow::top_level_task(Task const *task, assert(false && "unknow model type"); } - // Add PEFT layer - PEFTModelID peft_model_id = PEFTModelID::NO_ID; - if (!peft_model_name.empty()) { - peft_model_id = model.add_lora_layer(peft_config); - } - - // Start background server rm->start_background_server(&model); int total_num_requests = 0; @@ -303,20 +284,10 @@ void FlexFlow::top_level_task(Task const *task, for (auto &prompt : prompt_json) { std::string text = prompt.get(); printf("Prompt[%d]: %s\n", total_num_requests, text.c_str()); - // Add inference request - // Request inference_req; - // inference_req.prompt = text; - // inference_req.max_sequence_length = 128; - // inference_req.peft_model_id = peft_model_id; - // requests.push_back(inference_req); - // total_num_requests++; - // Add fine-tuning request - Request fine_tuning_req; - fine_tuning_req.req_type = 
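// With this patch the incr_decoding driver goes back to issuing plain
// inference requests (prompt + max_sequence_length); the fine-tuning flow
// being deleted below, with REQ_FINETUNING, peft_model_id and dataset_text, is
// meant to live in the dedicated inference/peft/peft.cc driver that this
// commit adds.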
Request::RequestType::REQ_FINETUNING; - fine_tuning_req.max_sequence_length = 128; - fine_tuning_req.peft_model_id = peft_model_id; - fine_tuning_req.dataset_text.push_back(std::make_pair(text, "")); - requests.push_back(fine_tuning_req); + Request inference_req; + inference_req.prompt = text; + inference_req.max_sequence_length = 128; + requests.push_back(inference_req); total_num_requests++; } std::vector result = model.generate(requests); diff --git a/inference/peft/CMakeLists.txt b/inference/peft/CMakeLists.txt new file mode 100644 index 0000000000..4547907176 --- /dev/null +++ b/inference/peft/CMakeLists.txt @@ -0,0 +1,38 @@ +cmake_minimum_required(VERSION 3.10) + +project(FlexFlow_Peft) +set(project_target peft) + + +set(CPU_SRC + ${FLEXFLOW_CPP_DRV_SRC} + peft.cc + ../models/llama.cc + ../models/opt.cc + ../models/falcon.cc + ../models/starcoder.cc + ../models/mpt.cc) + +if (FF_GPU_BACKEND STREQUAL "cuda" OR FF_GPU_BACKEND STREQUAL "hip_cuda") + cuda_add_executable(${project_target} ${CPU_SRC}) + if (FF_GPU_BACKEND STREQUAL "hip_cuda") + target_compile_definitions(${project_target} PRIVATE __HIP_PLATFORM_NVIDIA__) + endif() +elseif(FF_GPU_BACKEND STREQUAL "hip_rocm") + set_source_files_properties(${CPU_SRC} PROPERTIES LANGUAGE HIP) + hip_add_executable(${project_target} ${CPU_SRC}) + if (FF_HIP_ARCH STREQUAL "") + message(FATAL_ERROR "FF_HIP_ARCH is empty!") + endif() + set_property(TARGET ${project_target} PROPERTY HIP_ARCHITECTURES "${FF_HIP_ARCH}") + target_compile_definitions(${project_target} PRIVATE __HIP_PLATFORM_AMD__) +else() + message(FATAL_ERROR "Compilation of ${project_target} for ${FF_GPU_BACKEND} backend not yet supported") +endif() + +target_include_directories(${project_target} PRIVATE ${FLEXFLOW_INCLUDE_DIRS} ${CMAKE_INSTALL_INCLUDEDIR}) +target_include_directories(${project_target} PRIVATE ${CMAKE_SOURCE_DIR}/inference) +target_link_libraries(${project_target} -Wl,--whole-archive flexflow -Wl,--no-whole-archive ${FLEXFLOW_EXT_LIBRARIES}) + +set(BIN_DEST "bin") +install(TARGETS ${project_target} DESTINATION ${BIN_DEST}) diff --git a/inference/peft/Makefile b/inference/peft/Makefile new file mode 100644 index 0000000000..0e4b79f51f --- /dev/null +++ b/inference/peft/Makefile @@ -0,0 +1,37 @@ +# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +# Flags for directing the runtime makefile what to include +DEBUG ?= 0 # Include debugging symbols +MAX_DIM ?= 4 # Maximum number of dimensions +OUTPUT_LEVEL ?= LEVEL_DEBUG # Compile time logging level +USE_CUDA ?= 1 # Include CUDA support (requires CUDA) +USE_GASNET ?= 0 # Include GASNet support (requires GASNet) +USE_HDF ?= 1 # Include HDF5 support (requires HDF5) +ALT_MAPPERS ?= 0 # Include alternative mappers (not recommended) + +# Put the binary file name here +OUTFILE ?= llama_pipeline +# List all the application source files here +ifndef CUDA_HOME +CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc | head -1)) +endif + + +ifndef FF_HOME +$(error FF_HOME variable is not defined, aborting build) +endif + +include $(FF_HOME)/FlexFlow.mk diff --git a/inference/peft/peft.cc b/inference/peft/peft.cc new file mode 100644 index 0000000000..d376c3e39c --- /dev/null +++ b/inference/peft/peft.cc @@ -0,0 +1,325 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "flexflow/inference.h" +#include "flexflow/request_manager.h" +#include "models/falcon.h" +#include "models/llama.h" +#include "models/mpt.h" +#include "models/opt.h" +#include "models/starcoder.h" +#include + +#include + +using namespace FlexFlow; +using namespace Legion; +using json = nlohmann::json; + +LegionRuntime::Logger::Category log_app("llama"); + +struct FilePaths { + std::string cache_folder_path; + std::string prompt_file_path; + std::string output_file_path; +}; + +void parse_input_args(char **argv, + int argc, + FilePaths &paths, + std::string &llm_model_name, + std::string &peft_model_name, + bool &use_full_precision, + bool &verbose, + bool &do_sample, + float &temperature, + float &topp, + int &max_requests_per_batch, + int &max_tokens_per_batch, + int &max_sequence_length) { + for (int i = 1; i < argc; i++) { + // llm model type + if (!strcmp(argv[i], "-llm-model")) { + llm_model_name = std::string(argv[++i]); + for (char &c : llm_model_name) { + c = std::tolower(c); + } + continue; + } + if (!strcmp(argv[i], "-peft-model")) { + peft_model_name = std::string(argv[++i]); + for (char &c : peft_model_name) { + c = std::tolower(c); + } + continue; + } + // cache folder + if (!strcmp(argv[i], "-cache-folder")) { + paths.cache_folder_path = std::string(argv[++i]); + continue; + } + // prompts + if (!strcmp(argv[i], "-prompt")) { + paths.prompt_file_path = std::string(argv[++i]); + continue; + } + // output file + if (!strcmp(argv[i], "-output-file")) { + paths.output_file_path = std::string(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--use-full-precision")) { + use_full_precision = true; + continue; + } + // verbose logging to stdout + if (!strcmp(argv[i], "--verbose")) { + verbose = true; + continue; + } + if (!strcmp(argv[i], "--do-sample")) { + do_sample = true; + continue; + } + if (!strcmp(argv[i], "--temperature")) { + temperature = std::stof(argv[++i]); + continue; + } + if (!strcmp(argv[i], 
"--topp")) { + topp = std::stof(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-requests-per-batch")) { + max_requests_per_batch = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-tokens-per-batch")) { + max_tokens_per_batch = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-sequence-length")) { + max_sequence_length = std::stoi(argv[++i]); + continue; + } + } + if (paths.cache_folder_path.empty()) { + paths.cache_folder_path = "~/.cache/flexflow"; + } + // Expand ~ to the home directory if needed + wordexp_t p; + wordexp(paths.cache_folder_path.c_str(), &p, 0); + paths.cache_folder_path = p.we_wordv[0]; + wordfree(&p); +} + +void FlexFlow::top_level_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + FFConfig ffconfig; + if (ffconfig.cpu_offload == false && ffconfig.quantization_type != DT_NONE) { + assert(false && "Doesn't support quantization in non-offload mode"); + } + FilePaths file_paths; + std::string llm_model_name, peft_model_name; + bool use_full_precision = false; + bool verbose = false; + bool do_sample = false; + float temperature = 0.0f; + float topp = 0.0f; + int max_requests_per_batch = 8; + int max_tokens_per_batch = 128; + int max_sequence_length = 256; + + InputArgs const &command_args = HighLevelRuntime::get_input_args(); + char **argv = command_args.argv; + int argc = command_args.argc; + parse_input_args(argv, + argc, + file_paths, + llm_model_name, + peft_model_name, + use_full_precision, + verbose, + do_sample, + temperature, + topp, + max_requests_per_batch, + max_tokens_per_batch, + max_sequence_length); + assert(ffconfig.data_parallelism_degree * ffconfig.tensor_parallelism_degree * + ffconfig.pipeline_parallelism_degree == + ffconfig.numNodes * ffconfig.workersPerNode); + + std::string config_filepath = join_path( + {file_paths.cache_folder_path, "configs", llm_model_name, "config.json"}); + std::string tokenizer_filepath = + join_path({file_paths.cache_folder_path, "tokenizers", llm_model_name}); + std::string weights_filepath = + join_path({file_paths.cache_folder_path, + "weights", + llm_model_name, + use_full_precision ? "full-precision" : "half-precision"}); + std::ifstream config_file_handle(config_filepath); + if (!config_file_handle.good()) { + std::cout << "Model config file " << config_filepath << " not found." + << std::endl; + assert(false); + } + json model_config = json::parse(config_file_handle, + /*parser_callback_t */ nullptr, + /*allow_exceptions */ true, + /*ignore_comments */ true); + ModelType model_type = ModelType::UNKNOWN; + auto architectures = model_config["architectures"]; + for (auto const &str : architectures) { + if (str == "LlamaForCausalLM" || str == "LLaMAForCausalLM") { + model_type = ModelType::LLAMA; + break; + } else if (str == "OPTForCausalLM") { + model_type = ModelType::OPT; + break; + } else if (str == "RWForCausalLM" || str == "FalconForCausalLM") { + model_type = ModelType::FALCON; + break; + } else if (str == "GPTBigCodeForCausalLM") { + model_type = ModelType::STARCODER; + break; + } else if (str == "MPTForCausalLM") { + model_type = ModelType::MPT; + break; + } + } + int bos_token_id = model_config.find("bos_token_id") == model_config.end() + ? -1 + : (int)model_config.at("bos_token_id"); + int eos_token_id = model_config.find("eos_token_id") == model_config.end() + ? 
-1 + : (int)model_config.at("eos_token_id"); + + assert(model_type != ModelType::UNKNOWN && + "Invalid LLM model type passed (or no type was passed)."); + + GenerationConfig generationConfig(do_sample, temperature, topp); + RequestManager *rm = RequestManager::get_request_manager(); + rm->set_max_requests_per_batch(max_requests_per_batch); + rm->set_max_tokens_per_batch(max_tokens_per_batch); + rm->set_max_sequence_length(max_sequence_length); + rm->register_tokenizer( + model_type, bos_token_id, eos_token_id, tokenizer_filepath); + rm->register_output_filepath(file_paths.output_file_path); + + FFModel model(ffconfig, ffconfig.cpu_offload); + if (model_type == ModelType::LLAMA) { + LLAMA::create_llama_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + generationConfig, + use_full_precision); + } else if (model_type == ModelType::OPT) { + OPT::create_opt_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + use_full_precision); + } else if (model_type == ModelType::FALCON) { + FALCON::create_falcon_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + use_full_precision); + } else if (model_type == ModelType::STARCODER) { + STARCODER::create_starcoder_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + generationConfig, + use_full_precision); + } else if (model_type == ModelType::MPT) { + MPT::create_mpt_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + generationConfig, + use_full_precision); + } else { + assert(false && "unknow model type"); + } + + // Register PEFT layer + LoraLinearConfig mlp_second = + peft_model_name.empty() + ? LoraLinearConfig::DefaultConfig + : LoraLinearConfig(file_paths.cache_folder_path, peft_model_name); + PEFTModelID peft_model_id = + peft_model_name.empty() + ? 
PEFTModelID::NO_ID + : model.register_peft_model( + LoraLinearConfig::DefaultConfig /*mlp_first*/, + mlp_second /*mlp_second*/); + + // Start background server + rm->start_background_server(&model); + + int total_num_requests = 0; + { + using json = nlohmann::json; + std::ifstream file_handle(file_paths.prompt_file_path); + assert(file_handle.good() && "Prompt file does not exist."); + json prompt_json = json::parse(file_handle, + /*parser_callback_t */ nullptr, + /*allow_exceptions */ true, + /*ignore_comments */ true); + + std::vector requests; + for (auto &prompt : prompt_json) { + std::string text = prompt.get(); + printf("Prompt[%d]: %s\n", total_num_requests, text.c_str()); + // Add inference request + // Request inference_req; + // inference_req.prompt = text; + // inference_req.max_sequence_length = 128; + // inference_req.peft_model_id = peft_model_id; + // requests.push_back(inference_req); + // total_num_requests++; + // Add fine-tuning request + Request fine_tuning_req; + fine_tuning_req.req_type = Request::RequestType::REQ_FINETUNING; + fine_tuning_req.max_sequence_length = 128; + fine_tuning_req.peft_model_id = peft_model_id; + fine_tuning_req.dataset_text.push_back(std::make_pair(text, "")); + requests.push_back(fine_tuning_req); + total_num_requests++; + } + std::vector result = model.generate(requests); + } + + // terminate the request manager by stopping the background thread + rm->terminate_background_server(); + + // Execution fence + { + Future future = runtime->issue_execution_fence(ctx); + future.get_void_result(); + } + + // float* data + std::cout << "----------inference finished--------------" << std::endl; + + // free tokenizer space in memory +} + +void FlexFlow::register_custom_tasks() {} diff --git a/tests/peft_test.sh b/tests/peft_test.sh new file mode 100755 index 0000000000..8f6d53725b --- /dev/null +++ b/tests/peft_test.sh @@ -0,0 +1,28 @@ +#! /usr/bin/env bash +set -x +set -e + +# Cd into directory holding this script +cd "${BASH_SOURCE[0]%/*}" + +# Token to access private huggingface models (e.g. 
LLAMA-2) +HUGGINGFACE_TOKEN=${HUGGINGFACE_TOKEN:-none} +if [[ "$HUGGINGFACE_TOKEN" != "none" ]]; then + huggingface-cli login --token "$HUGGINGFACE_TOKEN" +fi + +# Create test prompt file +mkdir -p ../inference/prompt +echo '["Two things are infinite: "]' > ../inference/prompt/peft.json + +# Create output folder +mkdir -p ../inference/output + +# Enable backtrace in case we run into a segfault or assertion failure +export LEGION_BACKTRACE=1 + +# Download test model +python ../inference/utils/download_peft_model.py goliaro/llama-160m-lora-full --base_model_name JackFram/llama-160m +# if first time, add: --refresh-cache + +./inference/peft/peft -ll:gpu 1 -ll:cpu 4 -ll:fsize 8192 -ll:zsize 12000 -ll:util 4 -llm-model JackFram/llama-160m -prompt ../inference/prompt/peft.json -peft-model goliaro/llama-160m-lora-full --use-full-precision --inference-debugging --fusion -enable-peft From acef0067ade7d50e144cceabcd17caf534133622 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 22 Mar 2024 20:36:19 +0000 Subject: [PATCH 21/32] fix --- inference/incr_decoding/incr_decoding.cc | 17 ----------- inference/peft/peft.cc | 37 +++++++++++++++++------- tests/peft_test.sh | 2 +- 3 files changed, 27 insertions(+), 29 deletions(-) diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc index 177a4dd156..c3993b1ad4 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -43,7 +43,6 @@ void parse_input_args(char **argv, bool &use_full_precision, bool &verbose, bool &do_sample, - bool &enable_peft, float &temperature, float &topp, int &max_requests_per_batch, @@ -130,7 +129,6 @@ void FlexFlow::top_level_task(Task const *task, bool use_full_precision = false; bool verbose = false; bool do_sample = false; - bool enable_peft = false; float temperature = 0.0f; float topp = 0.0f; int max_requests_per_batch = 8; @@ -147,7 +145,6 @@ void FlexFlow::top_level_task(Task const *task, use_full_precision, verbose, do_sample, - enable_peft, temperature, topp, max_requests_per_batch, @@ -173,14 +170,6 @@ void FlexFlow::top_level_task(Task const *task, << std::endl; assert(false); } - if (enable_peft && peft_model_name.empty()) { - std::cout << "PEFT enabled, but no PEFT model id passed" << std::endl; - assert(false); - } else if (!enable_peft && !peft_model_name.empty()) { - std::cout << "PEFT model id passed, but PEFT is not enabled" << std::endl; - assert(false); - } - json model_config = json::parse(config_file_handle, /*parser_callback_t */ nullptr, /*allow_exceptions */ true, @@ -215,12 +204,6 @@ void FlexFlow::top_level_task(Task const *task, assert(model_type != ModelType::UNKNOWN && "Invalid LLM model type passed (or no type was passed)."); - // load PEFT config - LoraLinearConfig peft_config = - peft_model_name.empty() - ? 
LoraLinearConfig::EmptyConfig - : LoraLinearConfig(file_paths.cache_folder_path, peft_model_name); - GenerationConfig generationConfig(do_sample, temperature, topp); RequestManager *rm = RequestManager::get_request_manager(); rm->set_max_requests_per_batch(max_requests_per_batch); diff --git a/inference/peft/peft.cc b/inference/peft/peft.cc index d376c3e39c..e7d4cf16fb 100644 --- a/inference/peft/peft.cc +++ b/inference/peft/peft.cc @@ -44,6 +44,7 @@ void parse_input_args(char **argv, bool &use_full_precision, bool &verbose, bool &do_sample, + bool &enable_peft, float &temperature, float &topp, int &max_requests_per_batch, @@ -58,6 +59,10 @@ void parse_input_args(char **argv, } continue; } + if (!strcmp(argv[i], "-enable-peft")) { + enable_peft = true; + continue; + } if (!strcmp(argv[i], "-peft-model")) { peft_model_name = std::string(argv[++i]); for (char &c : peft_model_name) { @@ -137,6 +142,7 @@ void FlexFlow::top_level_task(Task const *task, bool use_full_precision = false; bool verbose = false; bool do_sample = false; + bool enable_peft = false; float temperature = 0.0f; float topp = 0.0f; int max_requests_per_batch = 8; @@ -154,6 +160,7 @@ void FlexFlow::top_level_task(Task const *task, use_full_precision, verbose, do_sample, + enable_peft, temperature, topp, max_requests_per_batch, @@ -178,6 +185,14 @@ void FlexFlow::top_level_task(Task const *task, << std::endl; assert(false); } + if (enable_peft && peft_model_name.empty()) { + std::cout << "PEFT enabled, but no PEFT model id passed" << std::endl; + assert(false); + } else if (!enable_peft && !peft_model_name.empty()) { + std::cout << "PEFT model id passed, but PEFT is not enabled" << std::endl; + assert(false); + } + json model_config = json::parse(config_file_handle, /*parser_callback_t */ nullptr, /*allow_exceptions */ true, @@ -212,6 +227,12 @@ void FlexFlow::top_level_task(Task const *task, assert(model_type != ModelType::UNKNOWN && "Invalid LLM model type passed (or no type was passed)."); + // load PEFT config + LoraLinearConfig peft_config = + peft_model_name.empty() + ? LoraLinearConfig::EmptyConfig + : LoraLinearConfig(file_paths.cache_folder_path, peft_model_name); + GenerationConfig generationConfig(do_sample, temperature, topp); RequestManager *rm = RequestManager::get_request_manager(); rm->set_max_requests_per_batch(max_requests_per_batch); @@ -259,17 +280,11 @@ void FlexFlow::top_level_task(Task const *task, assert(false && "unknow model type"); } - // Register PEFT layer - LoraLinearConfig mlp_second = - peft_model_name.empty() - ? LoraLinearConfig::DefaultConfig - : LoraLinearConfig(file_paths.cache_folder_path, peft_model_name); - PEFTModelID peft_model_id = - peft_model_name.empty() - ? 
PEFTModelID::NO_ID - : model.register_peft_model( - LoraLinearConfig::DefaultConfig /*mlp_first*/, - mlp_second /*mlp_second*/); + // Add PEFT layer + PEFTModelID peft_model_id = PEFTModelID::NO_ID; + if (!peft_model_name.empty()) { + peft_model_id = model.add_lora_layer(peft_config); + } // Start background server rm->start_background_server(&model); diff --git a/tests/peft_test.sh b/tests/peft_test.sh index 8f6d53725b..778b225a26 100755 --- a/tests/peft_test.sh +++ b/tests/peft_test.sh @@ -25,4 +25,4 @@ export LEGION_BACKTRACE=1 python ../inference/utils/download_peft_model.py goliaro/llama-160m-lora-full --base_model_name JackFram/llama-160m # if first time, add: --refresh-cache -./inference/peft/peft -ll:gpu 1 -ll:cpu 4 -ll:fsize 8192 -ll:zsize 12000 -ll:util 4 -llm-model JackFram/llama-160m -prompt ../inference/prompt/peft.json -peft-model goliaro/llama-160m-lora-full --use-full-precision --inference-debugging --fusion -enable-peft +../build/inference/peft/peft -ll:gpu 1 -ll:cpu 4 -ll:fsize 8192 -ll:zsize 12000 -ll:util 4 -llm-model JackFram/llama-160m -prompt ../inference/prompt/peft.json -peft-model goliaro/llama-160m-lora-full --use-full-precision --inference-debugging --fusion -enable-peft From 660bf732f068d1427ceab666a8b2b7fada399d20 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 22 Mar 2024 22:24:35 +0000 Subject: [PATCH 22/32] fixes --- src/ops/inc_multihead_self_attention.cu | 3 ++- src/runtime/request_manager.cc | 7 ++++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index 83fdbaf927..8b0776fde4 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -1488,7 +1488,8 @@ void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta *m, assert(m->qProjSize == m->kProjSize); for (int i = 0; i < bc->max_requests_per_batch(); i++) { - if (bc->request_completed[i] || (!bc->requestsInfo[i].prompt_phase)) { + if (bc->request_completed[i] || + (!bc->requestsInfo[i].prompt_phase && !bc->requestsInfo[i].peft_bwd)) { continue; } int num_new_tokens = bc->requestsInfo[i].num_tokens_in_batch; diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 31742bd826..03157bcbbe 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -2410,7 +2410,12 @@ std::vector RequestManager *rm = RequestManager::get_request_manager(); std::vector guids; for (int i = 0; i < requests.size(); i++) { - RequestManager::RequestGuid guid = rm->register_new_request(requests.at(i)); + RequestManager::RequestGuid guid; + if (requests.at(i).req_type == Request::REQ_INFERENCE) { + guid = rm->register_new_request(requests.at(i)); + } else { + guid = rm->register_new_peft_request(requests.at(i)); + } if (guid != RequestManager::INVALID_GUID) { guids.push_back(guid); } From 02985cef3af5ba5e55ee2f02e859b70c71f1569e Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sat, 23 Mar 2024 02:51:53 +0000 Subject: [PATCH 23/32] fix --- src/runtime/request_manager.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 03157bcbbe..c335cd246b 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -412,6 +412,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, if (request.completed_training_steps == request.max_training_steps) { // check if the fine tuning request has completed request.status = 
Request::COMPLETED; + trigger_request_completion_future(request.guid); log_req_mgr.print("[Done] guid(%zu) completed_training_steps(%d)", old_bc.requestsInfo[i].request_guid, request.completed_training_steps); From 084732e9e4390d251fca54273d8e6c4f4c52684f Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sat, 23 Mar 2024 23:26:37 +0000 Subject: [PATCH 24/32] update peft python interface --- include/flexflow/flexflow_c.h | 22 ++ include/flexflow/model.h | 2 +- inference/peft/peft.cc | 9 +- inference/utils/download_peft_model.py | 33 +- python/flexflow/core/flexflow_cffi.py | 34 ++ python/flexflow/serve/serve.py | 484 +++++++------------------ src/c/flexflow_c.cc | 64 ++++ src/ops/lora_linear.cc | 12 +- 8 files changed, 284 insertions(+), 376 deletions(-) diff --git a/include/flexflow/flexflow_c.h b/include/flexflow/flexflow_c.h index b7b20f2d2f..1ceea59839 100644 --- a/include/flexflow/flexflow_c.h +++ b/include/flexflow/flexflow_c.h @@ -55,6 +55,8 @@ FF_NEW_OPAQUE_TYPE(flexflow_inference_manager_t); FF_NEW_OPAQUE_TYPE(flexflow_request_manager_t); FF_NEW_OPAQUE_TYPE(flexflow_file_data_loader_t); FF_NEW_OPAQUE_TYPE(flexflow_generation_result_t); +FF_NEW_OPAQUE_TYPE(flexflow_lora_linear_config_t); +FF_NEW_OPAQUE_TYPE(flexflow_peft_model_id_t); // ----------------------------------------------------------------------- // FFConfig @@ -1036,6 +1038,26 @@ void flexflow_file_data_loader_destroy(flexflow_file_data_loader_t handle_); void flexflow_file_data_loader_load_weights(flexflow_file_data_loader_t handle_, flexflow_model_t model_handle_); +// ----------------------------------------------------------------------- +// LoraLinearConfig +// ----------------------------------------------------------------------- + +flexflow_lora_linear_config_t + flexflow_lora_linear_config_create(char const *cache_folder_, + char const *peft_model_id_); + +void flexflow_lora_linear_config_destroy(flexflow_lora_linear_config_t handle_); + +// ----------------------------------------------------------------------- +// PEFTModelID +// ----------------------------------------------------------------------- + +flexflow_peft_model_id_t flexflow_peft_model_id_create(); + +flexflow_peft_model_id_t flexflow_peft_model_id_create_id(unsigned long id); + +void flexflow_peft_model_id_destroy(flexflow_peft_model_id_t handle_); + #ifdef __cplusplus } #endif diff --git a/include/flexflow/model.h b/include/flexflow/model.h index 74421ffc92..099e2209e4 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -837,7 +837,7 @@ class FFModel { // ======================================== // PEFT Layers // ======================================== - PEFTModelID add_lora_layer(LoraLinearConfig const peft_config); + PEFTModelID *add_lora_layer(LoraLinearConfig const peft_config); // ======================================== // Inference APIs // ======================================== diff --git a/inference/peft/peft.cc b/inference/peft/peft.cc index e7d4cf16fb..aa5581ca87 100644 --- a/inference/peft/peft.cc +++ b/inference/peft/peft.cc @@ -281,7 +281,7 @@ void FlexFlow::top_level_task(Task const *task, } // Add PEFT layer - PEFTModelID peft_model_id = PEFTModelID::NO_ID; + PEFTModelID *peft_model_id = nullptr; if (!peft_model_name.empty()) { peft_model_id = model.add_lora_layer(peft_config); } @@ -314,7 +314,8 @@ void FlexFlow::top_level_task(Task const *task, Request fine_tuning_req; fine_tuning_req.req_type = Request::RequestType::REQ_FINETUNING; fine_tuning_req.max_sequence_length = 128; - 
fine_tuning_req.peft_model_id = peft_model_id; + fine_tuning_req.peft_model_id = + (peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID; fine_tuning_req.dataset_text.push_back(std::make_pair(text, "")); requests.push_back(fine_tuning_req); total_num_requests++; @@ -331,6 +332,10 @@ void FlexFlow::top_level_task(Task const *task, future.get_void_result(); } + if (peft_model_id != nullptr) { + free(peft_model_id); + } + // float* data std::cout << "----------inference finished--------------" << std::endl; diff --git a/inference/utils/download_peft_model.py b/inference/utils/download_peft_model.py index bc2ba59b30..ad79816f84 100644 --- a/inference/utils/download_peft_model.py +++ b/inference/utils/download_peft_model.py @@ -9,7 +9,7 @@ def parse_args(): "--base_model_name", type=str, help="Name of the model to download" ) parser.add_argument( - "peft_model_ids", type=str, nargs="+", help="Name of the model(s) to download" + "peft_model_ids", type=str, nargs="+", help="Name of the PEFT model(s) to download" ) parser.add_argument( "--cache-folder", @@ -45,24 +45,19 @@ def main(args): else: data_types = (ff.DataType.DT_FLOAT, ff.DataType.DT_HALF) - for peft_model_id in args.peft_model_ids: - for data_type in data_types: - llm = ff.LLM( - args.base_model_name, - data_type=data_type, - cache_path=args.cache_folder, - refresh_cache=args.refresh_cache, - ) - peft = ff.PEFT( - llm, - peft_model_id, - data_type=data_type, - cache_path=args.cache_folder, - refresh_cache=args.refresh_cache, - ) - peft.download_hf_weights_if_needed() - peft.download_hf_config() - peft.download_hf_tokenizer_if_needed() + + for data_type in data_types: + llm = ff.LLM( + args.base_model_name, + data_type=data_type, + cache_path=args.cache_folder, + refresh_cache=args.refresh_cache, + ) + for peft_model_id in args.peft_model_ids: + llm.add_peft(peft_model_id) + llm.download_hf_weights_if_needed() + llm.download_hf_config() + llm.download_hf_tokenizer_if_needed() if __name__ == "__main__": diff --git a/python/flexflow/core/flexflow_cffi.py b/python/flexflow/core/flexflow_cffi.py index b92a0a92af..ef0ee0e378 100644 --- a/python/flexflow/core/flexflow_cffi.py +++ b/python/flexflow/core/flexflow_cffi.py @@ -4287,3 +4287,37 @@ def load_weights(self, model): ffc().flexflow_file_data_loader_load_weights( self.handle, model.handle ) + +# ----------------------------------------------------------------------- +# LoraLinearConfig +# ----------------------------------------------------------------------- + +class LoraLinearConfig(object): + __slots__ = ["handle", "_handle"] + + def __init__( + self, + cache_folder, + peft_model_id, + ): + c_cache_folder = get_c_name(cache_folder) + peft_model_id = get_c_name(peft_model_id) + self.handle = ffc().flexflow_lora_linear_config_create( + c_cache_folder, + peft_model_id, + ) + self._handle = ffi.gc(self.handle, ffc().flexflow_lora_linear_config_destroy) + +# ----------------------------------------------------------------------- +# PEFTModelID +# ----------------------------------------------------------------------- + +class PEFTModelID(object): + __slots__ = ["handle", "_handle"] + + def __init__(self, id=None): + if id is None: + self.handle = ffc().flexflow_peft_model_id_create() + else: + self.handle = ffc().flexflow_peft_model_id_create_id(id) + self._handle = ffi.gc(self.handle, ffc().flexflow_peft_model_id_destroy) diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py index 9997527f0d..bc7a796315 100644 --- a/python/flexflow/serve/serve.py +++ 
b/python/flexflow/serve/serve.py @@ -137,30 +137,61 @@ def __init__( self.refresh_cache = refresh_cache self.output_file = output_file self.rm = None + self.pefts = [] def __del__(self): # Stop the background server before deleting the object if type(self) == LLM and self.rm is not None: self.rm.stop_server() + def add_peft(self, peft_model_id: str): + """Add a previously created PEFT adapter to the LLM. The PEFT model should already exist locally or be available on HuggingFace""" + peft_config = PeftConfig.from_pretrained(peft_model_id) + peft_type = peft_config.peft_type + if peft_type != "LORA": + raise RuntimeError(f"PEFT type {peft_type} not yet supported in FlexFlow") + if "base_model_name_or_path" not in peft_config.to_dict(): + raise ValueError( + f"PEFT model {peft_model_id} does not have an associated base model" + ) + if peft_config.base_model_name_or_path != self.model_name: + raise RuntimeError(f"Attempting to add PEFT with base model name {peft_config.base_model_name_or_path} to LLM {self.model_name}") + ff_peft_config = LoraLinearConfig(self.cache_path, peft_model_id) + peft_dict = { + "peft_config": peft_config, + "peft_type": peft_type, + "ff_peft_config": ff_peft_config, + } + self.pefts[peft_model_id] = peft_dict + def download_hf_config(self): """Save the HuggingFace model configs to a json file. Useful mainly to run the C++ inference code.""" - self.config_dir = os.path.join( + config_dir = os.path.join( os.path.expanduser(self.cache_path), "configs", self.model_name.lower() ) - self.config_path = os.path.join(self.config_dir, "config.json") - os.makedirs(self.config_dir, exist_ok=True) - print(f"Creating directory {self.config_dir} (if it doesn't exist)...") - print(f"Saving {self.model_name} configs to file {self.config_path}...") - self.hf_config.to_json_file(self.config_path) - - def __get_revision_hashes(self, model_name: str, weights: bool): + config_path = os.path.join(config_dir, "config.json") + os.makedirs(config_dir, exist_ok=True) + print(f"Creating directory {config_dir} (if it doesn't exist)...") + print(f"Saving {self.model_name} configs to file {config_path}...") + self.hf_config.to_json_file(config_path) + + # Save PEFT configs if the LLM has any registered PEFTs + for peft_model_id, peft_dict in self.pefts.items(): + peft_config = peft_dict["hf_config"] + peft_config_path = os.path.join(os.path.expanduser(self.cache_path), "configs", self.peft_model_id.lower()) + print(f"Saving {peft_model_id} configs to file {peft_config_path}...") + with open(peft_config_path, "w") as json_file: + class SetEncoder(json.JSONEncoder): + def default(self, obj): + if isinstance(obj, set): + return list(obj) + return super().default(obj) + json.dump(peft_config.to_dict(), json_file, indent=2, cls=SetEncoder) + + def __get_revision_hashes(self, model_name: str, folder: str): ff_revision = None - ff_revision_file = ( - os.path.join(self.weights_path, "rev_sha.txt") - if weights - else os.path.join(self.tokenizer_path, "rev_sha.txt") - ) + ff_revision_file = os.path.join(folder, "rev_sha.txt") + if os.path.exists(ff_revision_file): ff_revision = "".join(open(ff_revision_file).read().split()) @@ -180,46 +211,31 @@ def __get_revision_hashes(self, model_name: str, weights: bool): def download_hf_weights_if_needed(self): """Check in the folder specified by the cache_path whether the LLM's model weights are available and up to date. If not, or if the refresh_cache parameter is set to True, download new weights. 
+ + If any PEFT adapter is registered, perform the same operation for PEFT. """ - # Use local cache, or download new version - self.weights_path = os.path.join( - os.path.expanduser(self.cache_path), - "weights", - self.model_name.lower(), - ( - "full-precision" - if self.data_type == DataType.DT_FLOAT - else "half-precision" - ), - ) - if self.refresh_cache: - print( - f"Refreshing weights in cache for model {self.model_name} at path {self.weights_path} ..." + def get_weights_path(model_name): + return os.path.join(os.path.expanduser(self.cache_path), "weights", model_name.lower(), + ( + "full-precision" + if self.data_type == DataType.DT_FLOAT + else "half-precision" + ), ) - if os.path.exists(self.weights_path): - shutil.rmtree(self.weights_path) - os.makedirs(self.weights_path, exist_ok=True) - #print(f"Creating directory {self.weights_path} (if it doesn't exist)...") - - ff_revision, ff_revision_file, latest_revision = self.__get_revision_hashes( - self.model_name, weights=True - ) - # Download if needed - if ff_revision != latest_revision: - if not os.path.exists(self.model_name) or os.path.isdir(self.model_name): - # Local model + def refresh_cache_if_needed(model_name): + weights_path = get_weights_path(model_name) + if self.refresh_cache: print( - f"'{self.model_name}' model weights not found in cache or outdated. Downloading from huggingface.co ..." + f"Refreshing weights in cache for model {model_name} at path {weights_path} ..." ) - else: - # Remote model - print( - f"'{self.model_name}' local model weights were updated! Converting new weights now..." - ) - # Download model from HuggingFace, or load it from the local folder - hf_model = AutoModelForCausalLM.from_pretrained( - self.model_name, + if os.path.exists(weights_path): + shutil.rmtree(weights_path) + os.makedirs(weights_path, exist_ok=True) + + def get_hf_llm(model_name): + return AutoModelForCausalLM.from_pretrained( + model_name, trust_remote_code=True, torch_dtype=( torch.float32 @@ -227,21 +243,61 @@ def download_hf_weights_if_needed(self): else torch.float16 ), ) - # Print log message to notify user download of model has finished - if not os.path.exists(self.model_name) or os.path.isdir(self.model_name): - print("Done downloading HF weights. Converting them now...") - # Convert the model to FlexFlow format - self.model_class.convert_hf_model(hf_model, self.weights_path) - # Save new revision hash to file - with open(ff_revision_file, "w+") as f: - f.write(latest_revision) - print("Done converting the weights...") - # Deallocate hf model - del hf_model - gc.collect() - torch.cuda.empty_cache() - else: - print(f"Loading '{self.model_name}' model weights from the cache...") + + def download_llm_weights(): + weights_path = get_weights_path(self.model_name) + refresh_cache_if_needed(self.model_name) + ff_revision, ff_revision_file, latest_revision = self.__get_revision_hashes(self.model_name, weights_path) + if ff_revision != latest_revision: + print(f"'{self.model_name}' local model weights need updating! 
Downloading/converting new weights now...") + hf_model = get_hf_llm(self.model_name) + # Convert the model to FlexFlow format + self.model_class.convert_hf_model(hf_model, weights_path) + # Save new revision hash to file + with open(ff_revision_file, "w+") as f: + f.write(latest_revision) + print(f"Done converting the weights for model {self.model_name}") + # Deallocate hf model + del hf_model + gc.collect() + torch.cuda.empty_cache() + + def convert_peft_model(hf_peft_model, peft_type, weights_path): + for name, params in hf_peft_model.named_parameters(): + if peft_type.lower() in name: + name = name.replace("base_model.model.model.", "").replace( + ".default", "" + ) + name = self.model_class.convert_hf_weight_name(name) + params.detach().cpu().numpy().tofile(f"{weights_path}/{name}") + + def download_peft_weights(): + for peft_model_id, peft_dict in self.pefts.items(): + peft_config = peft_dict["peft_config"] + peft_type = peft_config["peft_type"] + + weights_path = get_weights_path(peft_model_id) + refresh_cache_if_needed(peft_model_id) + ff_revision, ff_revision_file, latest_revision = self.__get_revision_hashes(peft_model_id, weights_path) + + if ff_revision != latest_revision: + print(f"'{peft_model_id}' local model weights need updating! Downloading/converting new weights now...") + hf_model = get_hf_llm(peft_model_id) + hf_peft_model = PeftModel.from_pretrained(hf_model, peft_model_id, config=peft_config) + # Convert the model to FlexFlow format + convert_peft_model(hf_peft_model, peft_type, weights_path) + # Save new revision hash to file + with open(ff_revision_file, "w+") as f: + f.write(latest_revision) + print(f"Done converting the weights for model {peft_model_id}") + # Deallocate hf model + del hf_peft_model + del hf_model + gc.collect() + torch.cuda.empty_cache() + + download_llm_weights() + download_peft_weights() def download_hf_tokenizer_if_needed(self): """Check in the folder specified by the cache_path whether the LLM's tokenizer files are available and up to date. @@ -250,37 +306,24 @@ def download_hf_tokenizer_if_needed(self): print("Loading tokenizer...") # Use local cache, or download new version - self.tokenizer_path = os.path.join( + tokenizer_path = os.path.join( os.path.expanduser(self.cache_path), "tokenizers", self.model_name.lower(), ) if self.refresh_cache: - print( - f"Discarding cached tokenizer files (if they exist) for model {self.model_name}..." - ) - if os.path.exists(self.tokenizer_path): - shutil.rmtree(self.tokenizer_path) - if not os.path.exists(self.tokenizer_path): - print(f"Creating directory {self.tokenizer_path} (if it doesn't exist)...") - os.makedirs(self.tokenizer_path, exist_ok=True) + print(f"Refreshing cached tokenizer for model {self.model_name} at path {tokenizer_path} ...") + if os.path.exists(tokenizer_path): + shutil.rmtree(tokenizer_path) + if not os.path.exists(tokenizer_path): + print(f"Creating directory {tokenizer_path} (if it doesn't exist)...") + os.makedirs(tokenizer_path, exist_ok=True) # Get local revision SHA, check if it matches latest one on huggingface - ff_revision, ff_revision_file, latest_revision = self.__get_revision_hashes( - self.model_name, weights=False - ) + ff_revision, ff_revision_file, latest_revision = self.__get_revision_hashes(self.model_name, tokenizer_path) if ff_revision != latest_revision: - if not os.path.exists(self.model_name) or os.path.isdir(self.model_name): - # Local model - print( - f"'{self.model_name}' tokenizer not found in cache or outdated. Downloading from huggingface.co ..." 
- ) - else: - # Remote model - print( - f"'{self.model_name}' local tokenizer was updated! Saving new tokenizer now..." - ) + print(f"'{self.model_name}' tokenizer needs updating! Downloading tokenizer now...") # Download tokenizer from HuggingFace, or load it from the local folder if self.model_type == ModelType.LLAMA: hf_tokenizer = LlamaTokenizer.from_pretrained( @@ -288,19 +331,13 @@ def download_hf_tokenizer_if_needed(self): ) else: hf_tokenizer = AutoTokenizer.from_pretrained(self.model_name) - # Print log message to notify user download of tokenizer has finished - if not os.path.exists(self.model_name) or os.path.isdir(self.model_name): - print("Done downloading tokenizer. Saving it now...") # Save tokenizer - hf_tokenizer.save_pretrained(self.tokenizer_path) - print("Done saving HF tokenizer.") + hf_tokenizer.save_pretrained(tokenizer_path) + print("Done updating HF tokenizer.") # Save new revision hash to file with open(ff_revision_file, "w+") as f: f.write(latest_revision) - else: - print(f"Loading '{self.model_name}' tokenizer from the cache...") - def compile( self, generation_config: GenerationConfig = GenerationConfig(), @@ -378,6 +415,12 @@ def compile( max_tokens_per_batch, ) + # Add PEFT layer if registered + for _, peft_dict in self.pefts.items(): + ff_peft_config = peft_dict["ff_peft_config"] + ff_peft_model_id = self.model.add_lora_layer(ff_peft_config) + peft_dict["ff_peft_model_id"] = ff_peft_model_id + # Download the weights from huggingface (if needed) self.download_hf_weights_if_needed() @@ -526,258 +569,3 @@ def compile( model_specific_pipeline_parallelism_degree, ssms, ) - - -class PEFT(LLM): - """This class creates a PEFT (parameter-efficient transformer) object to be used in concert with a LLM or SSM""" - - def __init__( - self, - base_model: LLM, - peft_model_id: str, - config: PeftConfig = None, - data_type: DataType = DataType.DT_HALF, - cache_path: str = "", - refresh_cache: bool = False, - ): - self.peft_model_id = peft_model_id - self.model_name = peft_model_id - self.hf_config = config if config is not None else PeftConfig.from_pretrained(peft_model_id) - self.peft_type = self.hf_config.peft_type - if self.peft_type != "LORA": - raise RuntimeError( - f"PEFT type {self.peft_type} not yet supported in FlexFlow" - ) - self.data_type = data_type - assert self.data_type == DataType.DT_HALF or self.data_type == DataType.DT_FLOAT - self.cache_path = cache_path if len(cache_path) > 0 else "~/.cache/flexflow" - self.refresh_cache = refresh_cache - # Base model related - if "base_model_name_or_path" not in self.hf_config.to_dict(): - raise ValueError( - f"PEFT model {peft_model_id} does not have an associated based model" - ) - self.base_model = base_model - if refresh_cache: - self.base_model.refresh_cache = True - - def download_hf_config(self): - """Save the HuggingFace model configs to a json file. 
Useful mainly to run the C++ inference code.""" - self.config_dir = os.path.join( - os.path.expanduser(self.cache_path), "configs", self.peft_model_id.lower() - ) - self.config_path = os.path.join(self.config_dir, "config.json") - os.makedirs(self.config_dir, exist_ok=True) - print(f"Creating directory {self.config_dir} (if it doesn't exist)...") - print(f"Saving {self.peft_model_id} configs to file {self.config_path}...") - with open(self.config_path, "w") as json_file: - - class SetEncoder(json.JSONEncoder): - def default(self, obj): - if isinstance(obj, set): - return list(obj) - return super().default(obj) - - json.dump(self.hf_config.to_dict(), json_file, indent=2, cls=SetEncoder) - - self.base_model.download_hf_config() - - def __get_revision_hashes(self, peft_model_id: str, weights: bool): - model_name = self.peft_model_id - return self._LLM__get_revision_hashes(model_name, weights) - - def convert_peft_model(self, hf_peft_model, weights_path): - for name, params in hf_peft_model.named_parameters(): - if self.peft_type.lower() in name: - name = name.replace("base_model.model.model.", "").replace( - ".default", "" - ) - name = self.base_model.model_class.convert_hf_weight_name(name) - params.detach().cpu().numpy().tofile(f"{weights_path}/{name}") - - def download_hf_weights_if_needed(self): - """Check in the folder specified by the cache_path whether the PEFT's model weights are available and up to date. - If not, or if the refresh_cache parameter is set to True, download new weights. - """ - self.base_model.download_hf_weights_if_needed() - - # Use local cache, or download new version - self.weights_path = os.path.join( - os.path.expanduser(self.cache_path), - "weights", - self.peft_model_id.lower(), - ( - "full-precision" - if self.data_type == DataType.DT_FLOAT - else "half-precision" - ), - ) - if self.refresh_cache: - print( - f"Refreshing weights in cache for model {self.peft_model_id} at path {self.weights_path} ..." - ) - if os.path.exists(self.weights_path): - shutil.rmtree(self.weights_path) - os.makedirs(self.weights_path, exist_ok=True) - #print(f"Creating directory {self.weights_path} (if it doesn't exist)...") - - ff_revision, ff_revision_file, latest_revision = self.__get_revision_hashes( - self.peft_model_id, - True - ) - - # Download if needed - if ff_revision != latest_revision: - if not os.path.exists(self.peft_model_id) or os.path.isdir( - self.peft_model_id - ): - print( - f"'{self.peft_model_id}' model weights not found in cache or outdated. Downloading from huggingface.co ..." - ) - else: - print( - f"'{self.peft_model_id}' local model weights were updated! Converting new weights now..." - ) - hf_base_model = AutoModelForCausalLM.from_pretrained( - self.base_model.model_name, - return_dict=True, - trust_remote_code=True, - torch_dtype=( - torch.float32 - if self.data_type == DataType.DT_FLOAT - else torch.float16 - ), - # device_map="auto", - ) - hf_peft_model = PeftModel.from_pretrained(hf_base_model, self.peft_model_id, config=self.hf_config) - # Print log message to notify user download of model has finished - if not os.path.exists(self.peft_model_id) or os.path.isdir( - self.peft_model_id - ): - print("Done downloading HF weights. 
Converting them now...") - # Convert the model to FlexFlow format - self.convert_peft_model(hf_peft_model, self.weights_path) - # Save new revision hash to file - with open(ff_revision_file, "w+") as f: - f.write(latest_revision) - print("Done converting the weights...") - # Deallocate hf model - del hf_peft_model - del hf_base_model - gc.collect() - torch.cuda.empty_cache() - else: - print(f"Loading '{self.peft_model_id}' model weights from the cache...") - - def download_hf_tokenizer_if_needed(self): - self.base_model.download_hf_tokenizer_if_needed() - - def compile( - self, - generation_config: GenerationConfig = GenerationConfig(), - max_requests_per_batch: int = 1, - max_seq_length: int = 256, - max_tokens_per_batch: int = 64, - model_specific_data_parallelism_degree: int = None, - model_specific_tensor_parallelism_degree: int = None, - model_specific_pipeline_parallelism_degree: int = None, - ssms: list = [], - ): - self.base_model.ssms = ssms - self.base_model.generation_config = GenerationConfig() - self.base_model.ffconfig = FFConfig() - if len(ssms) > 0: - assert type(self.base_model) == LLM - mode = InferenceMode.TREE_VERIFY_MODE - elif type(self.base_model) == SSM: - mode = InferenceMode.BEAM_SEARCH_MODE - else: - assert type(self.base_model) == LLM - mode = InferenceMode.INC_DECODING_MODE - - # Apply model-specific parallelism degrees, if needed - if model_specific_data_parallelism_degree: - self.base_model.ffconfig.data_parallelism_degree = ( - model_specific_data_parallelism_degree - ) - if model_specific_tensor_parallelism_degree: - self.base_model.ffconfig.tensor_parallelism_degree = ( - model_specific_tensor_parallelism_degree - ) - if model_specific_pipeline_parallelism_degree: - self.base_model.ffconfig.pipeline_parallelism_degree = ( - model_specific_pipeline_parallelism_degree - ) - - # Create request manager and set serving configuration - self.base_model.rm = RequestManager() - self.base_model.rm.set_max_requests_per_batch(max_requests_per_batch) - self.base_model.rm.set_max_tokens_per_batch(max_tokens_per_batch) - self.base_model.rm.set_max_sequence_length(max_seq_length) - - # Instantiate the relevant model - self.base_model.model = self.model_class( - mode, - generation_config, - self.base_model.ffconfig, - self.base_model.hf_config, - self.base_model.data_type, - max_tokens_per_batch, - ) - - # TODO: add peft layers - - # Download the weights from huggingface (if needed) - self.download_hf_weights_if_needed() - - # Create file data loader, load weights into tensors - model_configs = self.base_model.config_class(self.base_model.hf_config) - - self.fileloader = FileDataLoader( - self.weights_path, - model_configs.num_attention_heads, - model_configs.num_key_value_heads, - model_configs.hidden_size, - model_configs.hidden_size // model_configs.num_attention_heads, - self.ffconfig.tensor_parallelism_degree, - self.data_type == DataType.DT_FLOAT, - ) - - # Register weights file loader - self.im = InferenceManager() - self.im.register_model_weights_loader(self.model.ffmodel, self.fileloader) - - # Download the tokenizer from huggingface (if needed) and load them - self.download_hf_tokenizer_if_needed() - - # Create tokenizer (this must be done after we have downloaded the tokenizer - bos_token_id = ( - -1 if self.hf_config.bos_token_id is None else self.hf_config.bos_token_id - ) - eos_token_id = ( - -1 if self.hf_config.eos_token_id is None else self.hf_config.eos_token_id - ) - self.rm.register_tokenizer( - self.model_type, bos_token_id, eos_token_id, 
self.tokenizer_path - ) - self.rm.register_output_filepath(self.output_file) - - for ssm in self.ssms: - self.rm.register_ssm_model(ssm.model.ffmodel) - - # start background server - if (mode == InferenceMode.TREE_VERIFY_MODE) or ( - mode == InferenceMode.INC_DECODING_MODE - ): - import atexit - - atexit.register(self.rm.stop_server) - - def generate(self, prompts: Union[str, List[str]], max_length: int = 128): - super().generate(prompts, max_length) - - def start_server(self): - self.base_model.start_server() - - def stop_server(self): - self.base_model.stop_server() diff --git a/src/c/flexflow_c.cc b/src/c/flexflow_c.cc index 58acf3d010..60e33beb5e 100644 --- a/src/c/flexflow_c.cc +++ b/src/c/flexflow_c.cc @@ -67,6 +67,8 @@ class FFCObjectWrapper { FF_NEW_OPAQUE_WRAPPER(flexflow_request_manager_t, RequestManager *); FF_NEW_OPAQUE_WRAPPER(flexflow_file_data_loader_t, FileDataLoader *); FF_NEW_OPAQUE_WRAPPER(flexflow_generation_result_t, GenerationResult *); + FF_NEW_OPAQUE_WRAPPER(flexflow_lora_linear_config_t, LoraLinearConfig *); + FF_NEW_OPAQUE_WRAPPER(flexflow_peft_model_id_t, PEFTModelID *); }; Logger ffc_log("flexflow_c"); @@ -1542,6 +1544,21 @@ flexflow_tensor_t flexflow_model_add_argmax(flexflow_model_t handle_, return FFCObjectWrapper::wrap(tensor); } +flexflow_peft_model_id_t flexflow_model_add_lora_layer( + flexflow_model_t handle_, + const flexflow_lora_linear_config_t peft_config_) { + FFModel *handle = FFCObjectWrapper::unwrap(handle_); + LoraLinearConfig const *peft_config = FFCObjectWrapper::unwrap(peft_config_); + PEFTModelID *peft_model_id = handle->add_lora_layer(*peft_config); + + DEBUG_PRINT("[Add Lora Layer] model handle: %p, peft_config handle %p, " + "peft_model_id: %p", + handle, + peft_config, + peft_model_id); + return FFCObjectWrapper::wrap(peft_model_id); +} + void flexflow_model_set_sgd_optimizer(flexflow_model_t handle_, flexflow_sgd_optimizer_t optimizer_) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); @@ -2739,3 +2756,50 @@ void flexflow_file_data_loader_load_weights(flexflow_file_data_loader_t handle_, FFModel *model = FFCObjectWrapper::unwrap(model_handle_); handle->load_weights(model); } + +// ----------------------------------------------------------------------- +// LoraLinearConfig +// ----------------------------------------------------------------------- + +flexflow_lora_linear_config_t + flexflow_lora_linear_config_create(char const *cache_folder_, + char const *peft_model_id_) { + assert(cache_folder_ != nullptr && + "Cannot convert nullptr char * to std::string"); + assert(peft_model_id_ != nullptr && + "Cannot convert nullptr char * to std::string"); + std::string const cache_folder(cache_folder_); + std::string const peft_model_id(peft_model_id_); + LoraLinearConfig *handle = new LoraLinearConfig(cache_folder, peft_model_id); + DEBUG_PRINT("[LoraLinearConfig] new %p", handle); + return FFCObjectWrapper::wrap(handle); +} + +void flexflow_lora_linear_config_destroy( + flexflow_lora_linear_config_t handle_) { + LoraLinearConfig *peft_config = FFCObjectWrapper::unwrap(handle_); + DEBUG_PRINT("[LoraLinearConfig] delete %p", peft_config); + delete peft_config; +} + +// ----------------------------------------------------------------------- +// PEFTModelID +// ----------------------------------------------------------------------- + +flexflow_peft_model_id_t flexflow_peft_model_id_create() { + PEFTModelID *handle = new PEFTModelID(); + DEBUG_PRINT("[PEFTModelID] new %p", handle); + return FFCObjectWrapper::wrap(handle); +} + 
+flexflow_peft_model_id_t flexflow_peft_model_id_create_id(size_t id) { + PEFTModelID *handle = new PEFTModelID(id); + DEBUG_PRINT("[PEFTModelID] new %p", handle); + return FFCObjectWrapper::wrap(handle); +} + +void flexflow_peft_model_id_destroy(flexflow_peft_model_id_t handle_) { + PEFTModelID *peft_model_id = FFCObjectWrapper::unwrap(handle_); + DEBUG_PRINT("[PEFTModelID] delete %p", peft_model_id); + delete peft_model_id; +} diff --git a/src/ops/lora_linear.cc b/src/ops/lora_linear.cc index 39934f4cce..170e087226 100644 --- a/src/ops/lora_linear.cc +++ b/src/ops/lora_linear.cc @@ -51,15 +51,15 @@ bool check_lora_layer_match(Layer *potential_target, return false; } -PEFTModelID FFModel::add_lora_layer(LoraLinearConfig const peft_config) { +PEFTModelID *FFModel::add_lora_layer(LoraLinearConfig const peft_config) { assert(config.enable_peft && "Cannot add a LoRA layer if PEFT mode is not enabled"); if (peft_config.target_modules.size() == 0) { printf("PEFT config does not contain any target module\n"); - return PEFTModelID::NO_ID; + return nullptr; } - PEFTModelID peft_model_id(peft_model_global_guid++); - peft_configs[peft_model_id] = peft_config; + PEFTModelID *peft_model_id = new PEFTModelID(peft_model_global_guid++); + peft_configs[*peft_model_id] = peft_config; for (std::string target_module_name : peft_config.target_modules) { assert(target_module_name.length() > 0 && @@ -76,7 +76,7 @@ PEFTModelID FFModel::add_lora_layer(LoraLinearConfig const peft_config) { base_layer_to_peft_layer.end()) { // lora linear layer already added, no need to add again Layer *peft_layer = base_layer_to_peft_layer[target_module]; - peft_layer_to_peft_id[peft_layer].push_back(peft_model_id); + peft_layer_to_peft_id[peft_layer].push_back(*peft_model_id); } else { Tensor const input = target_module->inputs[0]; Tensor const output = target_module->outputs[0]; @@ -124,7 +124,7 @@ PEFTModelID FFModel::add_lora_layer(LoraLinearConfig const peft_config) { ++it; base_layer_to_peft_layer[target_module] = peft_layer; peft_layer_to_peft_id[peft_layer] = std::vector(); - peft_layer_to_peft_id[peft_layer].push_back(peft_model_id); + peft_layer_to_peft_id[peft_layer].push_back(*peft_model_id); } } } From 66573788643a0c669aff167ea8456c3f6099038c Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sun, 24 Mar 2024 22:50:49 +0000 Subject: [PATCH 25/32] update --- include/flexflow/ffconst.h | 5 ++ include/flexflow/request_manager.h | 3 +- inference/peft/peft.cc | 47 ++++++------ python/flexflow/core/flexflow_cffi.py | 103 ++++++++++++++++++++++++-- python/flexflow/serve/serve.py | 45 ++--------- python/flexflow/type.py | 3 + src/c/flexflow_c.cc | 80 ++++++++++++++------ src/runtime/request_manager.cc | 22 ++++-- 8 files changed, 207 insertions(+), 101 deletions(-) diff --git a/include/flexflow/ffconst.h b/include/flexflow/ffconst.h index 66e252db46..b16b9f9230 100644 --- a/include/flexflow/ffconst.h +++ b/include/flexflow/ffconst.h @@ -78,6 +78,11 @@ enum InferenceMode { TREE_VERIFY_MODE = 2003, }; +enum RequestType { + REQ_INFERENCE = 4001, + REQ_FINETUNING = 4002, +}; + // This is consistent with TASO's OpType // https://github.com/jiazhihao/TASO/blob/master/include/taso/ops.h#L75-L138 enum OperatorType { diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index 0e59888888..0ef5efcf27 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -65,7 +65,6 @@ struct Request { COMPLETED = 103, // finished and verified FINISHING = 104, // finishing request, 
but not yet verified }; - enum RequestType { REQ_INFERENCE = 201, REQ_FINETUNING = 202 }; BatchConfig::RequestGuid guid; PEFTModelID peft_model_id = PEFTModelID::NO_ID; int max_sequence_length = 128; @@ -81,7 +80,7 @@ struct Request { RequestType req_type = REQ_INFERENCE; int completed_training_steps = 0; int max_training_steps = 1; - std::vector> dataset_text; + std::string dataset_filepath; std::vector, std::vector>> dataset; diff --git a/inference/peft/peft.cc b/inference/peft/peft.cc index aa5581ca87..687cd92699 100644 --- a/inference/peft/peft.cc +++ b/inference/peft/peft.cc @@ -291,6 +291,9 @@ void FlexFlow::top_level_task(Task const *task, int total_num_requests = 0; { + std::vector requests; + + // Add inference requests using json = nlohmann::json; std::ifstream file_handle(file_paths.prompt_file_path); assert(file_handle.good() && "Prompt file does not exist."); @@ -298,28 +301,28 @@ void FlexFlow::top_level_task(Task const *task, /*parser_callback_t */ nullptr, /*allow_exceptions */ true, /*ignore_comments */ true); - - std::vector requests; - for (auto &prompt : prompt_json) { - std::string text = prompt.get(); - printf("Prompt[%d]: %s\n", total_num_requests, text.c_str()); - // Add inference request - // Request inference_req; - // inference_req.prompt = text; - // inference_req.max_sequence_length = 128; - // inference_req.peft_model_id = peft_model_id; - // requests.push_back(inference_req); - // total_num_requests++; - // Add fine-tuning request - Request fine_tuning_req; - fine_tuning_req.req_type = Request::RequestType::REQ_FINETUNING; - fine_tuning_req.max_sequence_length = 128; - fine_tuning_req.peft_model_id = - (peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID; - fine_tuning_req.dataset_text.push_back(std::make_pair(text, "")); - requests.push_back(fine_tuning_req); - total_num_requests++; - } + // for (auto &prompt : prompt_json) { + // std::string text = prompt.get(); + // printf("Prompt[%d]: %s\n", total_num_requests, text.c_str()); + // Request inference_req; + // inference_req.prompt = text; + // inference_req.max_sequence_length = 128; + // inference_req.peft_model_id = peft_model_id; + // requests.push_back(inference_req); + // total_num_requests++; + // } + + // Add fine-tuning request + Request fine_tuning_req; + fine_tuning_req.req_type = RequestType::REQ_FINETUNING; + fine_tuning_req.max_sequence_length = 128; + fine_tuning_req.peft_model_id = + (peft_model_id != nullptr) ? 
*peft_model_id : PEFTModelID::NO_ID; + fine_tuning_req.dataset_filepath = file_paths.prompt_file_path; + fine_tuning_req.max_training_steps = 1; + requests.push_back(fine_tuning_req); + total_num_requests++; + std::vector result = model.generate(requests); } diff --git a/python/flexflow/core/flexflow_cffi.py b/python/flexflow/core/flexflow_cffi.py index ef0ee0e378..ccb50dd566 100644 --- a/python/flexflow/core/flexflow_cffi.py +++ b/python/flexflow/core/flexflow_cffi.py @@ -28,6 +28,7 @@ CompMode, MetricsType, InferenceMode, + RequestType, ModelType, OpType, ParameterSyncType, @@ -36,7 +37,7 @@ ) from flexflow.config import * from .flexflowlib import ffi, flexflow_library - +from typing import Union, List def ffc(): if not flexflow_already_initialized(): @@ -3823,27 +3824,57 @@ def get_output_tensor(self, ffmodel, data_type): assert ret_val == True return np_array - def generate(self, prompt_list, max_sequence_length): + def generate_inf_only(self, prompt_list: List[str], max_sequence_length: int = 128): assert isinstance(prompt_list, list) c_input_texts = [get_c_name(prompt) for prompt in prompt_list] max_num_chars = 5 * (max_sequence_length + 100) c_output_texts = [ffi.new("char[]", max_num_chars) for prompt in prompt_list] c_output_length_and_tokens = [ffi.new("int[]", max_sequence_length + 100) for prompt in prompt_list] + c_request_types = [enum_to_int(RequestType, RequestType.REQ_INFERENCE) for prompt in prompt_list] + max_sequence_lengths = [max_sequence_length for prompt in prompt_list] + peft_model_ids = [None for prompt in prompt_list] + dataset_filepaths = [None for prompt in prompt_list] + training_steps = [0 for prompt in prompt_list] ffc().flexflow_model_generate( self.handle, len(prompt_list), + c_request_types, c_input_texts, max_num_chars, c_output_texts, - max_sequence_length, + max_sequence_lengths, + peft_model_ids, + dataset_filepaths, + training_steps, c_output_length_and_tokens, ) - #output_length = c_output_length_and_tokens[0] - #output_tokens = [] - #for i in range(output_length): - # output_tokens.append(c_output_length_and_tokens[i + 1]) from flexflow.serve import GenerationResult - + return [GenerationResult(ffi.string(c_output_text), []) for c_output_text in c_output_texts] + + def generate(self, requests_list: List[Request]): + assert isinstance(requests_list, list) + c_input_texts = [get_c_name(request.prompt) for request in requests_list] + max_num_chars = 5 * (max_sequence_length + 100) + c_output_texts = [ffi.new("char[]", max_num_chars) for prompt in prompt_list] + c_output_length_and_tokens = [ffi.new("int[]", max_sequence_length + 100) for prompt in prompt_list] + c_request_types = [enum_to_int(RequestType, RequestType.REQ_INFERENCE) for prompt in prompt_list] + max_sequence_lengths = [max_sequence_length for prompt in prompt_list] + peft_model_ids = [None for prompt in prompt_list] + dataset_filepaths = [None for prompt in prompt_list] + training_steps = [0 for prompt in prompt_list] + ffc().flexflow_model_generate( + self.handle, + len(prompt_list), + c_request_types, + c_input_texts, + max_num_chars, + c_output_texts, + max_sequence_lengths, + peft_model_ids, + dataset_filepaths, + training_steps, + c_output_length_and_tokens, + ) return [GenerationResult(ffi.string(c_output_text), []) for c_output_text in c_output_texts] def set_position_offset(self, offset): @@ -4288,6 +4319,47 @@ def load_weights(self, model): self.handle, model.handle ) +# ----------------------------------------------------------------------- +# GenerationConfig +# 
----------------------------------------------------------------------- + +class GenerationConfig(object): + """A class to store the sampling configs.""" + + def __init__( + self, + do_sample: bool = False, + temperature: float = 0.9, + topp: float = 0.8, + topk: int = 1, + ): + """Initialize the sampling configs + + :param do_sample: Whether to perform sampling, or use greedy decoding, defaults to False + :type do_sample: bool, optional + :param temperature: The temperature setting, defaults to 0.9 + :type temperature: float, optional + :param topp: The top probabilities (top-p) setting, defaults to 0.8 + :type topp: float, optional + :param topk: The top-k setting, defaults to 1 + :type topk: int, optional + """ + self.do_sample = do_sample + self.temperature = temperature + self.topp = topp + self.topk = topk + +# ----------------------------------------------------------------------- +# GenerationResult +# ----------------------------------------------------------------------- + +class GenerationResult(object): + """A class to store the output of a generation request.""" + + def __init__(self, text: str = None, tokens: list = None): + self.output_text = text + self.output_tokens = tokens + # ----------------------------------------------------------------------- # LoraLinearConfig # ----------------------------------------------------------------------- @@ -4321,3 +4393,18 @@ def __init__(self, id=None): else: self.handle = ffc().flexflow_peft_model_id_create_id(id) self._handle = ffi.gc(self.handle, ffc().flexflow_peft_model_id_destroy) + +# ----------------------------------------------------------------------- +# Request +# ----------------------------------------------------------------------- + +class Request: + """A class to record the metadata of an inference or finetuning request.""" + + def __init__(self, req_type: RequestType, prompt: str = None, max_sequence_length: int = None, peft_model_id: PEFTModelID = None, dataset_filepath: str = None, max_training_steps: int = None): + self.req_type = req_type + self.prompt = prompt + self.max_sequence_length = max_sequence_length + self.peft_model_id = peft_model_id + self.dataset_filepath = dataset_filepath + self.max_training_steps = max_training_steps \ No newline at end of file diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py index bc7a796315..b38f0b574f 100644 --- a/python/flexflow/serve/serve.py +++ b/python/flexflow/serve/serve.py @@ -34,41 +34,6 @@ from typing import Union, List -class GenerationConfig: - """A class to store the sampling configs.""" - - def __init__( - self, - do_sample: bool = False, - temperature: float = 0.9, - topp: float = 0.8, - topk: int = 1, - ): - """Initialize the sampling configs - - :param do_sample: Whether to perform sampling, or use greedy decoding, defaults to False - :type do_sample: bool, optional - :param temperature: The temperature setting, defaults to 0.9 - :type temperature: float, optional - :param topp: The top probabilities (top-p) setting, defaults to 0.8 - :type topp: float, optional - :param topk: The top-k setting, defaults to 1 - :type topk: int, optional - """ - self.do_sample = do_sample - self.temperature = temperature - self.topp = topp - self.topk = topk - - -class GenerationResult: - """A class to store the output of a generation request.""" - - def __init__(self, text: str = None, tokens: list = None): - self.output_text = text - self.output_tokens = tokens - - class _SupportedModels: def __init__( self, @@ -467,22 +432,22 @@ def compile( 
         atexit.register(self.rm.stop_server)
 
-    def generate(self, prompts: Union[str, List[str]], max_length: int = 128):
+    def generate(self, prompts: Union[str, List[str], Request, List[Request]], max_length: int = 128):
         """Generate tokens based on the input prompt(s)
 
-        :param prompts: The generation prompt(s) in the form of a string, or list of strings
-        :type prompts: Union[str, List[str]]
+        :param prompts: The generation prompt(s) in the form of a string, a list of strings, a Request, or list of Requests
+        :type prompts: Union[str, List[str], Request, List[Request]]
         :return: the generation results
         :rtype: GenerationResult
         """
         if type(prompts) == str:
             if len(prompts) == 0:
                 return None
-            return self.model.ffmodel.generate([prompts], max_length)
+            return self.model.ffmodel.generate_inf_only([prompts], max_length)
         elif type(prompts) == list:
             if len(prompts) == 0:
                 return []
-            return self.model.ffmodel.generate(prompts, max_length)
+            return self.model.ffmodel.generate_inf_only(prompts, max_length)
         else:
             assert False, "Please pass a non-empty string or list of strings"
diff --git a/python/flexflow/type.py b/python/flexflow/type.py
index 994a85f57e..ac6975b4fd 100644
--- a/python/flexflow/type.py
+++ b/python/flexflow/type.py
@@ -152,6 +152,9 @@ class OpType(Enum):
     RESIDUAL_RMS_NORM = 2305
     RESIDUAL_LAYERNORM = 2306
 
+class RequestType(Enum):
+    REQ_INFERENCE = 4001
+    REQ_FINETUNING = 4002
 
 def enum_to_int(enum, enum_item):
     for item in enum:
diff --git a/src/c/flexflow_c.cc b/src/c/flexflow_c.cc
index 60e33beb5e..d592cdd3ee 100644
--- a/src/c/flexflow_c.cc
+++ b/src/c/flexflow_c.cc
@@ -1614,43 +1614,75 @@ void flexflow_model_set_transformer_layer_id(flexflow_model_t handle_, int id) {
 
 void flexflow_model_generate(flexflow_model_t handle_,
                              int num_requests,
+                             enum RequestType *request_types,
                              char const **input_texts,
                              int max_num_chars,
                              char **output_texts,
-                             int max_seq_length,
+                             int *max_seq_lengths,
+                             flexflow_peft_model_id_t *peft_model_ids,
+                             char const **dataset_filepaths,
+                             int *training_steps,
                              int **output_length_and_tokens) {
   FFModel *handle = FFCObjectWrapper::unwrap(handle_);
   std::vector<Request> requests;
+
+  int finetuning_req_idx = 0;
   for (int i = 0; i < num_requests; i++) {
-    std::string const text_str(input_texts[i]);
-    Request inference_req;
-    inference_req.prompt = text_str;
-    inference_req.max_sequence_length = max_seq_length;
-    requests.push_back(inference_req);
-    DEBUG_PRINT("[Model] generate[%d] %p %s %i",
-                i,
-                handle,
-                text_str.c_str(),
-                max_seq_length);
+    if (request_types[i] == RequestType::REQ_INFERENCE) {
+      std::string const text_str(input_texts[i]);
+      Request inference_req;
+      inference_req.prompt = text_str;
+      inference_req.max_sequence_length = max_seq_lengths[i];
+      if (peft_model_ids[i] != nullptr) {
+        PEFTModelID *peft_model_id = FFCObjectWrapper::unwrap(peft_model_ids[i]);
+        inference_req.peft_model_id = *peft_model_id;
+      }
+      requests.push_back(inference_req);
+      DEBUG_PRINT("[Model] generate[%d] %p %s %i",
+                  i,
+                  handle,
+                  text_str.c_str(),
+                  max_seq_lengths[i]);
+    } else {
+      Request fine_tuning_req;
+      fine_tuning_req.req_type = RequestType::REQ_FINETUNING;
+      fine_tuning_req.max_sequence_length = max_seq_lengths[i];
+      if (peft_model_ids[i] != nullptr) {
+        PEFTModelID *peft_model_id = FFCObjectWrapper::unwrap(peft_model_ids[i]);
+        fine_tuning_req.peft_model_id = *peft_model_id;
+      }
+      std::string const dataset_fp(dataset_filepaths[finetuning_req_idx]);
+      fine_tuning_req.dataset_filepath = dataset_fp;
+      fine_tuning_req.max_training_steps = training_steps[finetuning_req_idx];
+      requests.push_back(finetuning_req_idx);
+      DEBUG_PRINT("[Model] generate[%d] %p %s %i %i",
+                  i,
+                  handle,
+                  dataset_fp.c_str(),
+                  max_seq_lengths[i],
+                  training_steps[finetuning_req_idx]);
+      finetuning_req_idx++;
+    }
   }
   std::vector<GenerationResult> results = handle->generate(requests);
-  // If the prompt exceeds max seq len, check that we return the prompt with no
-  // additional token. Otherwise, check that the output does not exceed the max
-  // sequence length.
   for (int i = 0; i < num_requests; i++) {
-    assert(results[i].output_tokens.size() <= max_seq_length ||
-           results[i].output_tokens.size() == results[i].input_tokens.size());
-    output_length_and_tokens[i][0] = results[i].output_tokens.size();
-    std::copy(results[i].output_tokens.begin(),
-              results[i].output_tokens.end(),
-              output_length_and_tokens[i] + 1);
-    std::memcpy(output_texts[i],
-                results[i].output_text.c_str(),
-                results[i].output_text.length());
+    if (request_types[i] == RequestType::REQ_INFERENCE) {
+      // If the prompt exceeds max seq len, check that we return the prompt with no
+      // additional token. Otherwise, check that the output does not exceed the max
+      // sequence length.
+      assert(results[i].output_tokens.size() <= max_seq_length ||
+             results[i].output_tokens.size() == results[i].input_tokens.size());
+      output_length_and_tokens[i][0] = results[i].output_tokens.size();
+      std::copy(results[i].output_tokens.begin(),
+                results[i].output_tokens.end(),
+                output_length_and_tokens[i] + 1);
+      std::memcpy(output_texts[i],
+                  results[i].output_text.c_str(),
+                  results[i].output_text.length());
+    }
   }
-  // return FFCObjectWrapper::wrap(&results[0]);
 }
 
 void flexflow_model_set_position_offset(flexflow_model_t handle_,
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index c335cd246b..8fb040fb6d 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -244,15 +244,27 @@ RequestManager::RequestGuid
   request.peft_model_id = request_.peft_model_id;
   request.req_type = Request::REQ_FINETUNING;
   request.completed_training_steps = 0;
-  request.max_training_steps = 1; // TODO: let user set this
-  for (auto const &sample : request_.dataset_text) {
+  request.max_training_steps = request_.max_training_steps;
+  request.dataset_filepath = request_.dataset_filepath;
+
+  // Load dataset
+  using json = nlohmann::json;
+  std::ifstream file_handle(request.dataset_filepath);
+  assert(file_handle.good() && "Dataset file does not exist.");
+  json dataset_json = json::parse(file_handle,
+                                  /*parser_callback_t */ nullptr,
+                                  /*allow_exceptions */ true,
+                                  /*ignore_comments */ true);
+
+  for (auto &prompt : dataset_json) {
+    std::string text = prompt.get<std::string>();
+    std::string output_text("");
     std::vector input_tokens;
-    input_tokens = this->tokenizer_->Encode(sample.first);
+    input_tokens = this->tokenizer_->Encode(text);
     if (bos_token_id >= 0 && model_type != ModelType::FALCON) {
       input_tokens.insert(input_tokens.begin(), bos_token_id);
     }
-    std::vector output_tokens =
-        this->tokenizer_->Encode(sample.second);
+    std::vector output_tokens = this->tokenizer_->Encode(output_text);
     if (input_tokens.size() + output_tokens.size() >
         get_max_sequence_length()) {
       std::cout << "Warning: too many tokens in sample, only load up to "
From 7b8a9ee31cd0e51150a13251269689307feb925c Mon Sep 17 00:00:00 2001
From: Gabriele Oliaro
Date: Mon, 25 Mar 2024 00:51:27 +0000
Subject: [PATCH 26/32] update

---
 python/flexflow/core/flexflow_cffi.py | 23 ++++++++++-------------
 src/c/flexflow_c.cc                   |  1 -
 2 files changed, 10 insertions(+), 14 deletions(-)

diff 
--git a/python/flexflow/core/flexflow_cffi.py b/python/flexflow/core/flexflow_cffi.py index ccb50dd566..981f2be9ef 100644 --- a/python/flexflow/core/flexflow_cffi.py +++ b/python/flexflow/core/flexflow_cffi.py @@ -3840,7 +3840,6 @@ def generate_inf_only(self, prompt_list: List[str], max_sequence_length: int = 1 len(prompt_list), c_request_types, c_input_texts, - max_num_chars, c_output_texts, max_sequence_lengths, peft_model_ids, @@ -3853,21 +3852,19 @@ def generate_inf_only(self, prompt_list: List[str], max_sequence_length: int = 1 def generate(self, requests_list: List[Request]): assert isinstance(requests_list, list) - c_input_texts = [get_c_name(request.prompt) for request in requests_list] - max_num_chars = 5 * (max_sequence_length + 100) - c_output_texts = [ffi.new("char[]", max_num_chars) for prompt in prompt_list] - c_output_length_and_tokens = [ffi.new("int[]", max_sequence_length + 100) for prompt in prompt_list] - c_request_types = [enum_to_int(RequestType, RequestType.REQ_INFERENCE) for prompt in prompt_list] - max_sequence_lengths = [max_sequence_length for prompt in prompt_list] - peft_model_ids = [None for prompt in prompt_list] - dataset_filepaths = [None for prompt in prompt_list] - training_steps = [0 for prompt in prompt_list] + c_input_texts = [get_c_name(request.prompt) for request in requests_list] # entry will be None for finetuning requests + c_output_texts = [ffi.new("char[]", 5 * (request.max_sequence_length + 100)) if request.req_type == RequestType.REQ_INFERENCE else ffi.NULL for request in requests_list] + c_output_length_and_tokens = [ffi.new("int[]", request.max_sequence_length + 100) for request in requests_list] + c_request_types = [enum_to_int(RequestType, request.req_type) for request in requests_list] + max_sequence_lengths = [request.max_sequence_length for request in requests_list] + peft_model_ids = [request.peft_model_id for request in requests_list] + dataset_filepaths = [request.dataset_filepath for request in requests_list] + training_steps = [request.max_training_steps for request in requests_list] ffc().flexflow_model_generate( self.handle, - len(prompt_list), + len(requests_list), c_request_types, c_input_texts, - max_num_chars, c_output_texts, max_sequence_lengths, peft_model_ids, @@ -3875,7 +3872,7 @@ def generate(self, requests_list: List[Request]): training_steps, c_output_length_and_tokens, ) - return [GenerationResult(ffi.string(c_output_text), []) for c_output_text in c_output_texts] + return [GenerationResult(ffi.string(c_output_text), []) if c_output_text != ffi.NULL else None for c_output_text in c_output_texts] def set_position_offset(self, offset): ffc().flexflow_model_set_position_offset(self.handle, offset) diff --git a/src/c/flexflow_c.cc b/src/c/flexflow_c.cc index d592cdd3ee..44fdd5af4e 100644 --- a/src/c/flexflow_c.cc +++ b/src/c/flexflow_c.cc @@ -1616,7 +1616,6 @@ void flexflow_model_generate(flexflow_model_t handle_, int num_requests, enum RequestType *request_types, char const **input_texts, - int max_num_chars, char **output_texts, int *max_seq_lengths, flexflow_peft_model_id_t *peft_model_ids, From 22d4d8ef112ef5291fb78fb2e523027c541df84b Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 25 Mar 2024 03:43:40 +0000 Subject: [PATCH 27/32] update --- python/flexflow/serve/serve.py | 99 ++++++++++++++++++++++++---------- 1 file changed, 70 insertions(+), 29 deletions(-) diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py index b38f0b574f..a02facc356 100644 --- a/python/flexflow/serve/serve.py 
+++ b/python/flexflow/serve/serve.py @@ -120,7 +120,9 @@ def add_peft(self, peft_model_id: str): f"PEFT model {peft_model_id} does not have an associated base model" ) if peft_config.base_model_name_or_path != self.model_name: - raise RuntimeError(f"Attempting to add PEFT with base model name {peft_config.base_model_name_or_path} to LLM {self.model_name}") + raise RuntimeError( + f"Attempting to add PEFT with base model name {peft_config.base_model_name_or_path} to LLM {self.model_name}" + ) ff_peft_config = LoraLinearConfig(self.cache_path, peft_model_id) peft_dict = { "peft_config": peft_config, @@ -139,24 +141,30 @@ def download_hf_config(self): print(f"Creating directory {config_dir} (if it doesn't exist)...") print(f"Saving {self.model_name} configs to file {config_path}...") self.hf_config.to_json_file(config_path) - + # Save PEFT configs if the LLM has any registered PEFTs for peft_model_id, peft_dict in self.pefts.items(): peft_config = peft_dict["hf_config"] - peft_config_path = os.path.join(os.path.expanduser(self.cache_path), "configs", self.peft_model_id.lower()) + peft_config_path = os.path.join( + os.path.expanduser(self.cache_path), + "configs", + self.peft_model_id.lower(), + ) print(f"Saving {peft_model_id} configs to file {peft_config_path}...") with open(peft_config_path, "w") as json_file: + class SetEncoder(json.JSONEncoder): def default(self, obj): if isinstance(obj, set): return list(obj) return super().default(obj) + json.dump(peft_config.to_dict(), json_file, indent=2, cls=SetEncoder) def __get_revision_hashes(self, model_name: str, folder: str): ff_revision = None ff_revision_file = os.path.join(folder, "rev_sha.txt") - + if os.path.exists(ff_revision_file): ff_revision = "".join(open(ff_revision_file).read().split()) @@ -179,8 +187,12 @@ def download_hf_weights_if_needed(self): If any PEFT adapter is registered, perform the same operation for PEFT. """ + def get_weights_path(model_name): - return os.path.join(os.path.expanduser(self.cache_path), "weights", model_name.lower(), + return os.path.join( + os.path.expanduser(self.cache_path), + "weights", + model_name.lower(), ( "full-precision" if self.data_type == DataType.DT_FLOAT @@ -197,7 +209,7 @@ def refresh_cache_if_needed(model_name): if os.path.exists(weights_path): shutil.rmtree(weights_path) os.makedirs(weights_path, exist_ok=True) - + def get_hf_llm(model_name): return AutoModelForCausalLM.from_pretrained( model_name, @@ -208,13 +220,17 @@ def get_hf_llm(model_name): else torch.float16 ), ) - + def download_llm_weights(): weights_path = get_weights_path(self.model_name) refresh_cache_if_needed(self.model_name) - ff_revision, ff_revision_file, latest_revision = self.__get_revision_hashes(self.model_name, weights_path) + ff_revision, ff_revision_file, latest_revision = self.__get_revision_hashes( + self.model_name, weights_path + ) if ff_revision != latest_revision: - print(f"'{self.model_name}' local model weights need updating! Downloading/converting new weights now...") + print( + f"'{self.model_name}' local model weights need updating! Downloading/converting new weights now..." 
+ ) hf_model = get_hf_llm(self.model_name) # Convert the model to FlexFlow format self.model_class.convert_hf_model(hf_model, weights_path) @@ -226,7 +242,7 @@ def download_llm_weights(): del hf_model gc.collect() torch.cuda.empty_cache() - + def convert_peft_model(hf_peft_model, peft_type, weights_path): for name, params in hf_peft_model.named_parameters(): if peft_type.lower() in name: @@ -235,20 +251,26 @@ def convert_peft_model(hf_peft_model, peft_type, weights_path): ) name = self.model_class.convert_hf_weight_name(name) params.detach().cpu().numpy().tofile(f"{weights_path}/{name}") - + def download_peft_weights(): for peft_model_id, peft_dict in self.pefts.items(): peft_config = peft_dict["peft_config"] peft_type = peft_config["peft_type"] - + weights_path = get_weights_path(peft_model_id) refresh_cache_if_needed(peft_model_id) - ff_revision, ff_revision_file, latest_revision = self.__get_revision_hashes(peft_model_id, weights_path) - + ff_revision, ff_revision_file, latest_revision = ( + self.__get_revision_hashes(peft_model_id, weights_path) + ) + if ff_revision != latest_revision: - print(f"'{peft_model_id}' local model weights need updating! Downloading/converting new weights now...") + print( + f"'{peft_model_id}' local model weights need updating! Downloading/converting new weights now..." + ) hf_model = get_hf_llm(peft_model_id) - hf_peft_model = PeftModel.from_pretrained(hf_model, peft_model_id, config=peft_config) + hf_peft_model = PeftModel.from_pretrained( + hf_model, peft_model_id, config=peft_config + ) # Convert the model to FlexFlow format convert_peft_model(hf_peft_model, peft_type, weights_path) # Save new revision hash to file @@ -260,7 +282,7 @@ def download_peft_weights(): del hf_model gc.collect() torch.cuda.empty_cache() - + download_llm_weights() download_peft_weights() @@ -277,7 +299,9 @@ def download_hf_tokenizer_if_needed(self): self.model_name.lower(), ) if self.refresh_cache: - print(f"Refreshing cached tokenizer for model {self.model_name} at path {tokenizer_path} ...") + print( + f"Refreshing cached tokenizer for model {self.model_name} at path {tokenizer_path} ..." + ) if os.path.exists(tokenizer_path): shutil.rmtree(tokenizer_path) if not os.path.exists(tokenizer_path): @@ -285,10 +309,14 @@ def download_hf_tokenizer_if_needed(self): os.makedirs(tokenizer_path, exist_ok=True) # Get local revision SHA, check if it matches latest one on huggingface - ff_revision, ff_revision_file, latest_revision = self.__get_revision_hashes(self.model_name, tokenizer_path) + ff_revision, ff_revision_file, latest_revision = self.__get_revision_hashes( + self.model_name, tokenizer_path + ) if ff_revision != latest_revision: - print(f"'{self.model_name}' tokenizer needs updating! Downloading tokenizer now...") + print( + f"'{self.model_name}' tokenizer needs updating! Downloading tokenizer now..." 
+ ) # Download tokenizer from HuggingFace, or load it from the local folder if self.model_type == ModelType.LLAMA: hf_tokenizer = LlamaTokenizer.from_pretrained( @@ -432,22 +460,35 @@ def compile( atexit.register(self.rm.stop_server) - def generate(self, prompts: Union[str, List[str], Request, List[Request]], max_length: int = 128): + def generate( + self, + requests_or_prompts: Union[str, List[str], Request, List[Request]], + max_length: int = 128, + ): """Generate tokens based on the input prompt(s) - :param prompts: The generation prompt(s) in the form of a string, a list of strings, a Request, or list of Requests - :type prompts: Union[str, List[str], Request, List[Request]] + :param requests_or_prompts: The generation prompt(s) in the form of a string, a list of strings, a Request, or list of Requests + :type requests_or_prompts: Union[str, List[str], Request, List[Request]] :return: the generation results :rtype: GenerationResult """ - if type(prompts) == str: - if len(prompts) == 0: + if type(requests_or_prompts) == str: + if len(requests_or_prompts) == 0: return None - return self.model.ffmodel.generate_inf_only([prompts], max_length) - elif type(prompts) == list: - if len(prompts) == 0: + return self.model.ffmodel.generate_inf_only( + [requests_or_prompts], max_length + ) + elif type(requests_or_prompts) == Request: + return self.model.ffmodel.generate(requests_or_prompts) + elif type(requests_or_prompts) == list: + if len(requests_or_prompts) == 0: return [] - return self.model.ffmodel.generate_inf_only(prompts, max_length) + if type(requests_or_prompts[0]) == str: + return self.model.ffmodel.generate_inf_only( + requests_or_prompts, max_length + ) + else: + return self.model.ffmodel.generate(requests_or_prompts) else: assert False, "Please pass a non-empty string or list of strings" From dd971e778c766651d1c3bd76a44a4845811145a3 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 25 Mar 2024 19:19:30 +0000 Subject: [PATCH 28/32] updates --- include/flexflow/ffconst.h | 2 +- include/flexflow/flexflow_c.h | 11 +- inference/peft/peft.cc | 6 +- inference/python/ff_peft.py | 146 + python/flexflow/core/flexflow_cffi.py | 4862 +++++++++++++------------ python/flexflow/serve/__init__.py | 11 +- python/flexflow/serve/serve.py | 6 +- src/c/flexflow_c.cc | 24 +- src/runtime/request_manager.cc | 20 +- tests/peft_test.sh | 4 + 10 files changed, 2660 insertions(+), 2432 deletions(-) create mode 100644 inference/python/ff_peft.py diff --git a/include/flexflow/ffconst.h b/include/flexflow/ffconst.h index b16b9f9230..016dd7bdd1 100644 --- a/include/flexflow/ffconst.h +++ b/include/flexflow/ffconst.h @@ -78,7 +78,7 @@ enum InferenceMode { TREE_VERIFY_MODE = 2003, }; -enum RequestType { +enum RequestType { REQ_INFERENCE = 4001, REQ_FINETUNING = 4002, }; diff --git a/include/flexflow/flexflow_c.h b/include/flexflow/flexflow_c.h index 1ceea59839..8150e05dd1 100644 --- a/include/flexflow/flexflow_c.h +++ b/include/flexflow/flexflow_c.h @@ -618,10 +618,13 @@ void flexflow_model_set_transformer_layer_id(flexflow_model_t handle, int id); void flexflow_model_generate(flexflow_model_t handle_, int num_requests, - char const **input_text, - int max_num_chars, - char **output_text, - int max_seq_length, + enum RequestType *request_types, + char const **input_texts, + char **output_texts, + int *max_seq_lengths, + flexflow_peft_model_id_t *peft_model_ids, + char const **dataset_filepaths, + int *training_steps, int **output_length_and_tokens); void flexflow_model_set_position_offset(flexflow_model_t 
handle, int offset); diff --git a/inference/peft/peft.cc b/inference/peft/peft.cc index 687cd92699..eade2eaeeb 100644 --- a/inference/peft/peft.cc +++ b/inference/peft/peft.cc @@ -292,7 +292,7 @@ void FlexFlow::top_level_task(Task const *task, int total_num_requests = 0; { std::vector requests; - + // Add inference requests using json = nlohmann::json; std::ifstream file_handle(file_paths.prompt_file_path); @@ -311,7 +311,7 @@ void FlexFlow::top_level_task(Task const *task, // requests.push_back(inference_req); // total_num_requests++; // } - + // Add fine-tuning request Request fine_tuning_req; fine_tuning_req.req_type = RequestType::REQ_FINETUNING; @@ -322,7 +322,7 @@ void FlexFlow::top_level_task(Task const *task, fine_tuning_req.max_training_steps = 1; requests.push_back(fine_tuning_req); total_num_requests++; - + std::vector result = model.generate(requests); } diff --git a/inference/python/ff_peft.py b/inference/python/ff_peft.py new file mode 100644 index 0000000000..18ef8bbf33 --- /dev/null +++ b/inference/python/ff_peft.py @@ -0,0 +1,146 @@ +# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import flexflow.serve as ff +import argparse, json, os +from types import SimpleNamespace + + +def get_configs(): + parser = argparse.ArgumentParser() + parser.add_argument( + "-config-file", + help="The path to a JSON file with the configs. 
If omitted, a sample model and configs will be used instead.", + type=str, + default="", + ) + args = parser.parse_args() + + # Load configs from JSON file (if specified) + if len(args.config_file) > 0: + if not os.path.isfile(args.config_file): + raise FileNotFoundError(f"Config file {args.config_file} not found.") + try: + with open(args.config_file) as f: + return json.load(f) + except json.JSONDecodeError as e: + print("JSON format error:") + print(e) + else: + # Define sample configs + ff_init_configs = { + # required parameters + "num_gpus": 2, + "memory_per_gpu": 14000, + "zero_copy_memory_per_node": 40000, + # optional parameters + "num_cpus": 4, + "legion_utility_processors": 4, + "data_parallelism_degree": 1, + "tensor_parallelism_degree": 1, + "pipeline_parallelism_degree": 2, + "offload": False, + "offload_reserve_space_size": 8 * 1024, # 8GB + "use_4bit_quantization": False, + "use_8bit_quantization": False, + "enable_peft": False, + "peft_activation_reserve_space_size": 1024, # 1GB + "peft_weight_reserve_space_size": 1024, # 1GB + "profiling": False, + "inference_debugging": False, + "fusion": True, + } + model_configs = { + # required parameters + "base_model": "JackFram/llama-160m", + "peft_model_ids": [ + "goliaro/llama-160m-lora-full", + ], + # optional parameters + "cache_path": "", + "refresh_cache": False, + "full_precision": False, + "prompt": "", + "finetuning_dataset": os.path.join( + os.path.dirname(os.path.abspath(__file__)), "../prompt/peft.json" + ), + "output_file": "", + } + # Merge dictionaries + ff_init_configs.update(model_configs) + return ff_init_configs + + +def main(): + configs_dict = get_configs() + configs = SimpleNamespace(**configs_dict) + + # Initialize the FlexFlow runtime. ff.init() takes a dictionary or the path to a JSON file with the configs + ff.init(configs_dict) + + # Create the FlexFlow LLM + ff_data_type = ( + ff.DataType.DT_FLOAT if configs.full_precision else ff.DataType.DT_HALF + ) + llm = ff.LLM( + configs.base_model, + data_type=ff_data_type, + cache_path=configs.cache_path, + refresh_cache=configs.refresh_cache, + output_file=configs.output_file, + ) + for peft_model_id in configs.peft_model_ids: + llm.add_peft(peft_model_id) + + # Compile the LLM for inference and load the weights into memory + generation_config = ff.GenerationConfig( + do_sample=False, temperature=0.9, topp=0.8, topk=1 + ) + llm.compile( + generation_config, + max_requests_per_batch=1, + max_seq_length=256, + max_tokens_per_batch=64, + ) + + llm.start_server() + + requests = [] + # Serving + if len(configs.prompt) > 0: + prompts = [s for s in json.load(open(configs.prompt))] + inference_requests = [ + Request(RequestType.REQ_INFERENCE, prompt=prompt, max_sequence_length=128) + for prompt in prompts + ] + requests += inference_requests + # Finetuning + if len(configs.finetuning_dataset) > 0: + for peft_model_id in configs.peft_model_ids: + finetuning_request = Request( + RequestType.REQ_FINETUNING, + max_sequence_length=128, + peft_model_id=peft_model_id, + dataset_filepath=configs.finetuning_dataset, + ) + requests.append(finetuning_request) + + llm.generate(requests) + + llm.stop_server() + + +if __name__ == "__main__": + print("flexflow PEFT example") + main() diff --git a/python/flexflow/core/flexflow_cffi.py b/python/flexflow/core/flexflow_cffi.py index 981f2be9ef..aa762fc1af 100644 --- a/python/flexflow/core/flexflow_cffi.py +++ b/python/flexflow/core/flexflow_cffi.py @@ -39,6 +39,7 @@ from .flexflowlib import ffi, flexflow_library from typing import Union, 
List + def ffc(): if not flexflow_already_initialized(): raise RuntimeError("Cannot use FlexFlow library before initializing FlexFlow") @@ -1244,650 +1245,646 @@ def get_weights(self, ffmodel): # ----------------------------------------------------------------------- -# FFModel +# SGDOptimizer # ----------------------------------------------------------------------- -class FFModel(object): - """ """ +class SGDOptimizer(object): + __slots__ = ["handle", "_handle"] - __slots__ = [ - "handle", - "_handle", - "_layers", - "_nb_layers", - "_ffconfig", - "_tracing_id", - "initializers", - "attr_tensors", - ] + def __init__( + self, ffmodel, lr=0.01, momentum=0.0, nesterov=False, weight_decay=0.0 + ): + self.handle = ffc().flexflow_sgd_optimizer_create( + ffmodel.handle, lr, momentum, nesterov, weight_decay + ) + self._handle = ffi.gc(self.handle, ffc().flexflow_sgd_optimizer_destroy) - def __init__(self, ffconfig): - """Constructor of FFModel. + def set_learning_rate(self, learning_rate): + ffc().flexflow_sgd_optimizer_set_lr(self.handle, learning_rate) - :param ffconfig: configurations of FlexFlow and the created model. - :type ffconfig: FFConfig - :returns: FFModel -- the model. - """ - self.handle = ffc().flexflow_model_create(ffconfig.handle, ffconfig.cpu_offload) - self._handle = ffi.gc(self.handle, ffc().flexflow_model_destroy) - self._layers = dict() - self._nb_layers = 0 - self._ffconfig = ffconfig - global ff_tracing_id - self._tracing_id = ff_tracing_id - ff_tracing_id += 1 - self.initializers = {} - self.attr_tensors = {} +# ----------------------------------------------------------------------- +# AdamOptimizer +# ----------------------------------------------------------------------- - def get_layers(self): - return self._layers - def add_layer(self, op_type, name): - layer_id = self._nb_layers - op_handle = ffc().flexflow_model_get_last_layer(self.handle) - self._layers[self._nb_layers] = convert_op_handle_to_op( - op_type, op_handle, idx=layer_id, name=name +class AdamOptimizer(object): + __slots__ = ["handle", "_handle"] + + def __init__( + self, + ffmodel, + alpha=0.001, + beta1=0.9, + beta2=0.999, + weight_decay=0.0, + epsilon=1e-8, + ): + self.handle = ffc().flexflow_adam_optimizer_create( + ffmodel.handle, alpha, beta1, beta2, weight_decay, epsilon ) - self._nb_layers += 1 + self._handle = ffi.gc(self.handle, ffc().flexflow_adam_optimizer_destroy) - def create_tensor(self, dims, data_type, create_grad=True): - """Instantiate a FlexFlow tensor. + def set_learning_rate(self, learning_rate): + ffc().flexflow_adam_optimizer_set_lr(self.handle, learning_rate) - :param x: a shape tuple/list (integers), including the batch size. - :type x: list of int - :param data_type: the datatype of the created tensor. Options are - DT_FLOAT, DT_DOUBLE, DT_INT32, DT_INT64, DT_BOOLEAN. - :type data_type: DataType +# ----------------------------------------------------------------------- +# Initializer +# ----------------------------------------------------------------------- +class Initializer(object): + __slots__ = ["handle", "p_handle"] - :param create_grad: weather the tensor creates a gradients vector. - If you don't specify anything, a gradients vector is used. 
- :type create_grad: bool + def __init__(self, handle, p_handle=0): + self.p_handle = ffi.new("flexflow_initializer_t *") + if handle == None: + self.p_handle.impl = ffi.NULL + else: + self.p_handle.impl = handle.impl + self.handle = self.p_handle[0] + assert ffi.typeof(self.handle) == ffi.typeof( + "flexflow_initializer_t" + ), "Initializer handle is wrong" - :returns: Tensor -- the output tensor. - """ - c_dims = ffi.new("int[]", dims) - c_data_type = enum_to_int(DataType, data_type) - num_dims = len(dims) - handle = ffc().flexflow_tensor_create( - self.handle, num_dims, c_dims, c_data_type, create_grad - ) - return Tensor(handle) - def map_tensor(self, tensor, parallel_op=None): - op_handle = self.__get_op_handle(parallel_op) - ffc().flexflow_tensor_map(self.handle, tensor.handle, op_handle) +# ----------------------------------------------------------------------- +# GlorotUniform +# ----------------------------------------------------------------------- - def create_constant(self, dims, value, data_type): - c_dims = ffi.new("int[]", dims) - c_data_type = enum_to_int(DataType, data_type) - num_dims = len(dims) - handle = ffc().flexflow_constant_create( - self.handle, num_dims, c_dims, value, c_data_type - ) - return Tensor(handle) - def exp(self, x, name=None): - """Exponential activation function. +class GlorotUniformInitializer(Initializer): + __slots__ = ["glorot_handle", "_glorot_handle"] - :param x: the input Tensor. - :type x: Tensor + def __init__(self, seed): + self.glorot_handle = ffc().flexflow_glorot_uniform_initializer_create(seed) + self._glorot_handle = ffi.gc( + self.glorot_handle, ffc().flexflow_glorot_uniform_initializer_destroy + ) + super(GlorotUniformInitializer, self).__init__(self.glorot_handle) - :param name: the name of the layer. Default is None. - :type name: string - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_exp(self.handle, x.handle, c_name) - self.add_layer(OpType.EXP, name) - return Tensor(handle, owner_op_type=OpType.EXP) +# ----------------------------------------------------------------------- +# ZeroInitializer +# ----------------------------------------------------------------------- - def sin(self, x, name=None): - """Elementwise sine function. - :param x: the input Tensor. - :type x: Tensor +class ZeroInitializer(Initializer): + __slots__ = ["zero_handle", "_zero_handle"] - :param name: the name of the layer. Default is None. - :type name: string + def __init__(self): + self.zero_handle = ffc().flexflow_zero_initializer_create() + self._zero_handle = ffi.gc( + self.zero_handle, ffc().flexflow_zero_initializer_destroy + ) + super(ZeroInitializer, self).__init__(self.zero_handle) - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_sin(self.handle, x.handle, c_name) - self.add_layer(OpType.SIN, name) - return Tensor(handle, owner_op_type=OpType.SIN) - def cos(self, x, name=None): - """Elementwise cosine function. +# ----------------------------------------------------------------------- +# UniformInitializer +# ----------------------------------------------------------------------- - :param x: the input Tensor. - :type x: Tensor - :param name: the name of the layer. Default is None. - :type name: string +class UniformInitializer(Initializer): + __slots__ = ["uniform_handle", "_uniform_handle"] - :returns: Tensor -- the output tensor. 
- """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_cos(self.handle, x.handle, c_name) - self.add_layer(OpType.COS, name) - return Tensor(handle, owner_op_type=OpType.COS) + def __init__(self, seed, minv, maxv): + self.uniform_handle = ffc().flexflow_uniform_initializer_create( + seed, minv, maxv + ) + self._uniform_handle = ffi.gc( + self.uniform_handle, ffc().flexflow_uniform_initializer_destroy + ) + super(UniformInitializer, self).__init__(self.uniform_handle) - def add(self, x, y, inplace_a=False, name=None): - """Layer that adds two input Tensors, :attr:`output = x + y`. - :param x: the first input Tensor. - :type x: Tensor +# ----------------------------------------------------------------------- +# NormInitializer +# ----------------------------------------------------------------------- - :param y: the second input Tensor. - :type y: Tensor - :param name: the name of the layer. Default is None. - :type name: string +class NormInitializer(Initializer): + __slots__ = ["norm_handle", "_norm_handle"] - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_add( - self.handle, x.handle, y.handle, inplace_a, c_name + def __init__(self, seed, mean, stddev): + self.norm_handle = ffc().flexflow_norm_initializer_create(seed, mean, stddev) + self._norm_handle = ffi.gc( + self.norm_handle, ffc().flexflow_norm_initializer_destroy ) - self.add_layer(OpType.ADD, name) - return Tensor(handle, owner_op_type=OpType.ADD) + super(NormInitializer, self).__init__(self.norm_handle) - def subtract(self, x, y, inplace_a=False, name=None): - """Layer that subtracts two input Tensors, :attr:`output = x * y`. - :param x: the first input Tensor. - :type x: Tensor +# ----------------------------------------------------------------------- +# PerfMetrics +# ----------------------------------------------------------------------- - :param y: the second input Tensor. - :type y: Tensor - :param name: the name of the layer. Default is None. - :type name: string +class PerfMetrics(object): + __slots__ = ["handle", "_handle"] - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_subtract( - self.handle, x.handle, y.handle, inplace_a, c_name - ) - self.add_layer(OpType.SUBTRACT, name) - return Tensor(handle, owner_op_type=OpType.SUBTRACT) + def __init__(self, handle): + self.handle = handle + self._handle = ffi.gc(self.handle, ffc().flexflow_per_metrics_destroy) - def multiply(self, x, y, inplace_a=False, name=None): - """Layer that multiplies (element-wise) two input Tensors, :attr:`output = x * y`. + def get_accuracy(self): + return ffc().flexflow_per_metrics_get_accuracy(self.handle) - :param x: the first input Tensor. - :type x: Tensor - :param y: the second input Tensor. - :type y: Tensor +# ----------------------------------------------------------------------- +# NetConfig +# ----------------------------------------------------------------------- - :param name: the name of the layer. Default is None. - :type name: string - :returns: Tensor -- the output tensor. 
- """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_multiply( - self.handle, x.handle, y.handle, inplace_a, c_name - ) - self.add_layer(OpType.MULTIPLY, name) - return Tensor(handle, owner_op_type=OpType.MULTIPLY) +class NetConfig(object): + def __init__(self): + self.handle = ffc().flexflow_net_config_create() + self._handle = ffi.gc(self.handle, ffc().flexflow_net_config_destroy) + cpath = ffc().flexflow_net_config_get_dataset_path(self.handle) + self.dataset_path = ffi.string(cpath) - def divide(self, x, y, inplace_a=False, name=None): - """Layer that divides (element-wise) two input Tensors, :attr:`output = x / y`. - :param x: the first input Tensor. - :type x: Tensor +# ----------------------------------------------------------------------- +# DLRMConfig +# ----------------------------------------------------------------------- - :param y: the second input Tensor. - :type y: Tensor - :param name: the name of the layer. Default is None. - :type name: string +class DLRMConfig(object): + def __init__(self): + self.handle = ffc().flexflow_dlrm_config_create() + self._handle = ffi.gc(self.handle, ffc().flexflow_dlrm_config_destroy) - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_divide( - self.handle, x.handle, y.handle, inplace_a, c_name - ) - self.add_layer(OpType.DIVIDE, name) - return Tensor(handle, owner_op_type=OpType.DIVIDE) + cstr = ffc().flexflow_dlrm_config_get_dataset_path(self.handle) + self.dataset_path = ffi.string(cstr) - def max(self, x, y, inplace_a=False, name=None): - """Layer that computes the max (element-wise) two input Tensors, :attr:`output = max(x,y)`. + cstr = ffc().flexflow_dlrm_config_get_arch_interaction_op(self.handle) + self.arch_interaction_op = ffi.string(cstr) - :param x: the first input Tensor. - :type x: Tensor + self.sparse_feature_size = ffc().flexflow_dlrm_config_get_sparse_feature_size( + self.handle + ) + self.sigmoid_bot = ffc().flexflow_dlrm_config_get_sigmoid_bot(self.handle) + self.sigmoid_top = ffc().flexflow_dlrm_config_get_sigmoid_top(self.handle) + self.embedding_bag_size = ffc().flexflow_dlrm_config_get_embedding_bag_size( + self.handle + ) + self.loss_threshold = ffc().flexflow_dlrm_config_get_loss_threshold(self.handle) - :param y: the second input Tensor. - :type y: Tensor + mlp_bot_c = ffc().flexflow_dlrm_config_get_mlp_bot(self.handle) + self.mlp_bot = [] + for i in range(0, mlp_bot_c[0]): + self.mlp_bot.append(mlp_bot_c[i + 1]) - :param name: the name of the layer. Default is None. - :type name: string + mlp_top_c = ffc().flexflow_dlrm_config_get_mlp_top(self.handle) + self.mlp_top = [] + for i in range(0, mlp_top_c[0]): + self.mlp_top.append(mlp_top_c[i + 1]) - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_max( - self.handle, x.handle, y.handle, inplace_a, c_name - ) - self.add_layer(OpType.MAX, name) - return Tensor(handle, owner_op_type=OpType.MAX) + embedding_size_c = ffc().flexflow_dlrm_config_get_embedding_size(self.handle) + self.embedding_size = [] + for i in range(0, embedding_size_c[0]): + self.embedding_size.append(embedding_size_c[i + 1]) - def min(self, x, y, inplace_a=False, name=None): - """Layer that computes the min (element-wise) two input Tensors, :attr:`output = min(x,y)`. - :param x: the first input Tensor. 
- :type x: Tensor +# ----------------------------------------------------------------------- +# Single DataLoader +# ----------------------------------------------------------------------- - :param y: the second input Tensor. - :type y: Tensor - :param name: the name of the layer. Default is None. - :type name: string +class SingleDataLoader(object): + __slots__ = ["handle", "_handle"] - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_min( - self.handle, x.handle, y.handle, inplace_a, c_name + def __init__(self, ffmodel, input, full_input, num_samples, data_type): + assert type(ffmodel) is FFModel, "SingleDataLoader ffmodel is wrong" + assert type(input) is Tensor, "SingleDataLoader input is wrong" + if type(full_input) is Tensor: + self.init_from_tensor(ffmodel, input, full_input, num_samples, data_type) + else: + self.init_from_ptr(ffmodel, input, full_input, num_samples, data_type) + self._handle = ffi.gc(self.handle, ffc().flexflow_single_dataloader_destroy) + + def init_from_tensor(self, ffmodel, input, full_input, num_samples, data_type): + assert type(full_input) is Tensor, "SingleDataLoader full_input is wrong" + c_data_type = enum_to_int(DataType, data_type) + self.handle = ffc().flexflow_single_dataloader_create( + ffmodel.handle, input.handle, full_input.handle, num_samples, c_data_type ) - self.add_layer(OpType.MIN, name) - return Tensor(handle, owner_op_type=OpType.MIN) - def reduce_sum(self, input, axes, keepdims=False, name=None): - """Layer that computes the sum of the input Tensor along given axes. + def init_from_ptr(self, ffmodel, input, full_input, num_samples, data_type): + # assert type(full_input) is Tensor, "SingleDataLoader full_input is wrong" + c_data_type = enum_to_int(DataType, data_type) + self.handle = ffc().flexflow_single_dataloader_create2( + ffmodel.handle, input.handle, full_input, num_samples, c_data_type + ) - :param input: the input Tensor. - :type input: Tensor + @property + def num_samples(self): + return ffc().flexflow_single_dataloader_get_num_samples(self.handle) - :param axes: the axes along which reduction is applied - :type axes: List[int] + @num_samples.setter + def num_samples(self, samples): + ffc().flexflow_single_dataloader_set_num_samples(self.handle, samples) - :param name: the name of the layer. Default is None. - :type name: string + def next_batch(self, ffmodel): + """Ask the dataloder to load the next batch to the :attr:`batch_tensor`. - :returns: Tensor -- the output tensor. + :returns: None -- no returns. """ - c_name = get_c_name(name) - c_axes = ffi.new("int[]", axes) - handle = ffc().flexflow_model_add_reduce_sum( - self.handle, input.handle, c_axes, len(axes), keepdims, c_name - ) - self.add_layer(OpType.REDUCE_SUM, name) - return Tensor(handle, owner_op_type=OpType.REDUCE_SUM) + ffc().flowflow_single_dataloader_next_batch(self.handle, ffmodel.handle) - def rsqrt(self, input, name=None): - """Layer that computes the element-wise reciprocal square-root. + def reset(self): + """Reset the current position of the dataloder to 0. - :param input: the input Tensor. - :type input: Tensor + :returns: None -- no returns. + """ + ffc().flexflow_single_dataloader_reset(self.handle) - :param name: the name of the layer. Default is None. - :type name: string - :returns: Tensor -- the output tensor. 
- """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_rsqrt(self.handle, input.handle, c_name) - self.add_layer(OpType.RSQRT, name) - return Tensor(handle, owner_op_type=OpType.RSQRT) +class RegionNdarray(object): + __slots__ = ["__array_interface__"] - def pow(self, input, exponent, name=None): - """Layer that computes the element-wise power. + def __init__(self, shape, data_type, base_ptr, strides, read_only): + # See: https://docs.scipy.org/doc/numpy/reference/arrays.interface.html + if data_type == DataType.DT_HALF: + field_type = " requests; - int finetuning_req_idx = 0; + int finetuning_req_idx = 0; for (int i = 0; i < num_requests; i++) { if (request_types[i] == RequestType::REQ_INFERENCE) { std::string const text_str(input_texts[i]); Request inference_req; inference_req.prompt = text_str; inference_req.max_sequence_length = max_seq_lengths[i]; - if (peft_model_ids[i] != nullptr) { - PEFTModelID *peft_model_id = FFCObjectWrapper::unwrap(peft_model_ids[i]); + PEFTModelID *peft_model_id = FFCObjectWrapper::unwrap(peft_model_ids[i]); + if (peft_model_id != nullptr) { inference_req.peft_model_id = *peft_model_id; } requests.push_back(inference_req); @@ -1646,14 +1646,14 @@ void flexflow_model_generate(flexflow_model_t handle_, Request fine_tuning_req; fine_tuning_req.req_type = RequestType::REQ_FINETUNING; fine_tuning_req.max_sequence_length = max_seq_lengths[i]; - if (peft_model_ids[i] != nullptr) { - PEFTModelID *peft_model_id = FFCObjectWrapper::unwrap(peft_model_ids[i]); + PEFTModelID *peft_model_id = FFCObjectWrapper::unwrap(peft_model_ids[i]); + if (peft_model_id != nullptr) { fine_tuning_req.peft_model_id = *peft_model_id; } std::string const dataset_fp(dataset_filepaths[finetuning_req_idx]); fine_tuning_req.dataset_filepath = dataset_fp; fine_tuning_req.max_training_steps = training_steps[finetuning_req_idx]; - requests.push_back(finetuning_req_idx); + requests.push_back(fine_tuning_req); DEBUG_PRINT("[Model] generate[%d] %p %s %i %i", i, handle, @@ -1668,11 +1668,11 @@ void flexflow_model_generate(flexflow_model_t handle_, for (int i = 0; i < num_requests; i++) { if (request_types[i] == RequestType::REQ_INFERENCE) { - // If the prompt exceeds max seq len, check that we return the prompt with no - // additional token. Otherwise, check that the output does not exceed the max - // sequence length. - assert(results[i].output_tokens.size() <= max_seq_length || - results[i].output_tokens.size() == results[i].input_tokens.size()); + // If the prompt exceeds max seq len, check that we return the prompt with + // no additional token. Otherwise, check that the output does not exceed + // the max sequence length. 
+ assert(results[i].output_tokens.size() <= max_seq_lengths[i] || + results[i].output_tokens.size() == results[i].input_tokens.size()); output_length_and_tokens[i][0] = results[i].output_tokens.size(); std::copy(results[i].output_tokens.begin(), results[i].output_tokens.end(), diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 8fb040fb6d..0afde30c64 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -30,6 +31,7 @@ namespace FlexFlow { using namespace Legion; using tokenizers::Tokenizer; +using json = nlohmann::json; LegionRuntime::Logger::Category log_req_mgr("RequestManager"); @@ -242,7 +244,7 @@ RequestManager::RequestGuid request.guid = next_available_guid++; request.max_sequence_length = request_.max_sequence_length; request.peft_model_id = request_.peft_model_id; - request.req_type = Request::REQ_FINETUNING; + request.req_type = RequestType::REQ_FINETUNING; request.completed_training_steps = 0; request.max_training_steps = request_.max_training_steps; request.dataset_filepath = request_.dataset_filepath; @@ -385,7 +387,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, size_t guid = old_bc.requestsInfo[old_bc.tokensInfo[i].request_index].request_guid; Request &request = all_requests[guid]; - if (request.req_type == Request::REQ_FINETUNING) { + if (request.req_type == RequestType::REQ_FINETUNING) { // No new tokens generated when in fine-tuning mode continue; } else if (old_bc.tokensInfo[i].abs_depth_in_request + 1 < @@ -415,7 +417,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, assert(old_bc.requestsInfo[i].num_tokens_in_batch > 0); Request &request = all_requests[old_bc.requestsInfo[i].request_guid]; - if (request.req_type == Request::REQ_FINETUNING) { + if (request.req_type == RequestType::REQ_FINETUNING) { // fine-tuning requests don't automatically carry over to the next // batch, we only do so if there is space left after adding new // inference requests @@ -575,7 +577,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, if (!pending_infr_request_queue.empty() && new_bc.num_tokens < get_max_tokens_per_batch()) { Request new_request = pending_infr_request_queue.front(); - assert(new_request.req_type == Request::REQ_INFERENCE); + assert(new_request.req_type == RequestType::REQ_INFERENCE); pending_infr_request_queue.pop(); // all_requests[new_request.guid] = new_request; @@ -617,9 +619,9 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, // Step 4: add PEFT bwd requests, if there is additional space while (pending_peft_request_queue.size() > 0) { Request &request = pending_peft_request_queue.front(); - assert(request.req_type = Request::REQ_FINETUNING); + assert(request.req_type = RequestType::REQ_FINETUNING); Request &all_req_handle = all_requests[request.guid]; - assert(all_req_handle.req_type = Request::REQ_FINETUNING); + assert(all_req_handle.req_type = RequestType::REQ_FINETUNING); if (all_req_handle.status == Request::COMPLETED) { pending_peft_request_queue.pop(); } else { @@ -628,11 +630,11 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, } if (pending_peft_request_queue.size() > 0) { Request &request = pending_peft_request_queue.front(); - assert(request.req_type = Request::REQ_FINETUNING); + assert(request.req_type = RequestType::REQ_FINETUNING); assert(request.dataset.size() > 0); // update status and 
training steps Request &all_req_handle = all_requests[request.guid]; - assert(all_req_handle.req_type = Request::REQ_FINETUNING); + assert(all_req_handle.req_type = RequestType::REQ_FINETUNING); request.completed_training_steps = all_req_handle.completed_training_steps; request.status = all_req_handle.status; assert(request.status != Request::COMPLETED); @@ -2424,7 +2426,7 @@ std::vector std::vector guids; for (int i = 0; i < requests.size(); i++) { RequestManager::RequestGuid guid; - if (requests.at(i).req_type == Request::REQ_INFERENCE) { + if (requests.at(i).req_type == RequestType::REQ_INFERENCE) { guid = rm->register_new_request(requests.at(i)); } else { guid = rm->register_new_peft_request(requests.at(i)); diff --git a/tests/peft_test.sh b/tests/peft_test.sh index 778b225a26..9b4a5204ac 100755 --- a/tests/peft_test.sh +++ b/tests/peft_test.sh @@ -25,4 +25,8 @@ export LEGION_BACKTRACE=1 python ../inference/utils/download_peft_model.py goliaro/llama-160m-lora-full --base_model_name JackFram/llama-160m # if first time, add: --refresh-cache +# CPP test ../build/inference/peft/peft -ll:gpu 1 -ll:cpu 4 -ll:fsize 8192 -ll:zsize 12000 -ll:util 4 -llm-model JackFram/llama-160m -prompt ../inference/prompt/peft.json -peft-model goliaro/llama-160m-lora-full --use-full-precision --inference-debugging --fusion -enable-peft + +# Python test +python ../inference/python/ff_peft.py From c3e62d004dd2ca7d6ef71b64ac2cba1aa4f23539 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 25 Mar 2024 19:23:11 +0000 Subject: [PATCH 29/32] fix --- python/flexflow/serve/serve.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py index 0960dcaf90..63582038ac 100644 --- a/python/flexflow/serve/serve.py +++ b/python/flexflow/serve/serve.py @@ -413,7 +413,7 @@ def compile( # Add PEFT layer if registered for _, peft_dict in self.pefts.items(): ff_peft_config = peft_dict["ff_peft_config"] - ff_peft_model_id = self.model.add_lora_layer(ff_peft_config) + ff_peft_model_id = self.model.ffmodel.add_lora_layer(ff_peft_config) peft_dict["ff_peft_model_id"] = ff_peft_model_id # Download the weights from huggingface (if needed) From ce9803abacb050fcb1cf0fb360543dbc04cd96a5 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Tue, 26 Mar 2024 19:16:42 +0000 Subject: [PATCH 30/32] fixes --- include/flexflow/flexflow_c.h | 3 +++ inference/python/ff_peft.py | 12 +++++---- python/flexflow/core/flexflow_cffi.py | 13 ++++++--- python/flexflow/serve/serve.py | 39 ++++++++++++++++++--------- 4 files changed, 46 insertions(+), 21 deletions(-) diff --git a/include/flexflow/flexflow_c.h b/include/flexflow/flexflow_c.h index 8150e05dd1..004523e875 100644 --- a/include/flexflow/flexflow_c.h +++ b/include/flexflow/flexflow_c.h @@ -595,6 +595,9 @@ flexflow_tensor_t flexflow_model_add_argmax(flexflow_model_t handle_, bool beam_search, char const *name); +flexflow_peft_model_id_t flexflow_model_add_lora_layer( + flexflow_model_t handle_, const flexflow_lora_linear_config_t peft_config_); + void flexflow_model_set_sgd_optimizer(flexflow_model_t handle, flexflow_sgd_optimizer_t optimizer); diff --git a/inference/python/ff_peft.py b/inference/python/ff_peft.py index 18ef8bbf33..b5242945b6 100644 --- a/inference/python/ff_peft.py +++ b/inference/python/ff_peft.py @@ -54,7 +54,7 @@ def get_configs(): "offload_reserve_space_size": 8 * 1024, # 8GB "use_4bit_quantization": False, "use_8bit_quantization": False, - "enable_peft": False, + "enable_peft": True, 
"peft_activation_reserve_space_size": 1024, # 1GB "peft_weight_reserve_space_size": 1024, # 1GB "profiling": False, @@ -121,17 +121,19 @@ def main(): if len(configs.prompt) > 0: prompts = [s for s in json.load(open(configs.prompt))] inference_requests = [ - Request(RequestType.REQ_INFERENCE, prompt=prompt, max_sequence_length=128) + ff.Request( + ff.RequestType.REQ_INFERENCE, prompt=prompt, max_sequence_length=128 + ) for prompt in prompts ] requests += inference_requests # Finetuning if len(configs.finetuning_dataset) > 0: for peft_model_id in configs.peft_model_ids: - finetuning_request = Request( - RequestType.REQ_FINETUNING, + finetuning_request = ff.Request( + ff.RequestType.REQ_FINETUNING, max_sequence_length=128, - peft_model_id=peft_model_id, + peft_model_id=llm.get_ff_peft_id(peft_model_id), dataset_filepath=configs.finetuning_dataset, ) requests.append(finetuning_request) diff --git a/python/flexflow/core/flexflow_cffi.py b/python/flexflow/core/flexflow_cffi.py index aa762fc1af..82c3eb059c 100644 --- a/python/flexflow/core/flexflow_cffi.py +++ b/python/flexflow/core/flexflow_cffi.py @@ -1781,10 +1781,10 @@ def __init__( self, req_type: RequestType, prompt: str = None, - max_sequence_length: int = None, + max_sequence_length: int = 128, peft_model_id: PEFTModelID = None, dataset_filepath: str = None, - max_training_steps: int = None, + max_training_steps: int = 1, ): self.req_type = req_type self.prompt = prompt @@ -4013,6 +4013,11 @@ def argmax(self, input, beam_search, name=None): self.add_layer(OpType.ARGMAX, name) return Tensor(handle, owner_op_type=OpType.ARGMAX) + def add_lora_layer(self, peft_config): + handle = ffc().flexflow_model_add_lora_layer(self.handle, peft_config.handle) + return handle + # self.add_layer(OpType.LORA, name) + def reset_metrics(self): """Reset performance metrics. @@ -4442,7 +4447,9 @@ def generate(self, requests_list: List[Request]): request.max_sequence_length for request in requests_list ] peft_model_ids = [request.peft_model_id for request in requests_list] - dataset_filepaths = [request.dataset_filepath for request in requests_list] + dataset_filepaths = [ + get_c_name(request.dataset_filepath) for request in requests_list + ] training_steps = [request.max_training_steps for request in requests_list] ffc().flexflow_model_generate( self.handle, diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py index 63582038ac..a9efee341f 100644 --- a/python/flexflow/serve/serve.py +++ b/python/flexflow/serve/serve.py @@ -133,6 +133,18 @@ def add_peft(self, peft_model_id: str): } self.pefts[peft_model_id] = peft_dict + def get_ff_peft_id(self, peft_model_id: str) -> PEFTModelID: + if peft_model_id not in self.pefts: + raise ValueError( + f"PEFT {peft_model_id} not registered with LLM {self.model_name}" + ) + peft_dict = self.pefts[peft_model_id] + if "ff_peft_model_id" not in peft_dict: + raise RuntimeError( + f"Attempting to run PEFT {peft_model_id} before compiling LLM {self.model_name}" + ) + return peft_dict["ff_peft_model_id"] + def download_hf_config(self): """Save the HuggingFace model configs to a json file. 
Useful mainly to run the C++ inference code.""" config_dir = os.path.join( @@ -224,10 +236,9 @@ def get_hf_llm(model_name): ) def download_llm_weights(): - weights_path = get_weights_path(self.model_name) refresh_cache_if_needed(self.model_name) ff_revision, ff_revision_file, latest_revision = self.__get_revision_hashes( - self.model_name, weights_path + self.model_name, self.weights_path ) if ff_revision != latest_revision: print( @@ -235,7 +246,7 @@ def download_llm_weights(): ) hf_model = get_hf_llm(self.model_name) # Convert the model to FlexFlow format - self.model_class.convert_hf_model(hf_model, weights_path) + self.model_class.convert_hf_model(hf_model, self.weights_path) # Save new revision hash to file with open(ff_revision_file, "w+") as f: f.write(latest_revision) @@ -257,7 +268,7 @@ def convert_peft_model(hf_peft_model, peft_type, weights_path): def download_peft_weights(): for peft_model_id, peft_dict in self.pefts.items(): peft_config = peft_dict["peft_config"] - peft_type = peft_config["peft_type"] + peft_type = peft_dict["peft_type"] weights_path = get_weights_path(peft_model_id) refresh_cache_if_needed(peft_model_id) @@ -285,6 +296,7 @@ def download_peft_weights(): gc.collect() torch.cuda.empty_cache() + self.weights_path = get_weights_path(self.model_name) download_llm_weights() download_peft_weights() @@ -295,24 +307,24 @@ def download_hf_tokenizer_if_needed(self): print("Loading tokenizer...") # Use local cache, or download new version - tokenizer_path = os.path.join( + self.tokenizer_path = os.path.join( os.path.expanduser(self.cache_path), "tokenizers", self.model_name.lower(), ) if self.refresh_cache: print( - f"Refreshing cached tokenizer for model {self.model_name} at path {tokenizer_path} ..." + f"Refreshing cached tokenizer for model {self.model_name} at path {self.tokenizer_path} ..." 
) - if os.path.exists(tokenizer_path): - shutil.rmtree(tokenizer_path) - if not os.path.exists(tokenizer_path): - print(f"Creating directory {tokenizer_path} (if it doesn't exist)...") - os.makedirs(tokenizer_path, exist_ok=True) + if os.path.exists(self.tokenizer_path): + shutil.rmtree(self.tokenizer_path) + if not os.path.exists(self.tokenizer_path): + print(f"Creating directory {self.tokenizer_path} (if it doesn't exist)...") + os.makedirs(self.tokenizer_path, exist_ok=True) # Get local revision SHA, check if it matches latest one on huggingface ff_revision, ff_revision_file, latest_revision = self.__get_revision_hashes( - self.model_name, tokenizer_path + self.model_name, self.tokenizer_path ) if ff_revision != latest_revision: @@ -327,7 +339,7 @@ def download_hf_tokenizer_if_needed(self): else: hf_tokenizer = AutoTokenizer.from_pretrained(self.model_name) # Save tokenizer - hf_tokenizer.save_pretrained(tokenizer_path) + hf_tokenizer.save_pretrained(self.tokenizer_path) print("Done updating HF tokenizer.") # Save new revision hash to file with open(ff_revision_file, "w+") as f: @@ -490,6 +502,7 @@ def generate( requests_or_prompts, max_length ) else: + print(requests_or_prompts) return self.model.ffmodel.generate(requests_or_prompts) else: assert False, "Please pass a non-empty string or list of strings" From b841789b3152961ff0bd6bfc7657720982664579 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Tue, 26 Mar 2024 22:10:59 +0000 Subject: [PATCH 31/32] fix --- python/flexflow/serve/serve.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py index a9efee341f..538abe3858 100644 --- a/python/flexflow/serve/serve.py +++ b/python/flexflow/serve/serve.py @@ -123,13 +123,9 @@ def add_peft(self, peft_model_id: str): raise RuntimeError( f"Attempting to add PEFT with base model name {peft_config.base_model_name_or_path} to LLM {self.model_name}" ) - ff_peft_config = LoraLinearConfig( - os.path.expanduser(self.cache_path), peft_model_id - ) peft_dict = { "peft_config": peft_config, "peft_type": peft_type, - "ff_peft_config": ff_peft_config, } self.pefts[peft_model_id] = peft_dict @@ -158,12 +154,14 @@ def download_hf_config(self): # Save PEFT configs if the LLM has any registered PEFTs for peft_model_id, peft_dict in self.pefts.items(): - peft_config = peft_dict["hf_config"] - peft_config_path = os.path.join( + peft_config = peft_dict["peft_config"] + peft_config_dir = os.path.join( os.path.expanduser(self.cache_path), "configs", - self.peft_model_id.lower(), + peft_model_id.lower(), ) + os.makedirs(peft_config_dir, exist_ok=True) + peft_config_path = os.path.join(peft_config_dir, "config.json") print(f"Saving {peft_model_id} configs to file {peft_config_path}...") with open(peft_config_path, "w") as json_file: @@ -423,8 +421,11 @@ def compile( ) # Add PEFT layer if registered - for _, peft_dict in self.pefts.items(): - ff_peft_config = peft_dict["ff_peft_config"] + for peft_model_id, peft_dict in self.pefts.items(): + # ff_peft_config = peft_dict["ff_peft_config"] + ff_peft_config = LoraLinearConfig( + os.path.expanduser(self.cache_path), peft_model_id + ) ff_peft_model_id = self.model.ffmodel.add_lora_layer(ff_peft_config) peft_dict["ff_peft_model_id"] = ff_peft_model_id From c31f6b131e66229dd45ed3d563585579a3093c81 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 27 Mar 2024 17:10:19 +0000 Subject: [PATCH 32/32] fixes --- include/flexflow/request_manager.h | 1 + 
inference/python/ff_peft.py | 10 +++--- python/flexflow/serve/models/falcon.py | 3 +- python/flexflow/serve/models/llama.py | 3 +- python/flexflow/serve/models/mpt.py | 3 +- python/flexflow/serve/models/opt.py | 3 +- python/flexflow/serve/models/starcoder.py | 3 +- src/runtime/request_manager.cc | 43 +++++++++++++++++++++++ 8 files changed, 59 insertions(+), 10 deletions(-) diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index 0ef5efcf27..bf6e475cbb 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -84,6 +84,7 @@ struct Request { std::vector, std::vector>> dataset; + friend std::ostream &operator<<(std::ostream &os, Request const &req); }; // store the result of beam search diff --git a/inference/python/ff_peft.py b/inference/python/ff_peft.py index b5242945b6..38a25fb614 100644 --- a/inference/python/ff_peft.py +++ b/inference/python/ff_peft.py @@ -41,15 +41,15 @@ def get_configs(): # Define sample configs ff_init_configs = { # required parameters - "num_gpus": 2, - "memory_per_gpu": 14000, - "zero_copy_memory_per_node": 40000, + "num_gpus": 1, + "memory_per_gpu": 8192, + "zero_copy_memory_per_node": 12000, # optional parameters "num_cpus": 4, "legion_utility_processors": 4, "data_parallelism_degree": 1, "tensor_parallelism_degree": 1, - "pipeline_parallelism_degree": 2, + "pipeline_parallelism_degree": 1, "offload": False, "offload_reserve_space_size": 8 * 1024, # 8GB "use_4bit_quantization": False, @@ -58,7 +58,7 @@ def get_configs(): "peft_activation_reserve_space_size": 1024, # 1GB "peft_weight_reserve_space_size": 1024, # 1GB "profiling": False, - "inference_debugging": False, + "inference_debugging": True, "fusion": True, } model_configs = { diff --git a/python/flexflow/serve/models/falcon.py b/python/flexflow/serve/models/falcon.py index e4d7786262..0176a1dda1 100644 --- a/python/flexflow/serve/models/falcon.py +++ b/python/flexflow/serve/models/falcon.py @@ -233,7 +233,8 @@ def build_model(self, max_tokens_per_batch): output = ffmodel.sampling(softmax, self.generation_config.topp) else: # output = ffmodel.arg_top_k(lm_head, 1, False) - output = ffmodel.argmax(lm_head, False) + softmax = ffmodel.softmax(lm_head, -1) + output = ffmodel.argmax(softmax, False) self.ffmodel = ffmodel diff --git a/python/flexflow/serve/models/llama.py b/python/flexflow/serve/models/llama.py index 6aef540342..947878f706 100644 --- a/python/flexflow/serve/models/llama.py +++ b/python/flexflow/serve/models/llama.py @@ -241,7 +241,8 @@ def build_model(self, max_tokens_per_batch): output = ffmodel.sampling(softmax, self.generation_config.topp) else: # output = ffmodel.arg_top_k(dense, 1, False) - output = ffmodel.argmax(dense, False) + softmax = ffmodel.softmax(dense, -1) + output = ffmodel.argmax(softmax, False) self.ffmodel = ffmodel diff --git a/python/flexflow/serve/models/mpt.py b/python/flexflow/serve/models/mpt.py index 76f7d69c73..1d1837c478 100644 --- a/python/flexflow/serve/models/mpt.py +++ b/python/flexflow/serve/models/mpt.py @@ -244,7 +244,8 @@ def build_model(self, max_tokens_per_batch): softmax = ffmodel.softmax(dense, -1) output = ffmodel.sampling(softmax, self.generation_config.topp) else: - output = ffmodel.argmax(lm_head, False) + softmax = ffmodel.softmax(lm_head, -1) + output = ffmodel.argmax(softmax, False) self.ffmodel = ffmodel diff --git a/python/flexflow/serve/models/opt.py b/python/flexflow/serve/models/opt.py index f725a08e65..cde25f2241 100644 --- a/python/flexflow/serve/models/opt.py +++ 
b/python/flexflow/serve/models/opt.py @@ -273,7 +273,8 @@ def build_model(self, max_tokens_per_batch): output = ffmodel.sampling(softmax, self.generation_config.topp) else: # output = ffmodel.arg_top_k(lm_head, 1, False) - output = ffmodel.argmax(lm_head, False) + softmax = ffmodel.softmax(lm_head, -1) + output = ffmodel.argmax(softmax, False) self.ffmodel = ffmodel diff --git a/python/flexflow/serve/models/starcoder.py b/python/flexflow/serve/models/starcoder.py index 8ed8fcfa18..80b4be10bb 100644 --- a/python/flexflow/serve/models/starcoder.py +++ b/python/flexflow/serve/models/starcoder.py @@ -212,7 +212,8 @@ def build_model(self, max_tokens_per_batch): softmax = ffmodel.softmax(dense, -1) output = ffmodel.sampling(softmax, self.generation_config.topp) else: - output = ffmodel.argmax(lm_head, False) + softmax = ffmodel.softmax(lm_head, -1) + output = ffmodel.argmax(softmax, False) self.ffmodel = ffmodel diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 0afde30c64..9dc0361316 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -47,6 +47,48 @@ std::string LoadBytesFromFile(std::string const &path) { return data; } +std::ostream &operator<<(std::ostream &os, Request const &req) { + os << "Request {\n"; + os << " guid: " << req.guid << "\n"; + os << " peft_model_id: " << req.peft_model_id << "\n"; + os << " max_sequence_length: " << req.max_sequence_length << "\n"; + os << " initial_len: " << req.initial_len << "\n"; + os << " ssm_cache_size: " << req.ssm_cache_size << "\n"; + os << " llm_cache_size: " << req.llm_cache_size << "\n"; + os << " status: " << static_cast(req.status) << "\n"; + os << " tokens: ["; + for (auto const &token : req.tokens) { + os << token << " "; + } + os << "]\n"; + os << " prompt: " << req.prompt << "\n"; + // os << " beam_trees: ["; + // for (const auto& tree : req.beam_trees) { + // // Assuming BeamTree has its own << operator defined + // os << tree << " "; + // } + // os << "]\n"; + os << " req_type: " << static_cast(req.req_type) << "\n"; + os << " completed_training_steps: " << req.completed_training_steps << "\n"; + os << " max_training_steps: " << req.max_training_steps << "\n"; + os << " dataset_filepath: " << req.dataset_filepath << "\n"; + os << " dataset: ["; + for (auto const &pair : req.dataset) { + os << "["; + for (auto const &token : pair.first) { + os << token << " "; + } + os << "], ["; + for (auto const &token : pair.second) { + os << token << " "; + } + os << "] "; + } + os << "]\n"; + os << "}\n"; + return os; +} + RequestManager::RequestManager() : request_manager_status(INITIALIZED), verbose(false), next_available_guid(1000000), num_processed_requests(0), @@ -242,6 +284,7 @@ RequestManager::RequestGuid Request request; request.status = Request::PENDING; request.guid = next_available_guid++; + request.initial_len = 0; request.max_sequence_length = request_.max_sequence_length; request.peft_model_id = request_.peft_model_id; request.req_type = RequestType::REQ_FINETUNING;
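
Note on the request API exercised by the ff_peft.py changes above: inference and finetuning now go through the same generate() call, each ff.Request carries its request type, and a finetuning request references the compiled adapter handle returned by LLM.get_ff_peft_id() rather than the HuggingFace adapter name. The sketch below is illustrative only: it assumes an llm object that was already created, had add_peft() called, and was compiled as in inference/python/ff_peft.py, and the file paths and adapter name are placeholders.

import json
import flexflow.serve as ff

def build_mixed_requests(llm, prompt_file, dataset_file, adapter_name):
    # The prompt file is a JSON list of strings (as loaded in ff_peft.py above).
    prompts = [s for s in json.load(open(prompt_file))]
    requests = [
        ff.Request(ff.RequestType.REQ_INFERENCE, prompt=p, max_sequence_length=128)
        for p in prompts
    ]
    # Finetuning requests pass the compiled PEFT handle; get_ff_peft_id()
    # raises if the adapter was never registered or the LLM is not compiled yet.
    requests.append(
        ff.Request(
            ff.RequestType.REQ_FINETUNING,
            max_sequence_length=128,
            peft_model_id=llm.get_ff_peft_id(adapter_name),
            dataset_filepath=dataset_file,
        )
    )
    return requests

# Example usage (paths and adapter name are placeholders):
# llm.generate(build_mixed_requests(llm, "prompt.json", "dataset.json",
#                                   "goliaro/llama-160m-lora-full"))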
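The download_llm_weights / download_hf_tokenizer_if_needed refactor in serve.py keeps reusing one pattern: record the HuggingFace revision hash the cached artifacts were built from, and re-download and re-convert only when it no longer matches the latest upstream revision. A generic sketch of that pattern follows; fetch_latest_revision and rebuild_cache are hypothetical stand-ins for the hub query and the FlexFlow conversion step, and the revision file name is made up, since __get_revision_hashes itself is not part of these patches.

import os

def refresh_if_stale(cache_dir, fetch_latest_revision, rebuild_cache):
    # Hypothetical helpers: fetch_latest_revision() -> str queries the hub,
    # rebuild_cache(cache_dir) re-downloads and converts the artifacts.
    os.makedirs(cache_dir, exist_ok=True)
    revision_file = os.path.join(cache_dir, "rev_sha.txt")  # name is illustrative
    cached = None
    if os.path.isfile(revision_file):
        with open(revision_file) as f:
            cached = f.read().strip()
    latest = fetch_latest_revision()
    if cached != latest:
        rebuild_cache(cache_dir)
        # Record the new revision, mirroring the "w+" write in serve.py.
        with open(revision_file, "w+") as f:
            f.write(latest)
    return latest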
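For reference, the cache locations touched by these serve.py changes: PEFT configs now land in their own per-adapter directory under configs/, and the tokenizer path is kept on the LLM object instead of a local variable. The helper below only recomputes those locations; the weights sub-directory name is an assumption, since get_weights_path() is not shown in this series.

import os

def cache_layout(cache_path, model_name, peft_model_id):
    root = os.path.expanduser(cache_path)
    return {
        # written by download_hf_config() for each registered adapter
        "peft_config": os.path.join(root, "configs", peft_model_id.lower(), "config.json"),
        # written by download_hf_tokenizer_if_needed()
        "tokenizer": os.path.join(root, "tokenizers", model_name.lower()),
        # assumed layout for get_weights_path(); not shown in these patches
        "weights": os.path.join(root, "weights", model_name.lower()),
    }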