From 5f02b7c45132f5d556d7826adce6531449280046 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 19 Feb 2024 20:42:24 +0000 Subject: [PATCH 01/32] update script --- tests/peft/hf_serve.py | 70 ++++++++++++++++++++++++++---------------- 1 file changed, 43 insertions(+), 27 deletions(-) diff --git a/tests/peft/hf_serve.py b/tests/peft/hf_serve.py index 1fde4d5a50..7bfc560cc2 100644 --- a/tests/peft/hf_serve.py +++ b/tests/peft/hf_serve.py @@ -1,6 +1,6 @@ import argparse import torch -import os, sys, shutil +import os, sys, shutil, json from peft import PeftModel, PeftConfig from transformers import ( AutoModelForCausalLM, @@ -40,11 +40,12 @@ def peft_post_forward_hook(module, input, output): def main(): parser = argparse.ArgumentParser() - parser.add_argument("--peft-model-id", type=str, default="./finetuned-llama") + parser.add_argument("--peft-model-id", type=str, required=True) parser.add_argument( "--use-full-precision", action="store_true", help="Use full precision" ) - parser.add_argument("--max-new-tokens", type=int, default=50) + parser.add_argument("--max-length", type=int, default=50) + parser.add_argument("--prompt-file", type=str, required=True) parser.add_argument("--do-sample", action="store_true", help="Use sampling") parser.add_argument( "--save-peft-tensors", @@ -52,24 +53,28 @@ def main(): help="Save PEFT hidden states and weights to file", ) args = parser.parse_args() - peft_model_id = args.peft_model_id - use_full_precision = args.use_full_precision - max_new_tokens = args.max_new_tokens - save_peft_tensors = args.save_peft_tensors - # Change working dir to folder storing this script - abspath = os.path.abspath(__file__) - dname = os.path.dirname(abspath) - os.chdir(dname) + # Check if prompt-file exists + if not os.path.isfile(args.prompt_file): + print(f"Error: {args.prompt_file} does not exist.") + return - config = PeftConfig.from_pretrained(peft_model_id) + # Get peft model config + config = PeftConfig.from_pretrained(args.peft_model_id) + + # Load the base model model = AutoModelForCausalLM.from_pretrained( config.base_model_name_or_path, return_dict=True, # load_in_8bit=True, - torch_dtype=torch.float32 if use_full_precision else torch.float16, + torch_dtype=torch.float32 if args.use_full_precision else torch.float16, device_map="auto", ) + # Load the Lora model + model = PeftModel.from_pretrained(model, args.peft_model_id) + print(model) + + # Get tokenizer hf_config = AutoConfig.from_pretrained( config.base_model_name_or_path, trust_remote_code=True ) @@ -78,25 +83,26 @@ def main(): tokenizer = LlamaTokenizer.from_pretrained( config.base_model_name_or_path, use_fast=True, - torch_dtype=torch.float32 if use_full_precision else torch.float16, + torch_dtype=torch.float32 if args.use_full_precision else torch.float16, ) else: tokenizer = AutoTokenizer.from_pretrained( config.base_model_name_or_path, - torch_dtype=torch.float32 if use_full_precision else torch.float16, + torch_dtype=torch.float32 if args.use_full_precision else torch.float16, ) + # Generation config generation_config = GenerationConfig.from_pretrained(config.base_model_name_or_path) generation_config.do_sample = args.do_sample - # Load the Lora model - model = PeftModel.from_pretrained(model, peft_model_id) - - print(model) # Register hooks to save tensors, if needed - if save_peft_tensors: + if args.save_peft_tensors: + # Change working dir to folder storing this script + abspath = os.path.abspath(__file__) + dname = os.path.dirname(abspath) + os.chdir(dname) + # Create output dir 
shutil.rmtree("./hf_peft_tensors") - # Check that the output folder exists os.makedirs("./hf_peft_tensors", exist_ok=True) # Save weights for name, params in model.named_parameters(): @@ -112,12 +118,22 @@ def main(): layer.register_forward_pre_hook(peft_pre_forward_hook) layer.register_forward_hook(peft_post_forward_hook) - batch = tokenizer("Two things are infinite: ", return_tensors="pt") - with torch.cuda.amp.autocast(): - output_tokens = model.generate( - **batch, max_new_tokens=max_new_tokens, generation_config=generation_config - ) - print("\n\n", tokenizer.decode(output_tokens[0], skip_special_tokens=False)) + # Run inference + # Read prompt-file into a list of strings + with open(args.prompt_file, "r") as f: + try: + prompt_list = json.load(f) + except json.JSONDecodeError: + print(f"Error: Unable to parse {args.prompt_file} as JSON.") + sys.exit(1) + + for i, prompt in enumerate(prompt_list): + batch = tokenizer(prompt, return_tensors="pt", add_special_tokens=True) + with torch.cuda.amp.autocast(): + output_tokens = model.generate( + **batch, max_new_tokens=args.max_length, generation_config=generation_config + ) + print("\n\n", tokenizer.decode(output_tokens[0], skip_special_tokens=False)) if __name__ == "__main__": From e82a75f3c46d51d848a2f6314c1daf99bac70b27 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Tue, 20 Feb 2024 16:56:28 +0000 Subject: [PATCH 02/32] less model renaming --- inference/models/falcon.cc | 34 ++----- inference/models/llama.cc | 104 ++++++++++------------ inference/models/mpt.cc | 35 +++----- inference/models/opt.cc | 38 +++----- inference/models/starcoder.cc | 26 ++---- python/flexflow/serve/__init__.py | 4 +- python/flexflow/serve/models/falcon.py | 32 ++++--- python/flexflow/serve/models/llama.py | 40 +++------ python/flexflow/serve/models/mpt.py | 34 +++---- python/flexflow/serve/models/opt.py | 36 ++++---- python/flexflow/serve/models/starcoder.py | 44 ++++----- python/flexflow/serve/serve.py | 34 ++++--- 12 files changed, 186 insertions(+), 275 deletions(-) diff --git a/inference/models/falcon.cc b/inference/models/falcon.cc index f86130ff2b..195d6ba7e3 100644 --- a/inference/models/falcon.cc +++ b/inference/models/falcon.cc @@ -76,7 +76,7 @@ void FALCON::create_falcon_model(FFModel &ff, falcon_config.layer_norm_epsilon, true, DT_NONE, - std::string("layers_" + std::to_string(i) + "_input_layernorm") + std::string("layers." + std::to_string(i) + ".input_layernorm") .c_str()); } else { ff.residual_layer_norm( @@ -91,7 +91,7 @@ void FALCON::create_falcon_model(FFModel &ff, true, false, DT_NONE, - std::string("layers_" + std::to_string(i) + "_input_layernorm") + std::string("layers." + std::to_string(i) + ".input_layernorm") .c_str()); token = res_ln_outputs[0]; att_norm = res_ln_outputs[1]; @@ -117,7 +117,7 @@ void FALCON::create_falcon_model(FFModel &ff, 1.0f, /*scaling factor*/ true, /*qk_prod_scaling*/ false, /*position_bias*/ - std::string("layers_" + std::to_string(i) + "_attention") + std::string("layers." + std::to_string(i) + ".self_attention") .c_str() /*name*/ ); break; @@ -142,7 +142,7 @@ void FALCON::create_falcon_model(FFModel &ff, 1.0f, /*scaling factor*/ true, /*qk_prod_scaling*/ false, /*position_bias*/ - std::string("layers_" + std::to_string(i) + "_attention") + std::string("layers." 
+ std::to_string(i) + ".self_attention") .c_str() /*name*/ ); break; @@ -167,7 +167,7 @@ void FALCON::create_falcon_model(FFModel &ff, 1.0f, /*scaling factor*/ true, /*qk_prod_scaling*/ false, /*position_bias*/ - std::string("layers_" + std::to_string(i) + "_attention") + std::string("layers." + std::to_string(i) + ".self_attention") .c_str() /*name*/ ); break; @@ -188,7 +188,7 @@ void FALCON::create_falcon_model(FFModel &ff, nullptr, REG_MODE_NONE, 0.0f, - std::string("layers_" + std::to_string(i) + "_mlp_dense_h_to_4h") + std::string("layers." + std::to_string(i) + ".mlp.dense_h_to_4h") .c_str()); dense_h_to_4h = ff.gelu(dense_h_to_4h); @@ -204,7 +204,7 @@ void FALCON::create_falcon_model(FFModel &ff, nullptr, REG_MODE_NONE, 0.0f, - std::string("layers_" + std::to_string(i) + "_mlp_dense_4h_to_h") + std::string("layers." + std::to_string(i) + ".mlp.dense_4h_to_h") .c_str()); } // final normalization and linear @@ -254,26 +254,6 @@ void FALCON::create_falcon_model(FFModel &ff, InferenceManager *im = InferenceManager::get_inference_manager(); im->register_model_weights_loader(&ff, fileloader); - -#ifdef DEADCODE - // Compile the model - std::cout << "------start compile ----------" << std::endl; - InferenceManager *im = InferenceManager::get_inference_manager(); - im->compile_model_and_allocate_buffer(&ff); - FileDataLoader fileloader("", - weight_file_path, - falcon_config.n_head, - falcon_config.n_head_kv, - falcon_config.hidden_size, - falcon_config.hidden_size / falcon_config.n_head, - ff.config.tensor_parallelism_degree); - std::cout << "------load weights ----------" << std::endl; - fileloader.load_weights(&ff, use_full_precision); - std::cout << "------load weight finished----------" << std::endl; - - // init operators - im->init_operators_inference(&ff); -#endif } }; // namespace FlexFlow diff --git a/inference/models/llama.cc b/inference/models/llama.cc index 0db7796567..a7a1758cc3 100644 --- a/inference/models/llama.cc +++ b/inference/models/llama.cc @@ -58,7 +58,7 @@ void LLAMA::create_llama_model(FFModel &ff, use_full_precision ? DT_FLOAT : DT_HALF, NULL, embed_init, - "tok_embeddings"); + "embed_tokens"); Tensor w2 = nullptr; @@ -75,7 +75,7 @@ void LLAMA::create_llama_model(FFModel &ff, llama_config.rms_norm_eps, llama_config.hidden_size, DT_NONE, - std::string("layers_" + std::to_string(i) + "_attention_norm") + std::string("layers." + std::to_string(i) + ".input_layernorm") .c_str()); } else { ff.residual_rms_norm( @@ -86,7 +86,7 @@ void LLAMA::create_llama_model(FFModel &ff, llama_config.hidden_size, false, // inplace_residual DT_NONE, - std::string("layers_" + std::to_string(i) + "_attention_norm") + std::string("layers." + std::to_string(i) + ".input_layernorm") .c_str()); token = token_att_norm[0]; att_norm = token_att_norm[1]; @@ -112,7 +112,7 @@ void LLAMA::create_llama_model(FFModel &ff, 1.0f, /*scaling factor*/ true, /*qk_prod_scaling*/ false, /*position_bias*/ - std::string("layers_" + std::to_string(i) + "_attention") + std::string("layers." + std::to_string(i) + ".self_attn") .c_str() /*name*/ ); break; @@ -135,7 +135,7 @@ void LLAMA::create_llama_model(FFModel &ff, 1.0f, /*scaling factor*/ true, /*qk_prod_scaling*/ false, /*position_bias*/ - std::string("layers_" + std::to_string(i) + "_attention") + std::string("layers." 
+ std::to_string(i) + ".self_attn") .c_str() /*name*/ ); break; @@ -158,7 +158,7 @@ void LLAMA::create_llama_model(FFModel &ff, 1.0f, /*scaling factor*/ true, /*qk_prod_scaling*/ false, /*position_bias*/ - std::string("layers_" + std::to_string(i) + "_attention") + std::string("layers." + std::to_string(i) + ".self_attn") .c_str() /*name*/ ); break; @@ -178,59 +178,57 @@ void LLAMA::create_llama_model(FFModel &ff, llama_config.hidden_size, false, // inplace_residual DT_NONE, - std::string("layers_" + std::to_string(i) + "_ffn_norm").c_str()); + std::string("layers." + std::to_string(i) + ".post_attention_layernorm") + .c_str()); token = token_ff_norm[0]; Tensor ff_norm = token_ff_norm[1]; - Tensor w1 = - ff.dense(ff_norm, - llama_config.intermediate_size, - AC_MODE_NONE, - false, - DT_NONE, - nullptr, - nullptr, - nullptr, - REG_MODE_NONE, - 0.0f, - std::string("layers_" + std::to_string(i) + "_feed_forward_w1") - .c_str()); + Tensor w1 = ff.dense( + ff_norm, + llama_config.intermediate_size, + AC_MODE_NONE, + false, + DT_NONE, + nullptr, + nullptr, + nullptr, + REG_MODE_NONE, + 0.0f, + std::string("layers." + std::to_string(i) + ".mlp.gate_proj").c_str()); - Tensor w3 = - ff.dense(ff_norm, - llama_config.intermediate_size, - AC_MODE_NONE, - false, - DT_NONE, - nullptr, - nullptr, - nullptr, - REG_MODE_NONE, - 0.0f, - std::string("layers_" + std::to_string(i) + "_feed_forward_w3") - .c_str()); + Tensor w3 = ff.dense( + ff_norm, + llama_config.intermediate_size, + AC_MODE_NONE, + false, + DT_NONE, + nullptr, + nullptr, + nullptr, + REG_MODE_NONE, + 0.0f, + std::string("layers." + std::to_string(i) + ".mlp.up_proj").c_str()); Tensor multi = ff.sigmoid_silu_multi(w1, w3); - w2 = - ff.dense(multi, - llama_config.hidden_size, - AC_MODE_NONE, - false, - DT_NONE, - nullptr, - nullptr, - nullptr, - REG_MODE_NONE, - 0.0f, - std::string("layers_" + std::to_string(i) + "_feed_forward_w2") - .c_str()); + w2 = ff.dense( + multi, + llama_config.hidden_size, + AC_MODE_NONE, + false, + DT_NONE, + nullptr, + nullptr, + nullptr, + REG_MODE_NONE, + 0.0f, + std::string("layers." + std::to_string(i) + ".mlp.down_proj").c_str()); // Low-Rank Adapter (LoRA) for the second linear layer ff.lora_linear( multi, w2, OP_LORA_MLP_SECOND, - std::string("layers_" + std::to_string(i) + "_feed_forward_w2_lora") + std::string("layers." + std::to_string(i) + ".mlp.down_proj.lora") .c_str()); } // final normalization and linear @@ -254,7 +252,7 @@ void LLAMA::create_llama_model(FFModel &ff, nullptr, REG_MODE_NONE, 0.0f, - "output"); + "lm_head"); Tensor output; if (mode == BEAM_SEARCH_MODE) { @@ -288,16 +286,6 @@ void LLAMA::create_llama_model(FFModel &ff, InferenceManager *im = InferenceManager::get_inference_manager(); im->register_model_weights_loader(&ff, fileloader); -#ifdef DEADCODE - // Compile the model - std::cout << "------start compile ----------" << std::endl; - im->compile_model_and_allocate_buffer(&ff); - fileloader.load_weights(&ff); - std::cout << "------load weight finished----------" << std::endl; - - // init operators - im->init_operators_inference(&ff); -#endif } }; // namespace FlexFlow diff --git a/inference/models/mpt.cc b/inference/models/mpt.cc index 95179691a1..e4a7e0056d 100644 --- a/inference/models/mpt.cc +++ b/inference/models/mpt.cc @@ -58,7 +58,7 @@ void MPT::create_mpt_model(FFModel &ff, use_full_precision ? 
DT_FLOAT : DT_HALF, NULL, embed_init, - "transformer_wte"); + "wte"); Tensor intermediate_output = nullptr, layernorm_output = nullptr; Tensor res_ln_outputs[2] = {nullptr, nullptr}; @@ -74,7 +74,7 @@ void MPT::create_mpt_model(FFModel &ff, 1e-05, false, DT_NONE, - std::string("layers_" + std::to_string(i) + "_norm_1").c_str()); + std::string("layers." + std::to_string(i) + ".norm_1").c_str()); } else { ff.residual_layer_norm( intermediate_output, @@ -88,7 +88,7 @@ void MPT::create_mpt_model(FFModel &ff, false, false, DT_NONE, - std::string("layers_" + std::to_string(i) + "_norm_1").c_str()); + std::string("layers." + std::to_string(i) + ".norm_1").c_str()); hidden_states = res_ln_outputs[0]; layernorm_output = res_ln_outputs[1]; } @@ -114,7 +114,7 @@ void MPT::create_mpt_model(FFModel &ff, pow((mpt_config.hidden_size / mpt_config.n_heads), -0.5), /*qk_prod_scaling*/ false, /*position_bias*/ true, - std::string("layers_" + std::to_string(i) + "_attention") + std::string("layers." + std::to_string(i) + ".attn") .c_str() /*name*/ ); break; @@ -138,7 +138,7 @@ void MPT::create_mpt_model(FFModel &ff, pow((mpt_config.hidden_size / mpt_config.n_heads), -0.5), /*qk_prod_scaling*/ false, /*position_bias*/ true, - std::string("layers_" + std::to_string(i) + "_attention") + std::string("layers." + std::to_string(i) + ".attn") .c_str() /*name*/ ); break; @@ -162,7 +162,7 @@ void MPT::create_mpt_model(FFModel &ff, pow((mpt_config.hidden_size / mpt_config.n_heads), -0.5), /*qk_prod_scaling*/ false, /*position_bias*/ true, - std::string("layers_" + std::to_string(i) + "_attention") + std::string("layers." + std::to_string(i) + ".attn") .c_str() /*name*/ ); break; @@ -184,7 +184,7 @@ void MPT::create_mpt_model(FFModel &ff, false, false, DT_NONE, - std::string("layers_" + std::to_string(i) + "_norm_2").c_str()); + std::string("layers." + std::to_string(i) + ".norm_2").c_str()); hidden_states = res_ln_outputs[0]; layernorm_output = res_ln_outputs[1]; @@ -200,7 +200,7 @@ void MPT::create_mpt_model(FFModel &ff, nullptr, REG_MODE_NONE, 0.0f, - std::string("layers_" + std::to_string(i) + "_ffn_up_proj").c_str()); + std::string("layers." + std::to_string(i) + ".ffn.up_proj").c_str()); layernorm_output = ff.gelu(layernorm_output); intermediate_output = ff.dense( layernorm_output, @@ -213,7 +213,7 @@ void MPT::create_mpt_model(FFModel &ff, nullptr, REG_MODE_NONE, 0.0f, - std::string("layers_" + std::to_string(i) + "_ffn_down_proj").c_str()); + std::string("layers." 
+ std::to_string(i) + ".ffn.down_proj").c_str()); } // final @@ -228,7 +228,7 @@ void MPT::create_mpt_model(FFModel &ff, false, false, DT_NONE, - "transformer_norm_f"); + "norm_f"); Tensor all_final_norm = res_ln_outputs[1]; Tensor lm_head = ff.dense(all_final_norm, @@ -262,21 +262,6 @@ void MPT::create_mpt_model(FFModel &ff, InferenceManager *im = InferenceManager::get_inference_manager(); im->register_model_weights_loader(&ff, fileloader); - -#ifdef DEADCODE - //------------------- compile the model -------------------------------- - InferenceManager *im = InferenceManager::get_inference_manager(); - im->compile_model_and_allocate_buffer(&ff); - FileDataLoader fileloader("", - weight_file_path, - mpt_config.n_heads, - mpt_config.n_heads, - mpt_config.hidden_size, - mpt_config.hidden_size / mpt_config.n_heads, - ff.config.tensor_parallelism_degree); - fileloader.load_weights(&ff, use_full_precision); - im->init_operators_inference(&ff); -#endif } }; // namespace FlexFlow diff --git a/inference/models/opt.cc b/inference/models/opt.cc index 7d2abad829..6d04ba47f2 100644 --- a/inference/models/opt.cc +++ b/inference/models/opt.cc @@ -96,7 +96,7 @@ void OPT::create_opt_model(FFModel &ff, true, false, DT_NONE, - std::string("layers_" + std::to_string(i) + "_attention_layer_norm") + std::string("layers." + std::to_string(i) + ".self_attn_layer_norm") .c_str()); Tensor residual = res_ln_outputs[0]; Tensor hidden_states = res_ln_outputs[1]; @@ -122,7 +122,7 @@ void OPT::create_opt_model(FFModel &ff, -0.5), /*scaling factor*/ false, /*qk_prod_scaling*/ false, /*position_bias*/ - std::string("layers_" + std::to_string(i) + "_attention") + std::string("layers." + std::to_string(i) + ".self_attn") .c_str() /*name*/ ); break; @@ -146,7 +146,7 @@ void OPT::create_opt_model(FFModel &ff, -0.5), /*scaling factor*/ false, /*qk_prod_scaling*/ false, /*position_bias*/ - std::string("layers_" + std::to_string(i) + "_attention") + std::string("layers." + std::to_string(i) + ".self_attn") .c_str() /*name*/ ); break; @@ -170,7 +170,7 @@ void OPT::create_opt_model(FFModel &ff, -0.5), /*scaling factor*/ false, /*qk_prod_scaling*/ false, /*position_bias*/ - std::string("layers_" + std::to_string(i) + "_attention") + std::string("layers." + std::to_string(i) + ".self_attn") .c_str() /*name*/ ); break; @@ -189,8 +189,8 @@ void OPT::create_opt_model(FFModel &ff, true, false, DT_NONE, - std::string("layers_" + std::to_string(i) + - "_add_bias_residual_layer_norm") + std::string("layers." + std::to_string(i) + + ".add_bias_residual_layer_norm") .c_str()); added = res_ln_outputs[0]; Tensor final_norm = res_ln_outputs[1]; @@ -207,7 +207,7 @@ void OPT::create_opt_model(FFModel &ff, nullptr, REG_MODE_NONE, 0.0f, - std::string("layers_" + std::to_string(i) + "_fc1").c_str()); + std::string("layers." + std::to_string(i) + ".fc1").c_str()); fc2 = ff.dense(fc1, opt_config.hidden_size, AC_MODE_NONE, @@ -218,13 +218,13 @@ void OPT::create_opt_model(FFModel &ff, nullptr, REG_MODE_NONE, 0.0f, - std::string("layers_" + std::to_string(i) + "_fc2").c_str()); + std::string("layers." + std::to_string(i) + ".fc2").c_str()); // Low-Rank Adapter (LoRA) for the second linear layer ff.lora_linear( fc1, fc2, OP_LORA_MLP_SECOND, - std::string("layers_" + std::to_string(i) + "_fc2_lora").c_str()); + std::string("layers." 
+ std::to_string(i) + ".fc2.lora").c_str()); } // final @@ -252,7 +252,7 @@ void OPT::create_opt_model(FFModel &ff, nullptr, REG_MODE_NONE, 0.0f, - "embed_tokens_weight_lm_head"); + "lm_head"); Tensor output; if (mode == BEAM_SEARCH_MODE) { @@ -276,24 +276,6 @@ void OPT::create_opt_model(FFModel &ff, use_full_precision); InferenceManager *im = InferenceManager::get_inference_manager(); im->register_model_weights_loader(&ff, fileloader); - -#ifdef DEADCODE - //------------------- compile the model -------------------------------- - std::cout << "------start compile ----------" << std::endl; - InferenceManager *im = InferenceManager::get_inference_manager(); - im->compile_model_and_allocate_buffer(&ff); - FileDataLoader fileloader("", - weight_file_path, - opt_config.num_attention_heads, - opt_config.num_attention_heads, - opt_config.hidden_size, - opt_config.hidden_size / - opt_config.num_attention_heads, - ff.config.tensor_parallelism_degree); - fileloader.load_weights(&ff, use_full_precision); - std::cout << "------finished loading weights----------" << std::endl; - im->init_operators_inference(&ff); -#endif } }; // namespace FlexFlow diff --git a/inference/models/starcoder.cc b/inference/models/starcoder.cc index fb6269ad75..cd8bf3a9a7 100644 --- a/inference/models/starcoder.cc +++ b/inference/models/starcoder.cc @@ -66,7 +66,7 @@ void STARCODER::create_starcoder_model( use_full_precision ? DT_FLOAT : DT_HALF, NULL, embed_init, - "transformer_wte"); + "wte"); Tensor positional_embedding = ff.embedding(position_input, @@ -76,7 +76,7 @@ void STARCODER::create_starcoder_model( use_full_precision ? DT_FLOAT : DT_HALF, NULL, embed_init, - "transformer_wpe"); + "wpe"); Tensor residual = nullptr, c_proj = nullptr; Tensor res_ln_outputs[2] = {nullptr, nullptr}; @@ -98,7 +98,7 @@ void STARCODER::create_starcoder_model( true, false, DT_NONE, - std::string("layers_" + std::to_string(i) + "_ln_1").c_str()); + std::string("layers." + std::to_string(i) + ".ln_1").c_str()); Tensor hidden_states = res_ln_outputs[0]; Tensor ln_1 = res_ln_outputs[1]; @@ -125,7 +125,7 @@ void STARCODER::create_starcoder_model( 1.0f, /*scaling factor*/ true, /*qk_prod_scaling*/ false, /*position_bias*/ - std::string("layers_" + std::to_string(i) + "_attention") + std::string("layers." + std::to_string(i) + ".attn.c_attn") .c_str() /*name*/ ); break; @@ -147,7 +147,7 @@ void STARCODER::create_starcoder_model( true, false, DT_NONE, - std::string("layers_" + std::to_string(i) + "_ln_2").c_str()); + std::string("layers." + std::to_string(i) + ".ln_2").c_str()); residual = res_ln_outputs[0]; Tensor l2_norm = res_ln_outputs[1]; @@ -163,7 +163,7 @@ void STARCODER::create_starcoder_model( nullptr, REG_MODE_NONE, 0.0f, - std::string("layers_" + std::to_string(i) + "_mlp_c_fc").c_str()); + std::string("layers." + std::to_string(i) + ".mlp.c_fc").c_str()); c_fc = ff.gelu(c_fc); @@ -178,7 +178,7 @@ void STARCODER::create_starcoder_model( nullptr, REG_MODE_NONE, 0.0f, - std::string("layers_" + std::to_string(i) + "_mlp_c_proj").c_str()); + std::string("layers." 
+ std::to_string(i) + ".mlp.c_proj").c_str()); } // final normalization and linear ff.residual_layer_norm(residual, @@ -192,7 +192,7 @@ void STARCODER::create_starcoder_model( true, false, DT_NONE, - "transformer_ln_f"); + "ln_f"); Tensor ln_f = res_ln_outputs[1]; Tensor lm_head = ff.dense(ln_f, @@ -235,16 +235,6 @@ void STARCODER::create_starcoder_model( ff.config.tensor_parallelism_degree, use_full_precision); im->register_model_weights_loader(&ff, fileloader); -#ifdef DEADCODE - // Compile the model - std::cout << "------start compile ----------" << std::endl; - im->compile_model_and_allocate_buffer(&ff); - fileloader.load_weights(&ff, use_full_precision); - std::cout << "------load weight finished----------" << std::endl; - - // init operators - im->init_operators_inference(&ff); -#endif } }; // namespace FlexFlow diff --git a/python/flexflow/serve/__init__.py b/python/flexflow/serve/__init__.py index 5805670ae0..da7dba5bcc 100644 --- a/python/flexflow/serve/__init__.py +++ b/python/flexflow/serve/__init__.py @@ -214,7 +214,7 @@ def init( if configs_dict.get("offload", None) is None: configs_dict["offload"] = False if configs_dict.get("offload_reserve_space_size", None) is None: - configs_dict["offload_reserve_space_size"] = 8*1024**3 + configs_dict["offload_reserve_space_size"] = 8 * 1024**3 if configs_dict.get("use_4bit_quantization", None) is None: configs_dict["use_4bit_quantization"] = False if configs_dict.get("use_8bit_quantization", None) is None: @@ -222,7 +222,7 @@ def init( if configs_dict.get("enable_peft", None) is None: configs_dict["enable_peft"] = False if configs_dict.get("peft_activation_reserve_space_size", None) is None: - configs_dict["peft_activation_reserve_space_size"] = 8*1024**3 + configs_dict["peft_activation_reserve_space_size"] = 8 * 1024**3 if configs_dict.get("peft_weight_reserve_space_size", None) is None: configs_dict["peft_weight_reserve_space_size"] = 1024**3 if configs_dict.get("profiling", None) is None: diff --git a/python/flexflow/serve/models/falcon.py b/python/flexflow/serve/models/falcon.py index db2f403e10..e4d7786262 100644 --- a/python/flexflow/serve/models/falcon.py +++ b/python/flexflow/serve/models/falcon.py @@ -118,7 +118,7 @@ def build_model(self, max_tokens_per_batch): axes, True, self.falcon_config.layer_norm_epsilon, - name=f"layers_{i}_input_layernorm", + name=f"layers.{i}.input_layernorm", ) else: token, att_norm = ffmodel.residual_layer_norm( @@ -129,7 +129,7 @@ def build_model(self, max_tokens_per_batch): axes, True, self.falcon_config.layer_norm_epsilon, - name=f"layers_{i}_input_layernorm", + name=f"layers.{i}.input_layernorm", ) if self.mode == InferenceMode.BEAM_SEARCH_MODE: @@ -147,7 +147,7 @@ def build_model(self, max_tokens_per_batch): DataType.DT_NONE, # data_type None, # kernel initializer True, # apply_rotary_embedding - name=f"layers_{i}_attention", + name=f"layers.{i}.self_attention", ) elif self.mode == InferenceMode.TREE_VERIFY_MODE: mha = ffmodel.inc_multiquery_self_attention_verify( @@ -164,7 +164,7 @@ def build_model(self, max_tokens_per_batch): DataType.DT_NONE, # data_type None, # kernel initializer True, # apply_rotary_embedding - name=f"layers_{i}_attention", + name=f"layers.{i}.self_attention", ) elif self.mode == InferenceMode.INC_DECODING_MODE: mha = ffmodel.inc_multiquery_self_attention( @@ -181,7 +181,7 @@ def build_model(self, max_tokens_per_batch): DataType.DT_NONE, # data_type None, # kernel initializer True, # apply_rotary_embedding - name=f"layers_{i}_attention", + name=f"layers.{i}.self_attention", 
) else: assert False @@ -191,7 +191,7 @@ def build_model(self, max_tokens_per_batch): self.falcon_config.hidden_size * 4, ActiMode.AC_MODE_NONE, False, - name=f"layers_{i}_mlp_dense_h_to_4h", + name=f"layers.{i}.mlp.dense_h_to_4h", ) dense_h_to_4h = ffmodel.gelu(dense_h_to_4h) mlp_output = ffmodel.dense( @@ -199,7 +199,7 @@ def build_model(self, max_tokens_per_batch): self.falcon_config.hidden_size, ActiMode.AC_MODE_NONE, False, - name=f"layers_{i}_mlp_dense_4h_to_h", + name=f"layers.{i}.mlp.dense_4h_to_h", ) _, ln_f = ffmodel.residual_layer_norm( @@ -239,11 +239,9 @@ def build_model(self, max_tokens_per_batch): # TODO: finish this def convert_hf_weight_name(name): - return ( - name.replace(".", "_") - .replace("transformer_h_", "layers_") - .replace("transformer_", "") - .replace("self_attention_dense", "attention_wo") + return (name.replace("transformer.h.", "layers.") + .replace("transformer.", "") + .replace("self_attention.dense", "self_attention.o_proj") ) def convert_hf_model(model, dst_folder): @@ -256,10 +254,10 @@ def convert_hf_model(model, dst_folder): for name, params in model.named_parameters(): name = FlexFlowFalcon.convert_hf_weight_name(name) # Split Q,K,V attention weights - if "self_attention_query_key_value" in name: - name_q = name.replace("self_attention_query_key_value", "attention_wq") - name_k = name.replace("self_attention_query_key_value", "attention_wk") - name_v = name.replace("self_attention_query_key_value", "attention_wv") + if "self_attention.query_key_value" in name: + name_q = name.replace("self_attention.query_key_value", "self_attention.q_proj") + name_k = name.replace("self_attention.query_key_value", "self_attention.k_proj") + name_v = name.replace("self_attention.query_key_value", "self_attention.v_proj") q, k, v = torch.split( params, [ @@ -276,5 +274,5 @@ def convert_hf_model(model, dst_folder): params.detach().cpu().numpy().tofile(os.path.join(dst_folder, name)) # LM head weight model.lm_head.weight.detach().cpu().numpy().tofile( - os.path.join(dst_folder, "lm_head_weight") + os.path.join(dst_folder, "lm_head.weight") ) diff --git a/python/flexflow/serve/models/llama.py b/python/flexflow/serve/models/llama.py index cd9cf29ebf..6aef540342 100644 --- a/python/flexflow/serve/models/llama.py +++ b/python/flexflow/serve/models/llama.py @@ -101,7 +101,7 @@ def build_model(self, max_tokens_per_batch): self.data_type, None, embed_init, - name="tok_embeddings", + name="embed_tokens", ) for i in range(self.llama_config.num_hidden_layers): @@ -112,7 +112,7 @@ def build_model(self, max_tokens_per_batch): token, self.llama_config.rms_norm_eps, self.llama_config.hidden_size, - name=f"layers_{i}_attention_norm", + name=f"layers.{i}.input_layernorm", ) else: token, attn_norm = ffmodel.residual_rms_norm( @@ -120,7 +120,7 @@ def build_model(self, max_tokens_per_batch): w2, self.llama_config.rms_norm_eps, self.llama_config.hidden_size, - name=f"layers_{i}_attention_norm", + name=f"layers.{i}.input_layernorm", ) if self.mode == InferenceMode.BEAM_SEARCH_MODE: @@ -140,7 +140,7 @@ def build_model(self, max_tokens_per_batch): DataType.DT_NONE, # data_type None, # kernel initializer True, # apply_rotary_embedding - name=f"layers_{i}_attention", + name=f"layers.{i}.self_attn", ) elif self.mode == InferenceMode.TREE_VERIFY_MODE: mha = ffmodel.inc_multiquery_self_attention_verify( @@ -159,7 +159,7 @@ def build_model(self, max_tokens_per_batch): DataType.DT_NONE, # data_type None, # kernel initializer True, # apply_rotary_embedding - name=f"layers_{i}_attention", + 
name=f"layers.{i}.self_attn", ) elif self.mode == InferenceMode.INC_DECODING_MODE: mha = ffmodel.inc_multiquery_self_attention( @@ -178,7 +178,7 @@ def build_model(self, max_tokens_per_batch): DataType.DT_NONE, # data_type None, # kernel initializer True, # apply_rotary_embedding - name=f"layers_{i}_attention", + name=f"layers.{i}.self_attn", ) else: assert False @@ -188,21 +188,21 @@ def build_model(self, max_tokens_per_batch): mha, self.llama_config.rms_norm_eps, self.llama_config.hidden_size, - name=f"layers_{i}_ffn_norm", + name=f"layers.{i}.post_attention_layernorm", ) w1 = ffmodel.dense( ff_norm, self.llama_config.intermediate_size, ActiMode.AC_MODE_NONE, False, - name=f"layers_{i}_feed_forward_w1", + name=f"layers.{i}.mlp.gate_proj", ) w3 = ffmodel.dense( ff_norm, self.llama_config.intermediate_size, ActiMode.AC_MODE_NONE, False, - name=f"layers_{i}_feed_forward_w3", + name=f"layers.{i}.mlp.up_proj", ) multi = ffmodel.sigmoid_silu_multi(w1, w3) w2 = ffmodel.dense( @@ -210,7 +210,7 @@ def build_model(self, max_tokens_per_batch): self.llama_config.hidden_size, ActiMode.AC_MODE_NONE, False, - name=f"layers_{i}_feed_forward_w2", + name=f"layers.{i}.mlp.down_proj", ) _, token = ffmodel.residual_rms_norm( @@ -225,7 +225,7 @@ def build_model(self, max_tokens_per_batch): self.llama_config.vocab_size, ActiMode.AC_MODE_NONE, False, - name="output", + name="lm_head", ) if self.mode == InferenceMode.BEAM_SEARCH_MODE: @@ -246,23 +246,7 @@ def build_model(self, max_tokens_per_batch): self.ffmodel = ffmodel def convert_hf_weight_name(name): - return ( - name.replace(".", "_") - .replace("self_attn", "attention") - .replace("q_proj", "wq") - .replace("k_proj", "wk") - .replace("v_proj", "wv") - .replace("o_proj", "wo") - .replace("mlp", "feed_forward") - .replace("gate_proj", "w1") - .replace("down_proj", "w2") - .replace("up_proj", "w3") - .replace("input_layernorm", "attention_norm") - .replace("post_attention_layernorm", "ffn_norm") - .replace("embed_tokens", "tok_embeddings") - .replace("lm_head", "output") - .replace("model_", "") - ) + return name.replace("model.", "") def convert_hf_model(model, dst_folder): os.makedirs(dst_folder, exist_ok=True) diff --git a/python/flexflow/serve/models/mpt.py b/python/flexflow/serve/models/mpt.py index 9168932ce1..76f7d69c73 100644 --- a/python/flexflow/serve/models/mpt.py +++ b/python/flexflow/serve/models/mpt.py @@ -92,7 +92,7 @@ def build_model(self, max_tokens_per_batch): self.data_type, None, embed_init, - name="transformer_wte", + name="wte", ) axes = [ @@ -109,7 +109,7 @@ def build_model(self, max_tokens_per_batch): True, 1e-05, False, - name=f"layers_{i}_norm_1", + name=f"layers.{i}.norm_1", ) else: hidden_states, layernorm_output = ffmodel.residual_layer_norm( @@ -121,7 +121,7 @@ def build_model(self, max_tokens_per_batch): True, 1e-05, False, - name=f"layers_{i}_norm_1", + name=f"layers.{i}.norm_1", ) if self.mode == InferenceMode.BEAM_SEARCH_MODE: @@ -143,7 +143,7 @@ def build_model(self, max_tokens_per_batch): ** (-0.5), # scaling_factor False, # qk_prod_scaling True, # qk_prod_scaling - name=f"layers_{i}_attention", + name=f"layers.{i}.attn", ) elif self.mode == InferenceMode.TREE_VERIFY_MODE: attn_outputs = ffmodel.inc_multihead_self_attention_verify( @@ -164,7 +164,7 @@ def build_model(self, max_tokens_per_batch): ** (-0.5), # scaling_factor False, # qk_prod_scaling True, # qk_prod_scaling - name=f"layers_{i}_attention", + name=f"layers.{i}.attn", ) elif self.mode == InferenceMode.INC_DECODING_MODE: attn_outputs = 
ffmodel.inc_multihead_self_attention( @@ -185,7 +185,7 @@ def build_model(self, max_tokens_per_batch): ** (-0.5), # scaling_factor False, # qk_prod_scaling True, # qk_prod_scaling - name=f"layers_{i}_attention", + name=f"layers.{i}.attn", ) else: assert False @@ -199,7 +199,7 @@ def build_model(self, max_tokens_per_batch): True, 1e-05, False, - name=f"layers_{i}_norm_2", + name=f"layers.{i}.norm_2", ) # mlp layernorm_output = ffmodel.dense( @@ -207,7 +207,7 @@ def build_model(self, max_tokens_per_batch): 4 * self.mpt_config.hidden_size, ActiMode.AC_MODE_NONE, False, - name=f"layers_{i}_ffn_up_proj", + name=f"layers.{i}.ffn.up_proj", ) layernorm_output = ffmodel.gelu(layernorm_output) intermediate_output = ffmodel.dense( @@ -215,7 +215,7 @@ def build_model(self, max_tokens_per_batch): self.mpt_config.hidden_size, ActiMode.AC_MODE_NONE, False, - name=f"layers_{i}_ffn_down_proj", + name=f"layers.{i}.ffn.down_proj", ) _, all_final_norm = ffmodel.residual_layer_norm( @@ -227,7 +227,7 @@ def build_model(self, max_tokens_per_batch): True, 1e-05, False, - name=f"transformer_norm_f", + name=f"norm_f", ) lm_head = ffmodel.dense( all_final_norm, @@ -252,8 +252,8 @@ def build_model(self, max_tokens_per_batch): def convert_hf_weight_name(name): return ( name.replace("transformer.blocks.", "layers.") - .replace(".", "_") - .replace("attn_out_proj", "attention_wo") + .replace("transformer.", "") + .replace("attn.out_proj", "attn.o_proj") ) def convert_hf_model(model, dst_folder): @@ -261,9 +261,9 @@ def convert_hf_model(model, dst_folder): for name, params in model.named_parameters(): name = FlexFlowMPT.convert_hf_weight_name(name) if "Wqkv" in name: - name_q = name.replace("attn_Wqkv", "attention_wq") - name_k = name.replace("attn_Wqkv", "attention_wk") - name_v = name.replace("attn_Wqkv", "attention_wv") + name_q = name.replace("attn.Wqkv", "attn.q_proj") + name_k = name.replace("attn.Wqkv", "attn.k_proj") + name_v = name.replace("attn.Wqkv", "attn.v_proj") q, k, v = torch.split( params, [ @@ -280,6 +280,6 @@ def convert_hf_model(model, dst_folder): params.detach().cpu().numpy().tofile(os.path.join(dst_folder, name)) shutil.copy( - os.path.join(dst_folder, "transformer_wte_weight"), - os.path.join(dst_folder, "lm_head_weight"), + os.path.join(dst_folder, "wte.weight"), + os.path.join(dst_folder, "lm_head.weight"), ) diff --git a/python/flexflow/serve/models/opt.py b/python/flexflow/serve/models/opt.py index 9a03cf6e78..51c76c520b 100644 --- a/python/flexflow/serve/models/opt.py +++ b/python/flexflow/serve/models/opt.py @@ -133,7 +133,7 @@ def build_model(self, max_tokens_per_batch): axes, self.opt_config.layer_norm_elementwise_affine, 1e-05, - name=f"layers_{i}_attention_layer_norm", + name=f"layers.{i}.self_attn_layer_norm", ) else: hidden_states = ffmodel.add(token, positional_embedding) @@ -157,7 +157,7 @@ def build_model(self, max_tokens_per_batch): (self.opt_config.hidden_size / self.opt_config.num_attention_heads) ** (-0.5), # scaling_factor False, # qk_prod_scaling - name=f"layers_{i}_attention", + name=f"layers.{i}.self_attn", ) elif self.mode == InferenceMode.TREE_VERIFY_MODE: mha = ffmodel.inc_multihead_self_attention_verify( @@ -177,7 +177,7 @@ def build_model(self, max_tokens_per_batch): (self.opt_config.hidden_size / self.opt_config.num_attention_heads) ** (-0.5), # scaling_factor False, # qk_prod_scaling - name=f"layers_{i}_attention", + name=f"layers.{i}.self_attn", ) elif self.mode == InferenceMode.INC_DECODING_MODE: mha = ffmodel.inc_multihead_self_attention( @@ -197,7 +197,7 @@ def 
build_model(self, max_tokens_per_batch): (self.opt_config.hidden_size / self.opt_config.num_attention_heads) ** (-0.5), # scaling_factor False, # qk_prod_scaling - name=f"layers_{i}_attention", + name=f"layers.{i}.self_attn", ) else: assert False @@ -209,7 +209,7 @@ def build_model(self, max_tokens_per_batch): axes, self.opt_config.layer_norm_elementwise_affine, 1e-05, - name=f"layers_{i}_add_bias_residual_layer_norm", + name=f"layers.{i}.add_bias_residual_layer_norm", ) if not self.opt_config.do_layer_norm_before: @@ -220,14 +220,14 @@ def build_model(self, max_tokens_per_batch): self.opt_config.ffn_dim, ActiMode.AC_MODE_RELU, True, - name=f"layers_{i}_fc1", + name=f"layers.{i}.fc1", ) fc2 = ffmodel.dense( fc1, self.opt_config.hidden_size, ActiMode.AC_MODE_NONE, True, - name=f"layers_{i}_fc2", + name=f"layers.{i}.fc2", ) if not self.opt_config.do_layer_norm_before: @@ -239,7 +239,7 @@ def build_model(self, max_tokens_per_batch): axes, self.opt_config.layer_norm_elementwise_affine, 1e-05, - name=f"layers_{i}_final_layer_norm", + name=f"layers.{i}.final_layer_norm", ) _, all_final_norm = ffmodel.residual_layer_norm( @@ -257,7 +257,7 @@ def build_model(self, max_tokens_per_batch): self.opt_config.vocab_size, ActiMode.AC_MODE_NONE, False, - name="embed_tokens_weight_lm_head", + name="lm_head", ) if self.mode == InferenceMode.BEAM_SEARCH_MODE: @@ -279,17 +279,11 @@ def build_model(self, max_tokens_per_batch): def convert_hf_weight_name(name): return ( - name.replace(".", "_") - .replace("decoder_", "") - .replace("model_", "") - .replace("self_attn", "attention") - .replace("q_proj", "wq") - .replace("k_proj", "wk") - .replace("v_proj", "wv") - .replace("out_proj", "wo") - .replace("attention_wo_bias", "add_bias_residual_layer_norm_attn_bias") + name.replace("decoder.", "") + .replace("model.", "") + .replace("self_attn.wo.bias", "add_bias_residual_layer_norm.attn_bias") .replace( - "_final_layer_norm", "_add_bias_residual_layer_norm" + ".final_layer_norm", ".add_bias_residual_layer_norm" ) # important to use the leading "_" to avoid matching the last LayerNorm ) @@ -300,6 +294,6 @@ def convert_hf_model(model, dst_folder): params.detach().cpu().numpy().tofile(f"{dst_folder}/{name}") # copy embedding weights shutil.copy( - os.path.join(dst_folder, "embed_tokens_weight"), - os.path.join(dst_folder, "embed_tokens_weight_lm_head"), + os.path.join(dst_folder, "embed_tokens.weight"), + os.path.join(dst_folder, "lm_head.weight"), ) diff --git a/python/flexflow/serve/models/starcoder.py b/python/flexflow/serve/models/starcoder.py index cd6a7304e6..8ed8fcfa18 100644 --- a/python/flexflow/serve/models/starcoder.py +++ b/python/flexflow/serve/models/starcoder.py @@ -106,7 +106,7 @@ def build_model(self, max_tokens_per_batch): self.data_type, None, embed_init, - name="transformer_wte", + name="wte", ) positional_embedding = ffmodel.embedding( position_tensor, @@ -116,7 +116,7 @@ def build_model(self, max_tokens_per_batch): self.data_type, None, embed_init, - name="transformer_wpe", + name="wpe", ) axes = [ @@ -134,7 +134,7 @@ def build_model(self, max_tokens_per_batch): axes, True, self.starcoder_config.layer_norm_epsilon, - name=f"layers_{i}_ln_1", + name=f"layers.{i}.ln_1", ) assert self.mode == InferenceMode.INC_DECODING_MODE @@ -154,7 +154,7 @@ def build_model(self, max_tokens_per_batch): DataType.DT_NONE, # data_type None, # kernel initializer False, # apply_rotary_embedding - name=f"layers_{i}_attention", + name=f"layers.{i}.attn.c_attn", ) residual, l2_norm = ffmodel.residual_layer_norm( @@ -166,7 
+166,7 @@ def build_model(self, max_tokens_per_batch): axes, True, self.starcoder_config.layer_norm_epsilon, - name=f"layers_{i}_ln_2", + name=f"layers.{i}.ln_2", ) # mlp @@ -176,7 +176,7 @@ def build_model(self, max_tokens_per_batch): self.starcoder_config.intermediate_size, ActiMode.AC_MODE_NONE, True, - name=f"layers_{i}_mlp_c_fc", + name=f"layers.{i}.mlp.c_fc", ) activation = ffmodel.gelu(c_fc, False) c_proj = ffmodel.dense( @@ -184,7 +184,7 @@ def build_model(self, max_tokens_per_batch): self.starcoder_config.hidden_size, ActiMode.AC_MODE_NONE, True, - name=f"layers_{i}_mlp_c_proj", + name=f"layers.{i}.mlp.c_proj", ) _, ln_f = ffmodel.residual_layer_norm( @@ -195,7 +195,7 @@ def build_model(self, max_tokens_per_batch): axes, True, self.starcoder_config.layer_norm_epsilon, - name=f"transformer_ln_f", + name=f"ln_f", ) lm_head = ffmodel.dense( ln_f, @@ -219,11 +219,11 @@ def build_model(self, max_tokens_per_batch): def convert_hf_model(model, dst_folder): os.makedirs(dst_folder, exist_ok=True) for name, params in model.named_parameters(): - name = name.replace("transformer.h", "layers").replace(".", "_") - if "c_attn_weight" in name: - name_q = name.replace("attn_c_attn", "attention_wq") - name_k = name.replace("attn_c_attn", "attention_wk") - name_v = name.replace("attn_c_attn", "attention_wv") + name = name.replace("transformer.h", "layers").replace("transformer", "") + if "attn.c_attn.weight" in name: + name_q = name.replace("attn.c_attn", "attn.c_attn.q_proj") + name_k = name.replace("attn.c_attn", "attn.c_attn.k_proj") + name_v = name.replace("attn.c_attn", "attn.c_attn.v_proj") q, k, v = torch.split( params, [ @@ -236,10 +236,10 @@ def convert_hf_model(model, dst_folder): q.detach().cpu().numpy().tofile(os.path.join(dst_folder, name_q)) k.detach().cpu().numpy().tofile(os.path.join(dst_folder, name_k)) v.detach().cpu().numpy().tofile(os.path.join(dst_folder, name_v)) - elif "c_attn_bias" in name: - name_q = name.replace("attn_c_attn", "attention_wq") - name_k = name.replace("attn_c_attn", "attention_wk") - name_v = name.replace("attn_c_attn", "attention_wv") + elif "attn.c_attn.bias" in name: + name_q = name.replace("attn.c_attn", "attn.c_attn.q_proj") + name_k = name.replace("attn.c_attn", "attn.c_attn.k_proj") + name_v = name.replace("attn.c_attn", "attn.c_attn.v_proj") q, k, v = torch.split( params, [ @@ -252,14 +252,14 @@ def convert_hf_model(model, dst_folder): q.detach().cpu().numpy().tofile(os.path.join(dst_folder, name_q)) k.detach().cpu().numpy().tofile(os.path.join(dst_folder, name_k)) v.detach().cpu().numpy().tofile(os.path.join(dst_folder, name_v)) - elif "c_proj_bias" in name: - name = name.replace("attn_c_proj", "attention_wo") + elif "attn.c_proj.bias" in name: + name = name.replace("attn.c_proj", "attn.c_attn.o_proj") params.detach().cpu().numpy().tofile(os.path.join(dst_folder, name)) - elif "c_proj_weight" in name: - name = name.replace("attn_c_proj", "attention_wo") + elif "attn.c_proj.weight" in name: + name = name.replace("attn.c_proj", "attn.c_attn.o_proj") params.detach().cpu().numpy().tofile(os.path.join(dst_folder, name)) else: params.detach().cpu().numpy().tofile(os.path.join(dst_folder, name)) model.lm_head.weight.detach().cpu().numpy().tofile( - os.path.join(dst_folder, "lm_head_weight") + os.path.join(dst_folder, "lm_head.weight") ) diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py index da2f1246a2..e20a8760cf 100644 --- a/python/flexflow/serve/serve.py +++ b/python/flexflow/serve/serve.py @@ -186,9 +186,11 @@ def 
download_hf_weights_if_needed(self): os.path.expanduser(self.cache_path), "weights", self.model_name.lower(), - "full-precision" - if self.data_type == DataType.DT_FLOAT - else "half-precision", + ( + "full-precision" + if self.data_type == DataType.DT_FLOAT + else "half-precision" + ), ) if self.refresh_cache: print( @@ -219,9 +221,11 @@ def download_hf_weights_if_needed(self): hf_model = AutoModelForCausalLM.from_pretrained( self.model_name, trust_remote_code=True, - torch_dtype=torch.float32 - if self.data_type == DataType.DT_FLOAT - else torch.float16, + torch_dtype=( + torch.float32 + if self.data_type == DataType.DT_FLOAT + else torch.float16 + ), ) # Print log message to notify user download of model has finished if not os.path.exists(self.model_name) or os.path.isdir(self.model_name): @@ -575,11 +579,13 @@ def download_hf_config(self): print(f"Creating directory {self.config_dir} (if it doesn't exist)...") print(f"Saving {self.peft_model_id} configs to file {self.config_path}...") with open(self.config_path, "w") as json_file: + class SetEncoder(json.JSONEncoder): def default(self, obj): if isinstance(obj, set): return list(obj) return super().default(obj) + json.dump(self.hf_config.to_dict(), json_file, indent=2, cls=SetEncoder) def __get_revision_hashes(self, peft_model_id: str): @@ -619,9 +625,11 @@ def download_hf_weights_if_needed(self): os.path.expanduser(self.cache_path), "weights", self.peft_model_id.lower(), - "full-precision" - if self.data_type == DataType.DT_FLOAT - else "half-precision", + ( + "full-precision" + if self.data_type == DataType.DT_FLOAT + else "half-precision" + ), ) if self.refresh_cache: print( @@ -658,9 +666,11 @@ def download_hf_weights_if_needed(self): self.hf_config.base_model_name_or_path, return_dict=True, trust_remote_code=True, - torch_dtype=torch.float32 - if self.data_type == DataType.DT_FLOAT - else torch.float16, + torch_dtype=( + torch.float32 + if self.data_type == DataType.DT_FLOAT + else torch.float16 + ), # device_map="auto", ) hf_peft_model = PeftModel.from_pretrained(hf_base_model, self.peft_model_id) From 583cb28df18726d520d86b0cceb0fd926fbf0bc1 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Tue, 20 Feb 2024 18:20:57 +0000 Subject: [PATCH 03/32] fix --- src/runtime/file_loader.cc | 31 ++++++++++++++----------------- 1 file changed, 14 insertions(+), 17 deletions(-) diff --git a/src/runtime/file_loader.cc b/src/runtime/file_loader.cc index fa19c9b22d..dfa3748b9a 100644 --- a/src/runtime/file_loader.cc +++ b/src/runtime/file_loader.cc @@ -219,10 +219,10 @@ void load_attention_weights_v2(DT *ptr, int tensor_parallelism_degree) { // layers_0_attention_wq_weight // layers_0_self_attn_q_proj_weight - std::string q_file = layer_name + "_wq_weight"; - std::string k_file = layer_name + "_wk_weight"; - std::string v_file = layer_name + "_wv_weight"; - std::string o_file = layer_name + "_wo_weight"; + std::string q_file = layer_name + ".q_proj.weight"; + std::string k_file = layer_name + ".k_proj.weight"; + std::string v_file = layer_name + ".v_proj.weight"; + std::string o_file = layer_name + ".o_proj.weight"; std::vector weight_filenames = {q_file, k_file, v_file}; int file_index = 0; @@ -409,10 +409,10 @@ void load_attention_weights_quantized(char *ptr, bool use_full_precision) { // layers_0_attention_wq_weight // layers_0_self_attn_q_proj_weight - std::string q_file = layer_name + "_wq_weight"; - std::string k_file = layer_name + "_wk_weight"; - std::string v_file = layer_name + "_wv_weight"; - std::string o_file = layer_name + 
"_wo_weight"; + std::string q_file = layer_name + ".q_proj.weight"; + std::string k_file = layer_name + ".k_proj.weight"; + std::string v_file = layer_name + ".v_proj.weight"; + std::string o_file = layer_name + ".o_proj.weight"; std::vector weight_filenames = {q_file, k_file, v_file, o_file}; int file_index = 0; @@ -690,7 +690,7 @@ void FileDataLoader::load_quantization_weight(FFModel *ff, if (weight_idx > 0) { assert(weight_idx == 0 || weight_idx == 1); if (weight_filename != "embed_tokens_weight_lm_head") { - weight_filename += weight_idx == 0 ? "_weight" : "_bias"; + weight_filename += weight_idx == 0 ? ".weight" : ".bias"; } } load_from_quantized_file(data, @@ -728,12 +728,9 @@ void FileDataLoader::load_single_weight_tensor(FFModel *ff, if (l->op_type == OP_INC_MULTIHEAD_SELF_ATTENTION || l->op_type == OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION || l->op_type == OP_TREE_INC_MULTIHEAD_SELF_ATTENTION) { - if (weight_filename.find("self_attention") != std::string::npos) { - load_attention_weights_multi_query( - data, weight_filename, weights_folder, hidden_dim, num_heads); - } else if (weight_filename.find("attention") != std::string::npos && - weight_filename.rfind("attention") == - weight_filename.length() - strlen("attention")) { + if (weight_filename.find("attention") != std::string::npos && + weight_filename.rfind("attention") == + weight_filename.length() - strlen("attention")) { if (weight_idx == 0) { load_attention_weights_v2(data, num_heads, @@ -765,7 +762,7 @@ void FileDataLoader::load_single_weight_tensor(FFModel *ff, assert(weight_idx >= 0 || weight_idx <= 2); weight_filename += (weight_idx == 0) ? "_attn_bias" - : ((weight_idx == 1) ? "_weight" : "_bias"); + : ((weight_idx == 1) ? ".weight" : ".bias"); std::cout << "Loading weight file " << weight_filename << std::endl; std::string weight_filepath = join_path({weights_folder, weight_filename}); load_from_file(data, volume, weight_filepath); @@ -774,7 +771,7 @@ void FileDataLoader::load_single_weight_tensor(FFModel *ff, assert(weight_idx == 0 || weight_idx == 1); // handle exception if (weight_filename != "embed_tokens_weight_lm_head") { - weight_filename += weight_idx == 0 ? "_weight" : "_bias"; + weight_filename += weight_idx == 0 ? 
".weight" : ".bias"; } std::cout << "Loading weight file " << weight_filename << std::endl; std::string weight_filepath = join_path({weights_folder, weight_filename}); From 38ed49c1fd09898e4a058f1554cc1cce731eca26 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 21 Feb 2024 21:23:05 +0000 Subject: [PATCH 04/32] fix --- src/runtime/file_loader.cc | 49 ++++++++++++++++---------------------- 1 file changed, 21 insertions(+), 28 deletions(-) diff --git a/src/runtime/file_loader.cc b/src/runtime/file_loader.cc index dfa3748b9a..596c441123 100644 --- a/src/runtime/file_loader.cc +++ b/src/runtime/file_loader.cc @@ -728,35 +728,28 @@ void FileDataLoader::load_single_weight_tensor(FFModel *ff, if (l->op_type == OP_INC_MULTIHEAD_SELF_ATTENTION || l->op_type == OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION || l->op_type == OP_TREE_INC_MULTIHEAD_SELF_ATTENTION) { - if (weight_filename.find("attention") != std::string::npos && - weight_filename.rfind("attention") == - weight_filename.length() - strlen("attention")) { - if (weight_idx == 0) { - load_attention_weights_v2(data, - num_heads, - num_kv_heads, - hidden_dim, - qkv_inner_dim, - weight_filename, - weights_folder, - volume, - tensor_parallelism_degree); - } else { - long long value; - l->get_int_property("final_bias", value); - bool final_bias = (bool)value; - load_attention_bias_v2(data, - num_heads, - num_kv_heads, - hidden_dim, - qkv_inner_dim, - final_bias, - weight_filename, - weights_folder); - } - + if (weight_idx == 0) { + load_attention_weights_v2(data, + num_heads, + num_kv_heads, + hidden_dim, + qkv_inner_dim, + weight_filename, + weights_folder, + volume, + tensor_parallelism_degree); } else { - assert(false); + long long value; + l->get_int_property("final_bias", value); + bool final_bias = (bool)value; + load_attention_bias_v2(data, + num_heads, + num_kv_heads, + hidden_dim, + qkv_inner_dim, + final_bias, + weight_filename, + weights_folder); } } else if (l->op_type == OP_ADD_BIAS_RESIDUAL_LAYERNORM) { assert(weight_idx >= 0 || weight_idx <= 2); From 0387d51616573f8c1e946d60f80c976b64d94672 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 21 Feb 2024 23:03:11 +0000 Subject: [PATCH 05/32] fix --- python/flexflow/serve/models/opt.py | 3 ++- src/runtime/file_loader.cc | 14 +++++--------- 2 files changed, 7 insertions(+), 10 deletions(-) diff --git a/python/flexflow/serve/models/opt.py b/python/flexflow/serve/models/opt.py index 51c76c520b..f725a08e65 100644 --- a/python/flexflow/serve/models/opt.py +++ b/python/flexflow/serve/models/opt.py @@ -281,7 +281,8 @@ def convert_hf_weight_name(name): return ( name.replace("decoder.", "") .replace("model.", "") - .replace("self_attn.wo.bias", "add_bias_residual_layer_norm.attn_bias") + .replace("self_attn.out_proj", "self_attn.o_proj") + .replace("self_attn.o_proj.bias", "add_bias_residual_layer_norm.attn_bias") .replace( ".final_layer_norm", ".add_bias_residual_layer_norm" ) # important to use the leading "_" to avoid matching the last LayerNorm diff --git a/src/runtime/file_loader.cc b/src/runtime/file_loader.cc index 596c441123..835012edc1 100644 --- a/src/runtime/file_loader.cc +++ b/src/runtime/file_loader.cc @@ -136,12 +136,12 @@ void load_attention_bias_v2(DT *ptr, bool final_bias, std::string layer_name, std::string weights_folder) { - std::string q_file = layer_name + "_wq_bias"; - std::string k_file = layer_name + "_wk_bias"; - std::string v_file = layer_name + "_wv_bias"; + std::string q_file = layer_name + ".q_proj.bias"; + std::string k_file = layer_name + ".k_proj.bias"; 
+ std::string v_file = layer_name + ".v_proj.bias"; std::vector bias_files = {q_file, k_file, v_file}; if (final_bias) { - std::string o_file = layer_name + "_wo_bias"; + std::string o_file = layer_name + ".o_proj.bias"; bias_files.push_back(o_file); } @@ -217,8 +217,6 @@ void load_attention_weights_v2(DT *ptr, std::string weights_folder, size_t volume, int tensor_parallelism_degree) { - // layers_0_attention_wq_weight - // layers_0_self_attn_q_proj_weight std::string q_file = layer_name + ".q_proj.weight"; std::string k_file = layer_name + ".k_proj.weight"; std::string v_file = layer_name + ".v_proj.weight"; @@ -407,8 +405,6 @@ void load_attention_weights_quantized(char *ptr, std::string weights_folder, DataType data_type, bool use_full_precision) { - // layers_0_attention_wq_weight - // layers_0_self_attn_q_proj_weight std::string q_file = layer_name + ".q_proj.weight"; std::string k_file = layer_name + ".k_proj.weight"; std::string v_file = layer_name + ".v_proj.weight"; @@ -754,7 +750,7 @@ void FileDataLoader::load_single_weight_tensor(FFModel *ff, } else if (l->op_type == OP_ADD_BIAS_RESIDUAL_LAYERNORM) { assert(weight_idx >= 0 || weight_idx <= 2); weight_filename += (weight_idx == 0) - ? "_attn_bias" + ? ".attn_bias" : ((weight_idx == 1) ? ".weight" : ".bias"); std::cout << "Loading weight file " << weight_filename << std::endl; std::string weight_filepath = join_path({weights_folder, weight_filename}); From 4f59521b2d8b41aacb9567daae7561d358ef036a Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Thu, 22 Feb 2024 03:56:30 +0000 Subject: [PATCH 06/32] backup --- python/flexflow/serve/serve.py | 172 ++++++++++++++++++++++++--------- 1 file changed, 129 insertions(+), 43 deletions(-) diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py index e20a8760cf..f052b21033 100644 --- a/python/flexflow/serve/serve.py +++ b/python/flexflow/serve/serve.py @@ -199,7 +199,7 @@ def download_hf_weights_if_needed(self): if os.path.exists(self.weights_path): shutil.rmtree(self.weights_path) os.makedirs(self.weights_path, exist_ok=True) - print(f"Creating directory {self.weights_path} (if it doesn't exist)...") + #print(f"Creating directory {self.weights_path} (if it doesn't exist)...") ff_revision, ff_revision_file, latest_revision = self.__get_revision_hashes( self.model_name, weights=True @@ -451,17 +451,6 @@ def stop_server(self): self.rm.stop_server() print("Background server stopped.") - def __enter__(self): - # Start the server when entering the context - # self.rm.start_server(self.model.ffmodel) - return self - - def __exit__(self, exc_type, exc_value, traceback): - # Stop the server when exiting the context - # self.rm.stop_server() - if exc_type: - print(f"Exception occurred: {exc_value}") - class SSM(LLM): """This class creates a SSM (Small-Speculative Model) object based on a model from HuggingFace""" @@ -539,18 +528,20 @@ def compile( ) -class PEFT: +class PEFT(LLM): """This class creates a PEFT (parameter-efficient transformer) object to be used in concert with a LLM or SSM""" def __init__( self, + base_model: LLM, peft_model_id: str, + config: PeftConfig = None, data_type: DataType = DataType.DT_HALF, cache_path: str = "", refresh_cache: bool = False, ): - self.hf_config = PeftConfig.from_pretrained(peft_model_id) self.peft_model_id = peft_model_id + self.hf_config = config if config is not None else PeftConfig.from_pretrained(peft_model_id) self.peft_type = self.hf_config.peft_type if self.peft_type != "LORA": raise RuntimeError( @@ -565,9 +556,9 @@ def 
__init__( raise ValueError( f"PEFT model {peft_model_id} does not have an associated based model" ) - self.base_model = LLM( - self.hf_config.base_model_name_or_path, data_type, cache_path, refresh_cache - ) + self.base_model = base_model + if refresh_cache: + self.base_model.refresh_cache = True def download_hf_config(self): """Save the HuggingFace model configs to a json file. Useful mainly to run the C++ inference code.""" @@ -587,25 +578,11 @@ def default(self, obj): return super().default(obj) json.dump(self.hf_config.to_dict(), json_file, indent=2, cls=SetEncoder) + + self.base_model.download_hf_config() def __get_revision_hashes(self, peft_model_id: str): - ff_revision = None - ff_revision_file = os.path.join(self.weights_path, "rev_sha.txt") - if os.path.exists(ff_revision_file): - ff_revision = "".join(open(ff_revision_file).read().split()) - - if os.path.exists(peft_model_id) and os.path.isdir(peft_model_id): - # Local model - files = os.listdir(peft_model_id) - state = files + [ - os.path.getmtime(os.path.join(peft_model_id, f)) for f in files - ] - latest_revision = hashlib.md5(str(state).encode("utf-8")).hexdigest() - else: - # Remote HuggingFace model - hf_api = HfApi() - latest_revision = hf_api.model_info(self.peft_model_id).sha - return ff_revision, ff_revision_file, latest_revision + return super().__get_revision_hashes(peft_model_id, weights=True) def convert_peft_model(self, hf_peft_model, weights_path): for name, params in hf_peft_model.named_parameters(): @@ -620,6 +597,8 @@ def download_hf_weights_if_needed(self): """Check in the folder specified by the cache_path whether the PEFT's model weights are available and up to date. If not, or if the refresh_cache parameter is set to True, download new weights. """ + self.base_model.download_hf_weights_if_needed() + # Use local cache, or download new version self.weights_path = os.path.join( os.path.expanduser(self.cache_path), @@ -638,7 +617,7 @@ def download_hf_weights_if_needed(self): if os.path.exists(self.weights_path): shutil.rmtree(self.weights_path) os.makedirs(self.weights_path, exist_ok=True) - print(f"Creating directory {self.weights_path} (if it doesn't exist)...") + #print(f"Creating directory {self.weights_path} (if it doesn't exist)...") ff_revision, ff_revision_file, latest_revision = self.__get_revision_hashes( self.peft_model_id @@ -649,21 +628,15 @@ def download_hf_weights_if_needed(self): if not os.path.exists(self.peft_model_id) or os.path.isdir( self.peft_model_id ): - # Local model print( f"'{self.peft_model_id}' model weights not found in cache or outdated. Downloading from huggingface.co ..." ) else: - # Remote model print( f"'{self.peft_model_id}' local model weights were updated! Converting new weights now..." 
) - # Download base model from HuggingFace, or load it from the local folder - self.base_model.download_hf_weights_if_needed() - self.base_model.download_hf_tokenizer_if_needed() - self.base_model.download_hf_config() hf_base_model = AutoModelForCausalLM.from_pretrained( - self.hf_config.base_model_name_or_path, + self.base_model.model_name, return_dict=True, trust_remote_code=True, torch_dtype=( @@ -673,7 +646,7 @@ def download_hf_weights_if_needed(self): ), # device_map="auto", ) - hf_peft_model = PeftModel.from_pretrained(hf_base_model, self.peft_model_id) + hf_peft_model = PeftModel.from_pretrained(hf_base_model, self.peft_model_id, config=self.hf_config) # Print log message to notify user download of model has finished if not os.path.exists(self.peft_model_id) or os.path.isdir( self.peft_model_id @@ -692,3 +665,116 @@ def download_hf_weights_if_needed(self): torch.cuda.empty_cache() else: print(f"Loading '{self.peft_model_id}' model weights from the cache...") + + def download_hf_tokenizer_if_needed(self): + self.base_model.download_hf_tokenizer_if_needed() + + def compile( + self, + generation_config: GenerationConfig = GenerationConfig(), + max_requests_per_batch: int = 1, + max_seq_length: int = 256, + max_tokens_per_batch: int = 64, + model_specific_data_parallelism_degree: int = None, + model_specific_tensor_parallelism_degree: int = None, + model_specific_pipeline_parallelism_degree: int = None, + ssms: list = [], + ): + self.base_model.ssms = ssms + self.base_model.generation_config = GenerationConfig() + self.base_model.ffconfig = FFConfig() + if len(ssms) > 0: + assert type(self.base_model) == LLM + mode = InferenceMode.TREE_VERIFY_MODE + elif type(self.base_model) == SSM: + mode = InferenceMode.BEAM_SEARCH_MODE + else: + assert type(self.base_model) == LLM + mode = InferenceMode.INC_DECODING_MODE + + # Apply model-specific parallelism degrees, if needed + if model_specific_data_parallelism_degree: + self.base_model.ffconfig.data_parallelism_degree = ( + model_specific_data_parallelism_degree + ) + if model_specific_tensor_parallelism_degree: + self.base_model.ffconfig.tensor_parallelism_degree = ( + model_specific_tensor_parallelism_degree + ) + if model_specific_pipeline_parallelism_degree: + self.base_model.ffconfig.pipeline_parallelism_degree = ( + model_specific_pipeline_parallelism_degree + ) + + # Create request manager and set serving configuration + self.base_model.rm = RequestManager() + self.base_model.rm.set_max_requests_per_batch(max_requests_per_batch) + self.base_model.rm.set_max_tokens_per_batch(max_tokens_per_batch) + self.base_model.rm.set_max_sequence_length(max_seq_length) + + # Instantiate the relevant model + self.base_model.model = self.model_class( + mode, + generation_config, + self.base_model.ffconfig, + self.base_model.hf_config, + self.base_model.data_type, + max_tokens_per_batch, + ) + + # TODO: add linear layers + + # Download the weights from huggingface (if needed) + self.download_hf_weights_if_needed() + + # Create file data loader, load weights into tensors + model_configs = self.base_model.config_class(self.base_model.hf_config) + + self.fileloader = FileDataLoader( + self.weights_path, + model_configs.num_attention_heads, + model_configs.num_key_value_heads, + model_configs.hidden_size, + model_configs.hidden_size // model_configs.num_attention_heads, + self.ffconfig.tensor_parallelism_degree, + self.data_type == DataType.DT_FLOAT, + ) + + # Register weights file loader + self.im = InferenceManager() + 
self.im.register_model_weights_loader(self.model.ffmodel, self.fileloader) + + # Download the tokenizer from huggingface (if needed) and load them + self.download_hf_tokenizer_if_needed() + + # Create tokenizer (this must be done after we have downloaded the tokenizer + bos_token_id = ( + -1 if self.hf_config.bos_token_id is None else self.hf_config.bos_token_id + ) + eos_token_id = ( + -1 if self.hf_config.eos_token_id is None else self.hf_config.eos_token_id + ) + self.rm.register_tokenizer( + self.model_type, bos_token_id, eos_token_id, self.tokenizer_path + ) + self.rm.register_output_filepath(self.output_file) + + for ssm in self.ssms: + self.rm.register_ssm_model(ssm.model.ffmodel) + + # start background server + if (mode == InferenceMode.TREE_VERIFY_MODE) or ( + mode == InferenceMode.INC_DECODING_MODE + ): + import atexit + + atexit.register(self.rm.stop_server) + + def generate(self, prompts: Union[str, List[str]], max_length: int = 128): + super().generate(prompts, max_length) + + def start_server(self): + self.base_model.start_server() + + def stop_server(self): + self.base_model.stop_server() From 37451d6a187d5a8f1205825a2a1988f1130d0110 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 23 Feb 2024 02:43:12 +0000 Subject: [PATCH 07/32] . --- include/flexflow/ffconst.h | 3 +- include/flexflow/model.h | 7 ++--- include/flexflow/ops/lora_linear_params.h | 7 +++-- inference/incr_decoding/incr_decoding.cc | 33 +++++++++++++------- inference/models/llama.cc | 7 +---- inference/models/opt.cc | 6 +--- python/flexflow/serve/serve.py | 2 +- src/ops/fused.cu | 6 ++-- src/ops/lora_linear.cc | 26 ++++++++++++++-- src/ops/lora_linear_params.cc | 37 +++++++++++++++++------ src/runtime/ffconst_utils.cc | 6 ++-- src/runtime/file_loader.cc | 2 +- src/runtime/graph.cc | 3 +- src/runtime/model.cc | 4 +-- src/runtime/request_manager.cc | 26 ++++++++-------- 15 files changed, 106 insertions(+), 69 deletions(-) diff --git a/include/flexflow/ffconst.h b/include/flexflow/ffconst.h index fb12adf2d3..66e252db46 100644 --- a/include/flexflow/ffconst.h +++ b/include/flexflow/ffconst.h @@ -179,8 +179,7 @@ enum OperatorType { OP_TREE_INC_MULTIHEAD_SELF_ATTENTION, OP_SAMPLING, // PEFT Ops - OP_LORA_MLP_FIRST, - OP_LORA_MLP_SECOND, + OP_LORA, // Parallel Ops OP_REPARTITION, OP_COMBINE, diff --git a/include/flexflow/model.h b/include/flexflow/model.h index 34ace0c5dc..2dd0dbf686 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -837,18 +837,16 @@ class FFModel { // ======================================== // PEFT Layers // ======================================== + void add_lora_layer(std::string target_module_name, char const *name); void lora_linear(Tensor const input, Tensor const output, - OperatorType _type, char const *name = nullptr); // ======================================== // Inference APIs // ======================================== std::vector generate(std::vector const &requests); - PEFTModelID register_peft_model( - LoraLinearConfig const mlp_first = LoraLinearConfig::DefaultConfig, - LoraLinearConfig const mlp_second = LoraLinearConfig::DefaultConfig); + PEFTModelID register_peft_model(LoraLinearConfig const peft_config); Tensor create_tensor_legion_ordering(int num_dim, int const dims[], @@ -1173,6 +1171,7 @@ class FFModel { std::vector layers; std::vector operators; + std::vector peft_operators; std::vector parameters; FFHandler handlers[MAX_NUM_WORKERS]; Legion::Future current_metrics; diff --git a/include/flexflow/ops/lora_linear_params.h 
b/include/flexflow/ops/lora_linear_params.h index e82243fd67..acbd9c3c67 100644 --- a/include/flexflow/ops/lora_linear_params.h +++ b/include/flexflow/ops/lora_linear_params.h @@ -12,12 +12,12 @@ namespace FlexFlow { class LoraLinearConfig { public: - static const LoraLinearConfig DefaultConfig; + static const LoraLinearConfig EmptyConfig; LoraLinearConfig(); LoraLinearConfig(int rank, OptimizerType type = OPTIMIZER_TYPE_SGD, float learning_rate = 1e-4); - LoraLinearConfig(std::string const &cache_folder_, + LoraLinearConfig(std::string const &config_folder_, std::string const &peft_model_id_); friend bool operator==(LoraLinearConfig const &lhs, LoraLinearConfig const &rhs); @@ -28,11 +28,12 @@ class LoraLinearConfig { int rank; OptimizerType optimizer_type; float learning_rate; - std::string cache_folder; + std::string config_folder; // Huggingface std::string peft_model_id; int lora_alpha; float lora_dropout; + std::vector target_modules; // whether to load weights from file, instead of initializing them randomly bool load_weights_from_file; }; diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc index d376c3e39c..502aa7fc6c 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -44,6 +44,7 @@ void parse_input_args(char **argv, bool &use_full_precision, bool &verbose, bool &do_sample, + bool &enable_peft, float &temperature, float &topp, int &max_requests_per_batch, @@ -58,6 +59,10 @@ void parse_input_args(char **argv, } continue; } + if (!strcmp(argv[i], "-enable-peft")) { + enable_peft = true; + continue; + } if (!strcmp(argv[i], "-peft-model")) { peft_model_name = std::string(argv[++i]); for (char &c : peft_model_name) { @@ -137,6 +142,7 @@ void FlexFlow::top_level_task(Task const *task, bool use_full_precision = false; bool verbose = false; bool do_sample = false; + bool enable_peft = false; float temperature = 0.0f; float topp = 0.0f; int max_requests_per_batch = 8; @@ -178,6 +184,14 @@ void FlexFlow::top_level_task(Task const *task, << std::endl; assert(false); } + if (enable_peft && peft_model_name.empty()) { + std::cout << "PEFT enabled, but no PEFT model id passed" << std::endl; + assert(false); + } else if (!enable_peft && !peft_model_name.empty()) { + std::cout << "PEFT model id passed, but PEFT is not enabled" << std::endl; + assert(false); + } + json model_config = json::parse(config_file_handle, /*parser_callback_t */ nullptr, /*allow_exceptions */ true, @@ -212,6 +226,9 @@ void FlexFlow::top_level_task(Task const *task, assert(model_type != ModelType::UNKNOWN && "Invalid LLM model type passed (or no type was passed)."); + // load PEFT config + LoraLinearConfig peft_config = peft_model_name.empty() ? LoraLinearConfig::EmptyConfig : LoraLinearConfig(file_paths.cache_folder_path, peft_model_name); + GenerationConfig generationConfig(do_sample, temperature, topp); RequestManager *rm = RequestManager::get_request_manager(); rm->set_max_requests_per_batch(max_requests_per_batch); @@ -259,17 +276,11 @@ void FlexFlow::top_level_task(Task const *task, assert(false && "unknow model type"); } - // Register PEFT layer - LoraLinearConfig mlp_second = - peft_model_name.empty() - ? LoraLinearConfig::DefaultConfig - : LoraLinearConfig(file_paths.cache_folder_path, peft_model_name); - PEFTModelID peft_model_id = - peft_model_name.empty() - ? 
PEFTModelID::NO_ID - : model.register_peft_model( - LoraLinearConfig::DefaultConfig /*mlp_first*/, - mlp_second /*mlp_second*/); + // Add PEFT layer + PEFTModelID peft_model_id = PEFTModelID::NO_ID; + if (!peft_model_name.empty()) { + peft_model_id = model.register_peft_model(peft_config); + } // Start background server rm->start_background_server(&model); diff --git a/inference/models/llama.cc b/inference/models/llama.cc index a7a1758cc3..fd788fa904 100644 --- a/inference/models/llama.cc +++ b/inference/models/llama.cc @@ -224,12 +224,7 @@ void LLAMA::create_llama_model(FFModel &ff, 0.0f, std::string("layers." + std::to_string(i) + ".mlp.down_proj").c_str()); // Low-Rank Adapter (LoRA) for the second linear layer - ff.lora_linear( - multi, - w2, - OP_LORA_MLP_SECOND, - std::string("layers." + std::to_string(i) + ".mlp.down_proj.lora") - .c_str()); + // ff.lora_linear(std::string("down_proj"), std::string("layers." + std::to_string(i) + ".mlp.down_proj.lora").c_str()); } // final normalization and linear Tensor final_rms_norm_output[2] = {nullptr, nullptr}; diff --git a/inference/models/opt.cc b/inference/models/opt.cc index 6d04ba47f2..bc22e1a8b7 100644 --- a/inference/models/opt.cc +++ b/inference/models/opt.cc @@ -220,11 +220,7 @@ void OPT::create_opt_model(FFModel &ff, 0.0f, std::string("layers." + std::to_string(i) + ".fc2").c_str()); // Low-Rank Adapter (LoRA) for the second linear layer - ff.lora_linear( - fc1, - fc2, - OP_LORA_MLP_SECOND, - std::string("layers." + std::to_string(i) + ".fc2.lora").c_str()); + // ff.lora_linear(std::string("fc2"), std::string("layers." + std::to_string(i) + ".fc2.lora").c_str()); } // final diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py index f052b21033..7e63b5055c 100644 --- a/python/flexflow/serve/serve.py +++ b/python/flexflow/serve/serve.py @@ -722,7 +722,7 @@ def compile( max_tokens_per_batch, ) - # TODO: add linear layers + # TODO: add peft layers # Download the weights from huggingface (if needed) self.download_hf_weights_if_needed() diff --git a/src/ops/fused.cu b/src/ops/fused.cu index 574fbcb573..aca93a973d 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -266,8 +266,7 @@ __host__ void batch_size); break; } - case OP_LORA_MLP_FIRST: - case OP_LORA_MLP_SECOND: { + case OP_LORA: { assert(fused->op_num_inputs[op] == 2); assert(fused->op_num_outputs[op] == 1); Domain input_domain = my_input_accessor[0].domain; @@ -910,8 +909,7 @@ __host__ void FusedOp::peft_bwd_task(Task const *task, num_peft_tokens); break; } - case OP_LORA_MLP_FIRST: - case OP_LORA_MLP_SECOND: { + case OP_LORA: { assert(fused->op_num_inputs[op] == 2); assert(fused->op_num_outputs[op] == 1); Domain input_domain = my_input_grad_accessor[0].domain; diff --git a/src/ops/lora_linear.cc b/src/ops/lora_linear.cc index 366eca27b7..906bb91b6c 100644 --- a/src/ops/lora_linear.cc +++ b/src/ops/lora_linear.cc @@ -38,14 +38,36 @@ using Legion::TaskLauncher; using namespace FlexFlow::Kernels::LoraLinear; +void FFModel::add_lora_layer(std::string target_module_name, + char const *name) { + assert(target_module_name.length() > 0 && "LoRA target module name is empty"); + + // find target layer, and ensure uniqueness. 
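+  // the target is found by a substring match on each Linear layer's name (e.g. "down_proj" matches "layers.0.mlp.down_proj")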
+ // if the target layer already has a LoRA layer, no need to add it again (keep track of layers with lora) + Layer *target_module = nullptr; + for (Layer *it : layers) { + if (it->op_type == OP_LINEAR && it->name != nullptr && strlen(it->name) > 0) { + std::string s(it->name); + if (s.find(target_module_name) != string::npos) { + // Check that this is the only layer with target name + if (target_module != nullptr) { + fprintf(stderr, "Error, found two layers containing LoRA target module name '%s'. Layer 1: %s, Layer 2: %s\n", + target_module_name.c_str(), target_module->name, it->name); + } + target_module = it; + } + } + } + lora_linear(target_module->inputs[0], target_module->outputs[0], name); +} + void FFModel::lora_linear(Tensor const input, Tensor const output, - OperatorType op_type, char const *name) { assert(input->data_type == output->data_type); Layer *lora = nullptr; lora = new Layer(this, - op_type, + OP_LORA, output->data_type, name, 2 /*inputs*/, diff --git a/src/ops/lora_linear_params.cc b/src/ops/lora_linear_params.cc index 9d797aaed2..0edeb03d2f 100644 --- a/src/ops/lora_linear_params.cc +++ b/src/ops/lora_linear_params.cc @@ -5,24 +5,24 @@ using json = nlohmann::json; namespace FlexFlow { -const LoraLinearConfig LoraLinearConfig::DefaultConfig = LoraLinearConfig(); +const LoraLinearConfig LoraLinearConfig::EmptyConfig = LoraLinearConfig(); LoraLinearConfig::LoraLinearConfig() : rank(0), optimizer_type(OPTIMIZER_TYPE_NONE), learning_rate(0.0f), - cache_folder(""), peft_model_id(""), lora_alpha(0), lora_dropout(0.0f), + config_folder(""), peft_model_id(""), lora_alpha(0), lora_dropout(0.0f), load_weights_from_file(false) {} LoraLinearConfig::LoraLinearConfig(int _rank, OptimizerType _type, float _lr) - : rank(_rank), optimizer_type(_type), learning_rate(_lr), cache_folder(""), + : rank(_rank), optimizer_type(_type), learning_rate(_lr), config_folder(""), peft_model_id(""), lora_alpha(0), lora_dropout(0.0f), load_weights_from_file(false) {} -LoraLinearConfig::LoraLinearConfig(std::string const &cache_folder_, +LoraLinearConfig::LoraLinearConfig(std::string const &config_folder_, std::string const &peft_model_id_) { - cache_folder = cache_folder_; + config_folder = config_folder_; peft_model_id = peft_model_id_; std::string peft_inference_config_file_path = - join_path({cache_folder, "configs", peft_model_id, "config.json"}); + join_path({config_folder, peft_model_id, "config.json"}); std::ifstream config_file(peft_inference_config_file_path); if (config_file.is_open()) { try { @@ -31,6 +31,9 @@ LoraLinearConfig::LoraLinearConfig(std::string const &cache_folder_, rank = model_config["r"]; lora_alpha = model_config["lora_alpha"]; lora_dropout = model_config["lora_dropout"]; + for (auto &s : model_config["target_modules"]) { + target_modules.push_back(s); + } } catch (json::exception const &e) { std::cerr << "Error parsing PEFT config from JSON file: " << e.what() << std::endl; @@ -48,21 +51,37 @@ LoraLinearConfig::LoraLinearConfig(std::string const &cache_folder_, bool operator==(LoraLinearConfig const &lhs, LoraLinearConfig const &rhs) { if (lhs.rank == rhs.rank && lhs.optimizer_type == rhs.optimizer_type && - lhs.learning_rate == rhs.learning_rate) { + lhs.learning_rate == rhs.learning_rate && lhs.config_folder == rhs.config_folder && + lhs.peft_model_id == rhs.peft_model_id && lhs.lora_alpha == rhs.lora_alpha && + lhs.lora_dropout == rhs.lora_dropout && lhs.target_modules.size() == rhs.target_modules.size() && + lhs.load_weights_from_file == rhs.load_weights_from_file) { 
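+    // the target_modules lists must also match element by element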
+ for (int i=0; iop_type == OP_LORA_MLP_FIRST || l->op_type == OP_LORA_MLP_SECOND) { + if (l->op_type == OP_LORA) { continue; } switch (weight->data_type) { diff --git a/src/runtime/graph.cc b/src/runtime/graph.cc index 31cf3bb6a7..dae0021bb6 100644 --- a/src/runtime/graph.cc +++ b/src/runtime/graph.cc @@ -2764,8 +2764,7 @@ void FFModel::deserialize_graph_optimal_view( node = Linear::deserialize(*this, dez, inputs, num_inputs); break; } - case OP_LORA_MLP_FIRST: - case OP_LORA_MLP_SECOND: { + case OP_LORA: { node = LoraLinear::deserialize(*this, dez, inputs, num_inputs); break; } diff --git a/src/runtime/model.cc b/src/runtime/model.cc index a64fb8ec9c..92340a92db 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -3308,10 +3308,10 @@ Op *FFModel::create_operator_from_layer( return op; } // PEFT layers - case OP_LORA_MLP_FIRST: - case OP_LORA_MLP_SECOND: { + case OP_LORA: { Op *op = LoraLinear::create_operator_from_layer(*this, layer, inputs); operators.push_back(op); + peft_operators.push_back(op); return op; } default: diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 41c371d4e2..3d71fa1e6b 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -2488,25 +2488,25 @@ std::string find_layer_name_from_guid(FFModel *model, LayerID guid) { bool is_peft_operator_type(OperatorType type) { switch (type) { - case OP_LORA_MLP_FIRST: - case OP_LORA_MLP_SECOND: + case OP_LORA: return true; default: return false; } } -PEFTModelID FFModel::register_peft_model(LoraLinearConfig const mlp_first, - LoraLinearConfig const mlp_second) { - if (!(mlp_first == LoraLinearConfig::DefaultConfig && - mlp_second == LoraLinearConfig::DefaultConfig)) { - if (!config.enable_peft) { - fprintf(stderr, - "Error: trying to register PEFT model, but peft mode is not " - "enabled.\n"); - assert(false); - } +PEFTModelID FFModel::register_peft_model(LoraLinearConfig const peft_config) { + if (peft_config == LoraLinearConfig::EmptyConfig) { + fprintf(stderr, "Error: trying to register empty PEFT model\n"); + assert(false); + } + if (!config.enable_peft) { + fprintf(stderr, + "Error: trying to register PEFT model, but peft mode is not " + "enabled.\n"); + assert(false); } + PEFTModelID peft_model_id(peft_model_global_guid++); InferenceManager *im = InferenceManager::get_inference_manager(); std::vector peft_operators; @@ -2526,7 +2526,7 @@ PEFTModelID FFModel::register_peft_model(LoraLinearConfig const mlp_first, std::string layer_name = find_layer_name_from_guid(this, peft_operators[op]->layer_guid); switch (peft_operators[op]->op_type) { - case OP_LORA_MLP_FIRST: { + case OP_LORA: { if (mlp_first == LoraLinearConfig::DefaultConfig) { // Do nothing for the default configuration continue; From 50bee5418f2b5c5f52fd633e6703d6ca54a61154 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 23 Feb 2024 22:04:28 +0000 Subject: [PATCH 08/32] update --- include/flexflow/model.h | 12 ++- include/flexflow/ops/lora_linear.h | 3 + include/flexflow/ops/lora_linear_params.h | 1 + inference/incr_decoding/incr_decoding.cc | 2 +- src/ops/lora_linear.cc | 119 ++++++++++++++++------ 5 files changed, 99 insertions(+), 38 deletions(-) diff --git a/include/flexflow/model.h b/include/flexflow/model.h index 2dd0dbf686..85b72505f6 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -837,10 +837,7 @@ class FFModel { // ======================================== // PEFT Layers // ======================================== - void add_lora_layer(std::string 
target_module_name, char const *name); - void lora_linear(Tensor const input, - Tensor const output, - char const *name = nullptr); + PEFTModelID FFModel::add_lora_layer(LoraLinearConfig const peft_config); // ======================================== // Inference APIs // ======================================== @@ -1171,8 +1168,13 @@ class FFModel { std::vector layers; std::vector operators; - std::vector peft_operators; std::vector parameters; + // PEFT related + std::unordered_map base_layer_to_peft_layer; + std::unordered_map> peft_layer_to_peft_id; + std::unordered_map peft_configs; +// std::vector peft_operators; + FFHandler handlers[MAX_NUM_WORKERS]; Legion::Future current_metrics; // Cached operators: key: operator hash, value: operator pointer diff --git a/include/flexflow/ops/lora_linear.h b/include/flexflow/ops/lora_linear.h index b9aabdd1aa..0aa14f9d39 100644 --- a/include/flexflow/ops/lora_linear.h +++ b/include/flexflow/ops/lora_linear.h @@ -22,6 +22,7 @@ class LoraLinear : public Op { OperatorType type, ParallelTensor const input, ParallelTensor const output, + std::unordered_map const &_peft_configs, char const *name = nullptr); LoraLinear(FFModel &model, LoraLinear const &other, @@ -98,6 +99,8 @@ class LoraLinear : public Op { int num_inputs) const override; // size_t get_params_hash() const override; LoraLinearParams get_params() const; + + std::unordered_map peft_configs; }; }; // namespace FlexFlow diff --git a/include/flexflow/ops/lora_linear_params.h b/include/flexflow/ops/lora_linear_params.h index acbd9c3c67..dfc78d0683 100644 --- a/include/flexflow/ops/lora_linear_params.h +++ b/include/flexflow/ops/lora_linear_params.h @@ -42,6 +42,7 @@ class LoraLinearParams { public: LayerID layer_guid; OperatorType type; + std::unordered_map peft_configs; char name[MAX_OPNAME]; bool is_valid(std::pair const diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc index 502aa7fc6c..6d1af3c17c 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -279,7 +279,7 @@ void FlexFlow::top_level_task(Task const *task, // Add PEFT layer PEFTModelID peft_model_id = PEFTModelID::NO_ID; if (!peft_model_name.empty()) { - peft_model_id = model.register_peft_model(peft_config); + peft_model_id = model.add_lora_layer(peft_config); } // Start background server diff --git a/src/ops/lora_linear.cc b/src/ops/lora_linear.cc index 906bb91b6c..5281e0df65 100644 --- a/src/ops/lora_linear.cc +++ b/src/ops/lora_linear.cc @@ -38,13 +38,14 @@ using Legion::TaskLauncher; using namespace FlexFlow::Kernels::LoraLinear; -void FFModel::add_lora_layer(std::string target_module_name, - char const *name) { +PEFTModelID FFModel::add_lora_layer(LoraLinearConfig const peft_config) { + assert(config.enable_peft && "Cannot add a LoRA layer if PEFT mode is not enabled"); assert(target_module_name.length() > 0 && "LoRA target module name is empty"); - + // find target layer, and ensure uniqueness. // if the target layer already has a LoRA layer, no need to add it again (keep track of layers with lora) Layer *target_module = nullptr; + int idx; for (Layer *it : layers) { if (it->op_type == OP_LINEAR && it->name != nullptr && strlen(it->name) > 0) { std::string s(it->name); @@ -53,50 +54,69 @@ void FFModel::add_lora_layer(std::string target_module_name, if (target_module != nullptr) { fprintf(stderr, "Error, found two layers containing LoRA target module name '%s'. 
Layer 1: %s, Layer 2: %s\n", target_module_name.c_str(), target_module->name, it->name); + assert(false); } target_module = it; } } + idx++; } - lora_linear(target_module->inputs[0], target_module->outputs[0], name); -} - -void FFModel::lora_linear(Tensor const input, - Tensor const output, - char const *name) { - assert(input->data_type == output->data_type); - Layer *lora = nullptr; - lora = new Layer(this, - OP_LORA, - output->data_type, - name, - 2 /*inputs*/, - 0 /*weights*/, - 1 /*outputs*/, - input, - output); - { - int numdims = output->num_dims; - int dims[MAX_TENSOR_DIM]; - for (int i = 0; i < numdims; i++) { - dims[i] = output->dims[i]; + PEFTModelID peft_model_id(peft_model_global_guid++); + peft_configs[peft_model_id] = peft_config; + + Layer *peft_layer = nullptr; + if (base_layer_to_peft_layer.find(target_module) != base_layer_to_peft_layer.end()) { + // lora linear layer already added, no need to add again + peft_layer = base_layer_to_peft_layer[target_module]; + peft_layer_to_peft_id[peft_layer].push_back(peft_model_id); + } else { + Tensor const input = target_module->inputs[0]; + Tensor const output = target_module->outputs[0]; + assert(input->data_type == output->data_type); + std::string name_ = target_module->name + ".lora"; + Layer *peft_layer = new Layer(this, + OP_LORA, + output->data_type, + name.c_str(), + 2 /*inputs*/, + 0 /*weights*/, + 1 /*outputs*/, + input, + output); + { + int numdims = output->num_dims; + int dims[MAX_TENSOR_DIM]; + for (int i = 0; i < numdims; i++) { + dims[i] = output->dims[i]; + } + peft_layer->outputs[0] = create_tensor_legion_ordering( + numdims, dims, output->data_type, peft_layer, 0, true /*create_grad*/); } - lora->outputs[0] = create_tensor_legion_ordering( - numdims, dims, output->data_type, lora, 0, true /*create_grad*/); + layers.insert(layers.begin() + idx + 1, peft_layer); + + base_layer_to_peft_layer[target_module] = peft_layer; + peft_layer_to_peft_id[peft_layer] = std::vector(); + peft_layer_to_peft_id[peft_layer].push_back(peft_model_id); } - layers.push_back(lora); + return peft_model_id; } Op *LoraLinear::create_operator_from_layer( FFModel &model, Layer const *layer, std::vector const &inputs) { + std::unordered_map _peft_configs, + std::vector const &peft_ids = model.peft_layer_to_peft_id[layer]; + for (int i=0; ilayer_guid, layer->op_type, inputs[0], inputs[1], - layer->name); + _peft_configs, + layer->name);; } LoraLinear::LoraLinear(FFModel &model, @@ -104,7 +124,7 @@ LoraLinear::LoraLinear(FFModel &model, ParallelTensor const input, ParallelTensor const output) : LoraLinear( - model, other.layer_guid, other.op_type, input, output, other.name) {} + model, other.layer_guid, other.op_type, input, output, other.peft_configs, other.name) {} LoraLinear::LoraLinear(FFModel &model, Params const ¶ms, @@ -115,6 +135,7 @@ LoraLinear::LoraLinear(FFModel &model, params.type, inputs.first, inputs.second, + params.peft_configs, params.name) {} LoraLinear::LoraLinear(FFModel &model, @@ -122,6 +143,7 @@ LoraLinear::LoraLinear(FFModel &model, OperatorType _op_type, ParallelTensor const _input, ParallelTensor const _output, + std::unordered_map _peft_configs, char const *name) : Op(model, _op_type, @@ -151,6 +173,9 @@ LoraLinear::LoraLinear(FFModel &model, outputs[0] = model.create_parallel_tensor_legion_ordering( numdim, dims, inputs[1]->data_type, this); } + for (const auto& kv : _peft_configs) { + peft_configs.insert(kv); + } // assert(check_output_input_weight_parallel_dims(allocate_weights)); } @@ -783,7 +808,16 @@ bool 
LoraLinear::measure_operator_cost(Simulator *sim, } bool operator==(LoraLinearParams const &lhs, LoraLinearParams const &rhs) { - return lhs.layer_guid == rhs.layer_guid && lhs.type == rhs.type; + if (lhs.layer_guid == rhs.layer_guid && lhs.type == rhs.type && lhs.peft_configs.size() == rhs.peft_configs.size()) { + for (const auto& kv : lhs.peft_configs) { + auto it = rhs.peft_configs.find(kv.first); + if (it == rhs.peft_configs.end() || it->second != kv.second) { + return false; + } + } + return true; + } + return false; } void LoraLinear::serialize(Legion::Serializer &sez) const { @@ -791,6 +825,11 @@ void LoraLinear::serialize(Legion::Serializer &sez) const { sez.serialize(this->layer_guid.transformer_layer_id); sez.serialize(this->layer_guid.model_id); sez.serialize(this->op_type); + sez.serialize(this->peft_configs.size()); + for (const auto& kv : this->peft_configs) { + sez.serialize(kv.first); + sez.serialize(kv.second); + } sez.serialize(strlen(this->name)); sez.serialize(this->name, strlen(this->name)); } @@ -804,17 +843,28 @@ Node LoraLinear::deserialize(FFModel &ff, assert(num_inputs == 2); size_t id, transformer_layer_id, deserialized_model_id; OperatorType op_type; + size_t num_pefts; + PEFTModelID peft_model_id; + LoraLinearConfig peft_config; size_t name_len; char name[MAX_OPNAME] = {0}; + + LoraLinearParams params; + dez.deserialize(id); dez.deserialize(transformer_layer_id); dez.deserialize(deserialized_model_id); dez.deserialize(op_type); + dez.deserialize(num_pefts); + for (int i=0; iname != nullptr) { strcpy(params.name, this->name); } + params.peft_configs = this->peft_configs; return params; } @@ -853,6 +904,10 @@ size_t hash::operator()( hash_combine(key, params.layer_guid.id); hash_combine(key, params.layer_guid.transformer_layer_id); hash_combine(key, params.layer_guid.model_id); + for (const auto& kv : params.peft_configs) { + hash_combine(key, kv.first); + hash_combine(key, kv.second); + } return key; } }; // namespace std From d2ad61a8e43d9f9f27e1263db2ec060a2a8b4817 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 23 Feb 2024 22:32:31 +0000 Subject: [PATCH 09/32] . 
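
Fold PEFT weight loading into LoraLinear::init_task: the weights for every
LoraLinearConfig attached to a LoraLinear layer are now loaded when the
operator is initialized, so FFModel::register_peft_model,
LoraLinear::register_peft_model, the register_model_task handler and the
LORA_LINEAR_REG_TASK_ID task variant can all be removed.

After this change an adapter is attached through FFModel::add_lora_layer
alone. A minimal usage sketch in the spirit of
inference/incr_decoding/incr_decoding.cc (the cache folder path and adapter
name are placeholders supplied on the command line):

    // hypothetical driver snippet; peft_model_name comes from -peft-model
    LoraLinearConfig peft_config =
        peft_model_name.empty()
            ? LoraLinearConfig::EmptyConfig
            : LoraLinearConfig(file_paths.cache_folder_path, peft_model_name);
    PEFTModelID peft_model_id = PEFTModelID::NO_ID;
    if (!peft_model_name.empty()) {
      // inserts a LoRA module after every target linear layer of the base model
      peft_model_id = model.add_lora_layer(peft_config);
    }
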
--- include/flexflow/model.h | 2 - include/flexflow/ops/lora_linear.h | 10 -- src/ops/lora_linear.cc | 244 ++++++++++++----------------- src/runtime/model.cc | 16 -- src/runtime/request_manager.cc | 91 ----------- 5 files changed, 97 insertions(+), 266 deletions(-) diff --git a/include/flexflow/model.h b/include/flexflow/model.h index 85b72505f6..cae888784c 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -843,8 +843,6 @@ class FFModel { // ======================================== std::vector generate(std::vector const &requests); - PEFTModelID register_peft_model(LoraLinearConfig const peft_config); - Tensor create_tensor_legion_ordering(int num_dim, int const dims[], DataType data_type, diff --git a/include/flexflow/ops/lora_linear.h b/include/flexflow/ops/lora_linear.h index 0aa14f9d39..579d6f06a8 100644 --- a/include/flexflow/ops/lora_linear.h +++ b/include/flexflow/ops/lora_linear.h @@ -40,11 +40,6 @@ class LoraLinear : public Op { MachineView const *mv = nullptr) override; void forward(FFModel const &) override; void backward(FFModel const &) override; - void register_peft_model(FFModel const &ff, - std::vector const &batch_inputs, - std::vector const &batch_outputs, - PEFTModelID const &model_id, - LoraLinearConfig const lora_config); Legion::FutureMap inference(FFModel const &, BatchConfigFuture const &, std::vector const &, @@ -65,11 +60,6 @@ class LoraLinear : public Op { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); - static void - register_model_task(Legion::Task const *task, - std::vector const ®ions, - Legion::Context ctx, - Legion::Runtime *runtime); static void inference_task(Legion::Task const *task, std::vector const ®ions, Legion::Context ctx, diff --git a/src/ops/lora_linear.cc b/src/ops/lora_linear.cc index 5281e0df65..2a9f83e11d 100644 --- a/src/ops/lora_linear.cc +++ b/src/ops/lora_linear.cc @@ -266,6 +266,103 @@ OpMeta *LoraLinear::init_task(Task const *task, std::strcpy(m->op_name, lora->name); m->layer_guid = lora->layer_guid; + int shard_id = task->index_point.point_data[0]; + int num_dims = lora->inputs[0]->num_dims; + int in_dim = lora->inputs[0]->dims[0].size / lora->inputs[0]->dims[0].degree; + int out_dim = lora->inputs[1]->dims[0].size / lora->inputs[1]->dims[0].degree; + + DataType dt = m->input_type[0]; + assert(dt == m->input_type[1]); + assert(dt == m->output_type[0]); + assert(dt == lora->inputs[0]->data_type); + assert(dt == lora->inputs[1]->data_type); + assert(dt == lora->outputs[0]->data_type); + + // get layer name + assert(lora->name != nullptr && "Layer name is not set, cannot determine weights location"); + std::string lora_layername = std::string(lora->name); + std::string searchString = "lora"; + size_t found = lora_layername.find(searchString); + if (found == std::string::npos) { + std::cout << "LoraLinear layer name not in the right format (does not " + "contain word 'lora')" + << std::endl; + assert(false); + } + std::string lora_layername_substr = + lora_layername.substr(0, found + searchString.length()); + + for (const auto& kv : lora->peft_configs) { + PEFTModelID &model_id = kv.first; + LoraLinearConfig &lora_config = kv.second; + + int rank = lora_config.rank; + + int w0_num_elements = rank * in_dim; + int w1_num_elements = rank * out_dim; + + LoraLinearWeight weight; + weight.in_dim = in_dim; + weight.out_dim = out_dim; + weight.rank = rank; + PEFTWeightAllocator *allocator = m->handle.peft_weight_allocator; + weight.w0_ptr = allocator->allocate_local_weights_untyped(model_id, 
w0_num_elements * data_type_size(dt)); + weight.w1_ptr = allocator->allocate_local_weights_untyped(model_id, w1_num_elements * data_type_size(dt)); + + // load weights from file + std::string weights_folder_filepath = join_path({ + lora_config.cache_folder, + "weights", + lora_config.peft_model_id, + dt == DT_FLOAT ? "full-precision" : "half-precision", + }); + std::string w0_filepath = + join_path({weights_folder_filepath, lora_layername_substr + "_A_weight"}); + std::string w1_filepath = + join_path({weights_folder_filepath, lora_layername_substr + "_B_weight"}); + if (dt == DT_FLOAT) { + std::cout << "Loading LORA weight " << lora_layername_substr + "_A_weight" + << ", size: " << w0_num_elements << ", shard: " << shard_id + << std::endl; + load_peft_from_file( + (float *)weight.w0_ptr, w0_num_elements, true, shard_id, w0_filepath); + std::cout << "Loading LORA weight " << lora_layername_substr + "_B_weight" + << ", size: " << w1_num_elements << ", shard: " << shard_id + << std::endl; + load_peft_from_file( + (float *)weight.w1_ptr, w1_num_elements, false, shard_id, w1_filepath); + } else if (dt == DT_HALF) { + std::cout << "Loading LORA weight " << lora_layername_substr + "_A_weight" + << ", size: " << w0_num_elements << ", shard: " << shard_id + << std::endl; + load_peft_from_file( + (half *)weight.w0_ptr, w0_num_elements, true, shard_id, w0_filepath); + std::cout << "Loading LORA weight " << lora_layername_substr + "_B_weight" + << ", size: " << w1_num_elements << ", shard: " << shard_id + << std::endl; + load_peft_from_file( + (half *)weight.w1_ptr, w1_num_elements, false, shard_id, w1_filepath); + } else { + assert(false && "Data type not supported"); + } + + if (lora->inputs[0]->dims[num_dims - 1].degree == 1) { + // Input is partitioned (no replication) + // w0_grad is local weight gradients + weight.w0_grad_ptr = allocator->allocate_local_weights_untyped(model_id, w0_num_elements * data_type_size(dt)); + // w1_grad is sync weight gradients + weight.w1_grad_ptr = allocator->allocate_sync_weights_untyped(model_id, w1_num_elements * data_type_size(dt)); + } else { + // Input is replicated + // w0_grad is sync weight gradients + weight.w0_grad_ptr = allocator->allocate_sync_weights_untyped(model_id, w0_num_elements * data_type_size(dt)); + // w1_grad is local weight gradients + weight.w1_grad_ptr = allocator->allocate_local_weights_untyped(model_id, w1_num_elements * data_type_size(dt)); + } + assert(m->model_weights.find(model_id) == m->model_weights.end()); + m->model_weights[model_id] = weight; + } + return m; } @@ -275,45 +372,6 @@ struct LoraLinearRegisterInfo { LoraLinearConfig lora_config; }; -void LoraLinear::register_peft_model( - FFModel const &ff, - std::vector const &batch_inputs, - std::vector const &batch_outputs, - PEFTModelID const &model_id, - LoraLinearConfig const lora_config) { - assert(check_output_input_weight_same_parallel_is()); - assert(batch_inputs.size() == 2); - assert(batch_outputs.size() == 1); - // Assert that the output and the second input are mapped to the same - // region/part - assert(batch_outputs[0]->region == batch_inputs[1]->region); - assert(batch_outputs[0]->part == batch_inputs[1]->part); - // assert(check_output_input_weight_same_machine_view()); - // output is considered as an input to allow in-place optimization - ParallelTensor output_tensor = batch_outputs[0]; - parallel_is = output_tensor->parallel_is; - ArgumentMap argmap; - Context ctx = ff.config.lg_ctx; - Runtime *runtime = ff.config.lg_hlr; - MachineView const *view = 
&output_tensor->machine_view; - size_t machine_view_hash = view->hash(); - set_argumentmap_for_inference(ff, argmap, output_tensor); - LoraLinearRegisterInfo info; - info.lora = this; - info.model_id = model_id; - info.lora_config = lora_config; - IndexLauncher launcher(LORA_LINEAR_REG_TASK_ID, - parallel_is, - TaskArgument(&info, sizeof(LoraLinearRegisterInfo)), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, - machine_view_hash); - FutureMap fm = runtime->execute_index_space(ctx, launcher); - fm.wait_all_results(); -} - template void load_peft_from_file( DT *ptr, size_t size, bool sharded, int shard_id, std::string filepath) { @@ -340,114 +398,6 @@ void load_peft_from_file( in.close(); } -void LoraLinear::register_model_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - LoraLinearRegisterInfo const *info = - static_cast(task->args); - LoraLinearMeta *m = *((LoraLinearMeta **)task->local_args); - LoraLinear const *lora = info->lora; - - int shard_id = task->index_point.point_data[0]; - - int rank = info->lora_config.rank; - int num_dims = lora->inputs[0]->num_dims; - int in_dim = lora->inputs[0]->dims[0].size / lora->inputs[0]->dims[0].degree; - int out_dim = lora->inputs[1]->dims[0].size / lora->inputs[1]->dims[0].degree; - int w0_num_elements = rank * in_dim; - int w1_num_elements = rank * out_dim; - - DataType dt = m->input_type[0]; - assert(dt == m->input_type[1]); - assert(dt == m->output_type[0]); - assert(dt == lora->inputs[0]->data_type); - assert(dt == lora->inputs[1]->data_type); - assert(dt == lora->outputs[0]->data_type); - assert(m->model_weights.find(info->model_id) == m->model_weights.end()); - - LoraLinearWeight weight; - weight.in_dim = in_dim; - weight.out_dim = out_dim; - weight.rank = rank; - PEFTWeightAllocator *allocator = m->handle.peft_weight_allocator; - weight.w0_ptr = allocator->allocate_local_weights_untyped( - info->model_id, w0_num_elements * data_type_size(dt)); - weight.w1_ptr = allocator->allocate_local_weights_untyped( - info->model_id, w1_num_elements * data_type_size(dt)); - - // get layer name - assert(lora->name != nullptr && - "Layer name is not set, cannot determine weights location"); - std::string lora_layername = std::string(lora->name); - std::string searchString = "lora"; - size_t found = lora_layername.find(searchString); - if (found == std::string::npos) { - std::cout << "LoraLinear layer name not in the right format (does not " - "contain word 'lora')" - << std::endl; - assert(false); - } - std::string lora_layername_substr = - lora_layername.substr(0, found + searchString.length()); - - // load weights from file - std::string weights_folder_filepath = join_path({ - info->lora_config.cache_folder, - "weights", - info->lora_config.peft_model_id, - dt == DT_FLOAT ? 
"full-precision" : "half-precision", - }); - std::string w0_filepath = - join_path({weights_folder_filepath, lora_layername_substr + "_A_weight"}); - std::string w1_filepath = - join_path({weights_folder_filepath, lora_layername_substr + "_B_weight"}); - if (dt == DT_FLOAT) { - std::cout << "Loading LORA weight " << lora_layername_substr + "_A_weight" - << ", size: " << w0_num_elements << ", shard: " << shard_id - << std::endl; - load_peft_from_file( - (float *)weight.w0_ptr, w0_num_elements, true, shard_id, w0_filepath); - std::cout << "Loading LORA weight " << lora_layername_substr + "_B_weight" - << ", size: " << w1_num_elements << ", shard: " << shard_id - << std::endl; - load_peft_from_file( - (float *)weight.w1_ptr, w1_num_elements, false, shard_id, w1_filepath); - } else if (dt == DT_HALF) { - std::cout << "Loading LORA weight " << lora_layername_substr + "_A_weight" - << ", size: " << w0_num_elements << ", shard: " << shard_id - << std::endl; - load_peft_from_file( - (half *)weight.w0_ptr, w0_num_elements, true, shard_id, w0_filepath); - std::cout << "Loading LORA weight " << lora_layername_substr + "_B_weight" - << ", size: " << w1_num_elements << ", shard: " << shard_id - << std::endl; - load_peft_from_file( - (half *)weight.w1_ptr, w1_num_elements, false, shard_id, w1_filepath); - } else { - assert(false && "Data type not supported"); - } - - if (lora->inputs[0]->dims[num_dims - 1].degree == 1) { - // Input is partitioned (no replication) - // w0_grad is local weight gradients - weight.w0_grad_ptr = allocator->allocate_local_weights_untyped( - info->model_id, w0_num_elements * data_type_size(dt)); - // w1_grad is sync weight gradients - weight.w1_grad_ptr = allocator->allocate_sync_weights_untyped( - info->model_id, w1_num_elements * data_type_size(dt)); - } else { - // Input is replicated - // w0_grad is sync weight gradients - weight.w0_grad_ptr = allocator->allocate_sync_weights_untyped( - info->model_id, w0_num_elements * data_type_size(dt)); - // w1_grad is local weight gradients - weight.w1_grad_ptr = allocator->allocate_local_weights_untyped( - info->model_id, w1_num_elements * data_type_size(dt)); - } - m->model_weights[info->model_id] = weight; -} - void LoraLinear::forward(FFModel const &ff) { assert(false && "LoraLinear does not support normal init"); } diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 92340a92db..ed5581ddd1 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -6697,22 +6697,6 @@ void register_flexflow_internal_tasks(Runtime *runtime, registrar); } } - { - TaskVariantRegistrar registrar(LORA_LINEAR_REG_TASK_ID, - "LoraLinear Model Registration"); - registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); - registrar.set_leaf(); - if (pre_register) { - Runtime::preregister_task_variant( - registrar, "LoraLinear Model Registration Task"); - } else { - if (enable_control_replication) { - registrar.global_registration = false; - } - runtime->register_task_variant( - registrar); - } - } { TaskVariantRegistrar registrar(LORA_LINEAR_INF_TASK_ID, "LoraLinear Inference"); diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 3d71fa1e6b..e9e21df52a 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -2495,97 +2495,6 @@ bool is_peft_operator_type(OperatorType type) { } } -PEFTModelID FFModel::register_peft_model(LoraLinearConfig const peft_config) { - if (peft_config == LoraLinearConfig::EmptyConfig) { - fprintf(stderr, "Error: trying to register empty PEFT model\n"); 
- assert(false); - } - if (!config.enable_peft) { - fprintf(stderr, - "Error: trying to register PEFT model, but peft mode is not " - "enabled.\n"); - assert(false); - } - - PEFTModelID peft_model_id(peft_model_global_guid++); - InferenceManager *im = InferenceManager::get_inference_manager(); - std::vector peft_operators; - for (size_t op = 0; op < operators.size(); op++) { - if (is_peft_operator_type(operators[op]->op_type)) { - peft_operators.push_back(operators[op]); - } else if (operators[op]->op_type == OP_FUSED) { - FusedOp *fused = static_cast(operators[op]); - for (size_t op2 = 0; op2 < fused->numOperators; op2++) { - if (is_peft_operator_type(fused->operators[op2]->op_type)) { - peft_operators.push_back(fused->operators[op2]); - } - } - } - } - for (size_t op = 0; op < peft_operators.size(); op++) { - std::string layer_name = - find_layer_name_from_guid(this, peft_operators[op]->layer_guid); - switch (peft_operators[op]->op_type) { - case OP_LORA: { - if (mlp_first == LoraLinearConfig::DefaultConfig) { - // Do nothing for the default configuration - continue; - } - LoraLinear *lora = static_cast(peft_operators[op]); - // Currently assume only a single data pipeline - assert(config.data_parallelism_degree == 1); - std::vector inputs(lora->numInputs); - std::vector outputs(lora->numOutputs); - - for (int i = 0; i < lora->numInputs; i++) { - assert(im->tensor_buffer.find(lora->inputs[i]) != - im->tensor_buffer.end()); - assert(lora->inputs[i] != nullptr); - assert(lora->inputs[i]->parallel_is != IndexSpace::NO_SPACE); - assert(im->tensor_buffer[lora->inputs[i]].size() == 1); - inputs[i] = im->tensor_buffer[lora->inputs[i]][0]; - assert(inputs[i]->parallel_is != IndexSpace::NO_SPACE); - } - assert(lora->numOutputs == 1); - outputs[0] = inputs[1]; - lora->register_peft_model( - *this, inputs, outputs, peft_model_id, mlp_first); - break; - } - case OP_LORA_MLP_SECOND: { - if (mlp_second == LoraLinearConfig::DefaultConfig) { - // Do nothing for the default configuration - continue; - } - LoraLinear *lora = static_cast(peft_operators[op]); - // Currently assume only a single data pipeline - assert(config.data_parallelism_degree == 1); - std::vector inputs(lora->numInputs); - std::vector outputs(lora->numOutputs); - - for (int i = 0; i < lora->numInputs; i++) { - assert(im->tensor_buffer.find(lora->inputs[i]) != - im->tensor_buffer.end()); - assert(lora->inputs[i] != nullptr); - assert(lora->inputs[i]->parallel_is != IndexSpace::NO_SPACE); - assert(im->tensor_buffer[lora->inputs[i]].size() == 1); - inputs[i] = im->tensor_buffer[lora->inputs[i]][0]; - assert(inputs[i]->parallel_is != IndexSpace::NO_SPACE); - } - assert(lora->numOutputs == 1); - outputs[0] = inputs[1]; - lora->register_peft_model( - *this, inputs, outputs, peft_model_id, mlp_second); - break; - } - default: { - assert(false && "Unsupported PEFT Operator type"); - } - } - } - return peft_model_id; -} - /*static*/ void RequestManager::serve_incr_decoding(FFModel *llm) { Context ctx = llm->config.lg_ctx; From b6539aaa6f5d50ff7f55c78d4abceb0a4af0d129 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 23 Feb 2024 23:17:26 +0000 Subject: [PATCH 10/32] fixes --- include/flexflow/model.h | 2 +- src/ops/lora_linear.cc | 185 +++++++++++++++++----------------- src/ops/lora_linear_params.cc | 6 +- 3 files changed, 96 insertions(+), 97 deletions(-) diff --git a/include/flexflow/model.h b/include/flexflow/model.h index cae888784c..36aaec30bc 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -837,7 +837,7 @@ 
class FFModel { // ======================================== // PEFT Layers // ======================================== - PEFTModelID FFModel::add_lora_layer(LoraLinearConfig const peft_config); + PEFTModelID add_lora_layer(LoraLinearConfig const peft_config); // ======================================== // Inference APIs // ======================================== diff --git a/src/ops/lora_linear.cc b/src/ops/lora_linear.cc index 2a9f83e11d..3f8bdb98ba 100644 --- a/src/ops/lora_linear.cc +++ b/src/ops/lora_linear.cc @@ -40,64 +40,70 @@ using namespace FlexFlow::Kernels::LoraLinear; PEFTModelID FFModel::add_lora_layer(LoraLinearConfig const peft_config) { assert(config.enable_peft && "Cannot add a LoRA layer if PEFT mode is not enabled"); - assert(target_module_name.length() > 0 && "LoRA target module name is empty"); - - // find target layer, and ensure uniqueness. - // if the target layer already has a LoRA layer, no need to add it again (keep track of layers with lora) - Layer *target_module = nullptr; - int idx; - for (Layer *it : layers) { - if (it->op_type == OP_LINEAR && it->name != nullptr && strlen(it->name) > 0) { - std::string s(it->name); - if (s.find(target_module_name) != string::npos) { - // Check that this is the only layer with target name - if (target_module != nullptr) { - fprintf(stderr, "Error, found two layers containing LoRA target module name '%s'. Layer 1: %s, Layer 2: %s\n", - target_module_name.c_str(), target_module->name, it->name); - assert(false); - } - target_module = it; - } - } - idx++; + if (peft_config.target_modules.size() == 0) { + printf("PEFT config does not contain any target module\n"); + return PEFTModelID::NO_ID; } PEFTModelID peft_model_id(peft_model_global_guid++); peft_configs[peft_model_id] = peft_config; - Layer *peft_layer = nullptr; - if (base_layer_to_peft_layer.find(target_module) != base_layer_to_peft_layer.end()) { - // lora linear layer already added, no need to add again - peft_layer = base_layer_to_peft_layer[target_module]; - peft_layer_to_peft_id[peft_layer].push_back(peft_model_id); - } else { - Tensor const input = target_module->inputs[0]; - Tensor const output = target_module->outputs[0]; - assert(input->data_type == output->data_type); - std::string name_ = target_module->name + ".lora"; - Layer *peft_layer = new Layer(this, - OP_LORA, - output->data_type, - name.c_str(), - 2 /*inputs*/, - 0 /*weights*/, - 1 /*outputs*/, - input, - output); - { - int numdims = output->num_dims; - int dims[MAX_TENSOR_DIM]; - for (int i = 0; i < numdims; i++) { - dims[i] = output->dims[i]; + for (std::string target_module_name : peft_config.target_modules) { + assert(target_module_name.length() > 0 && "LoRA target module name is empty"); + // find target layer, and ensure uniqueness. + // if the target layer already has a LoRA layer, no need to add it again (keep track of layers with lora) + Layer *target_module = nullptr; + int idx; + for (Layer *it : layers) { + if (it->op_type == OP_LINEAR && it->name != nullptr && strlen(it->name) > 0) { + std::string s(it->name); + if (s.find(target_module_name) != std::string::npos) { + // Check that this is the only layer with target name + if (target_module != nullptr) { + fprintf(stderr, "Error, found two layers containing LoRA target module name '%s'. 
Layer 1: %s, Layer 2: %s\n", + target_module_name.c_str(), target_module->name, it->name); + assert(false); + } + target_module = it; + } } - peft_layer->outputs[0] = create_tensor_legion_ordering( - numdims, dims, output->data_type, peft_layer, 0, true /*create_grad*/); + idx++; + } + Layer *peft_layer = nullptr; + if (base_layer_to_peft_layer.find(target_module) != base_layer_to_peft_layer.end()) { + // lora linear layer already added, no need to add again + peft_layer = base_layer_to_peft_layer[target_module]; + peft_layer_to_peft_id[peft_layer].push_back(peft_model_id); + } else { + Tensor const input = target_module->inputs[0]; + Tensor const output = target_module->outputs[0]; + assert(input->data_type == output->data_type); + std::string name_ = target_module->name ? std::string(target_module->name) : std::string(""); + name_ += ".lora"; + Layer *peft_layer = new Layer(this, + OP_LORA, + output->data_type, + name_.c_str(), + 2 /*inputs*/, + 0 /*weights*/, + 1 /*outputs*/, + input, + output); + { + int numdims = output->num_dims; + int dims[MAX_TENSOR_DIM]; + for (int i = 0; i < numdims; i++) { + dims[i] = output->dims[i]; + } + peft_layer->outputs[0] = create_tensor_legion_ordering( + numdims, dims, output->data_type, peft_layer, 0, true /*create_grad*/); + } + layers.insert(layers.begin() + idx + 1, peft_layer); + base_layer_to_peft_layer[target_module] = peft_layer; + peft_layer_to_peft_id[peft_layer] = std::vector(); + peft_layer_to_peft_id[peft_layer].push_back(peft_model_id); } - layers.insert(layers.begin() + idx + 1, peft_layer); - - base_layer_to_peft_layer[target_module] = peft_layer; - peft_layer_to_peft_id[peft_layer] = std::vector(); - peft_layer_to_peft_id[peft_layer].push_back(peft_model_id); } + return peft_model_id; } @@ -105,8 +111,8 @@ Op *LoraLinear::create_operator_from_layer( FFModel &model, Layer const *layer, std::vector const &inputs) { - std::unordered_map _peft_configs, - std::vector const &peft_ids = model.peft_layer_to_peft_id[layer]; + std::unordered_map _peft_configs; + std::vector const &peft_ids = model.peft_layer_to_peft_id[(Layer*)layer]; for (int i=0; i _peft_configs, + std::unordered_map const &_peft_configs, char const *name) : Op(model, _op_type, @@ -230,6 +236,32 @@ void LoraLinear::init_inference( set_opmeta_from_futuremap_inference(ff, fm, output_tensor); } +template +void load_peft_from_file( + DT *ptr, size_t size, bool sharded, int shard_id, std::string filepath) { + std::ifstream in(filepath, std::ios::in | std::ios::binary); + if (!in.good()) { + printf("Could not open file: %s\n", filepath.c_str()); + } + assert(in.good() && "incorrect weight file path"); + std::vector
host_array(size); + size_t target_data_size = sizeof(DT) * size; + in.seekg(sharded * shard_id * target_data_size, in.beg); + in.read((char *)host_array.data(), target_data_size); + + size_t in_get_size = in.gcount(); + if (in_get_size != target_data_size) { + printf("load weight data error: %lu, %lu, %lu\n", + in_get_size, + target_data_size, + sizeof(DT)); + assert(false); + } + assert(size == host_array.size()); + copy_tensor_host_to_dev(ptr, host_array.data(), size); + in.close(); +} + /* regions[0](O): output regions[1](I): kernel @@ -268,8 +300,8 @@ OpMeta *LoraLinear::init_task(Task const *task, int shard_id = task->index_point.point_data[0]; int num_dims = lora->inputs[0]->num_dims; - int in_dim = lora->inputs[0]->dims[0].size / lora->inputs[0]->dims[0].degree; - int out_dim = lora->inputs[1]->dims[0].size / lora->inputs[1]->dims[0].degree; + assert(in_dim == lora->inputs[0]->dims[0].size / lora->inputs[0]->dims[0].degree); + assert(out_dim == lora->inputs[1]->dims[0].size / lora->inputs[1]->dims[0].degree); DataType dt = m->input_type[0]; assert(dt == m->input_type[1]); @@ -293,8 +325,8 @@ OpMeta *LoraLinear::init_task(Task const *task, lora_layername.substr(0, found + searchString.length()); for (const auto& kv : lora->peft_configs) { - PEFTModelID &model_id = kv.first; - LoraLinearConfig &lora_config = kv.second; + PEFTModelID const &model_id = kv.first; + LoraLinearConfig const &lora_config = kv.second; int rank = lora_config.rank; @@ -311,8 +343,7 @@ OpMeta *LoraLinear::init_task(Task const *task, // load weights from file std::string weights_folder_filepath = join_path({ - lora_config.cache_folder, - "weights", + lora_config.config_folder, lora_config.peft_model_id, dt == DT_FLOAT ? "full-precision" : "half-precision", }); @@ -366,38 +397,6 @@ OpMeta *LoraLinear::init_task(Task const *task, return m; } -struct LoraLinearRegisterInfo { - LoraLinear const *lora; - PEFTModelID model_id; - LoraLinearConfig lora_config; -}; - -template -void load_peft_from_file( - DT *ptr, size_t size, bool sharded, int shard_id, std::string filepath) { - std::ifstream in(filepath, std::ios::in | std::ios::binary); - if (!in.good()) { - printf("Could not open file: %s\n", filepath.c_str()); - } - assert(in.good() && "incorrect weight file path"); - std::vector
host_array(size); - size_t target_data_size = sizeof(DT) * size; - in.seekg(sharded * shard_id * target_data_size, in.beg); - in.read((char *)host_array.data(), target_data_size); - - size_t in_get_size = in.gcount(); - if (in_get_size != target_data_size) { - printf("load weight data error: %lu, %lu, %lu\n", - in_get_size, - target_data_size, - sizeof(DT)); - assert(false); - } - assert(size == host_array.size()); - copy_tensor_host_to_dev(ptr, host_array.data(), size); - in.close(); -} - void LoraLinear::forward(FFModel const &ff) { assert(false && "LoraLinear does not support normal init"); } @@ -761,7 +760,7 @@ bool operator==(LoraLinearParams const &lhs, LoraLinearParams const &rhs) { if (lhs.layer_guid == rhs.layer_guid && lhs.type == rhs.type && lhs.peft_configs.size() == rhs.peft_configs.size()) { for (const auto& kv : lhs.peft_configs) { auto it = rhs.peft_configs.find(kv.first); - if (it == rhs.peft_configs.end() || it->second != kv.second) { + if (it == rhs.peft_configs.end() || !(it->second == kv.second)) { return false; } } diff --git a/src/ops/lora_linear_params.cc b/src/ops/lora_linear_params.cc index 0edeb03d2f..595743ac33 100644 --- a/src/ops/lora_linear_params.cc +++ b/src/ops/lora_linear_params.cc @@ -75,9 +75,9 @@ std::ostream &operator<<(std::ostream &os, LoraLinearConfig const &llc) { os << "lora_alpha: " << llc.lora_alpha << ", "; os << "lora_dropout: " << llc.lora_dropout << ", "; os << "target_modules: ["; - for (int i=0; i Date: Fri, 23 Feb 2024 23:25:14 +0000 Subject: [PATCH 11/32] fix --- src/runtime/model.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/src/runtime/model.cc b/src/runtime/model.cc index ed5581ddd1..63016d0c8b 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -3311,7 +3311,6 @@ Op *FFModel::create_operator_from_layer( case OP_LORA: { Op *op = LoraLinear::create_operator_from_layer(*this, layer, inputs); operators.push_back(op); - peft_operators.push_back(op); return op; } default: From 484a3cb2a08caa91c8ac110b75f92f92c91a11a9 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sun, 3 Mar 2024 02:04:12 +0000 Subject: [PATCH 12/32] fix build --- include/flexflow/model.h | 8 +- include/flexflow/ops/lora_linear.h | 15 +- inference/incr_decoding/incr_decoding.cc | 8 +- inference/models/llama.cc | 3 +- inference/models/opt.cc | 3 +- src/ops/lora_linear.cc | 241 ++++++++++++++++------- src/ops/lora_linear_params.cc | 15 +- 7 files changed, 203 insertions(+), 90 deletions(-) diff --git a/include/flexflow/model.h b/include/flexflow/model.h index 36aaec30bc..74421ffc92 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -1168,11 +1168,11 @@ class FFModel { std::vector operators; std::vector parameters; // PEFT related - std::unordered_map base_layer_to_peft_layer; - std::unordered_map> peft_layer_to_peft_id; + std::unordered_map base_layer_to_peft_layer; + std::unordered_map> peft_layer_to_peft_id; std::unordered_map peft_configs; -// std::vector peft_operators; - + // std::vector peft_operators; + FFHandler handlers[MAX_NUM_WORKERS]; Legion::Future current_metrics; // Cached operators: key: operator hash, value: operator pointer diff --git a/include/flexflow/ops/lora_linear.h b/include/flexflow/ops/lora_linear.h index 579d6f06a8..9e83c3f90e 100644 --- a/include/flexflow/ops/lora_linear.h +++ b/include/flexflow/ops/lora_linear.h @@ -17,13 +17,14 @@ class LoraLinear : public Op { using Params = LoraLinearParams; using Input = std::pair; - LoraLinear(FFModel &model, - LayerID const &layer_guid, - OperatorType type, - 
ParallelTensor const input, - ParallelTensor const output, - std::unordered_map const &_peft_configs, - char const *name = nullptr); + LoraLinear( + FFModel &model, + LayerID const &layer_guid, + OperatorType type, + ParallelTensor const input, + ParallelTensor const output, + std::unordered_map const &_peft_configs, + char const *name = nullptr); LoraLinear(FFModel &model, LoraLinear const &other, ParallelTensor const input, diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc index 6d1af3c17c..e7d4cf16fb 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -160,6 +160,7 @@ void FlexFlow::top_level_task(Task const *task, use_full_precision, verbose, do_sample, + enable_peft, temperature, topp, max_requests_per_batch, @@ -191,7 +192,7 @@ void FlexFlow::top_level_task(Task const *task, std::cout << "PEFT model id passed, but PEFT is not enabled" << std::endl; assert(false); } - + json model_config = json::parse(config_file_handle, /*parser_callback_t */ nullptr, /*allow_exceptions */ true, @@ -227,7 +228,10 @@ void FlexFlow::top_level_task(Task const *task, "Invalid LLM model type passed (or no type was passed)."); // load PEFT config - LoraLinearConfig peft_config = peft_model_name.empty() ? LoraLinearConfig::EmptyConfig : LoraLinearConfig(file_paths.cache_folder_path, peft_model_name); + LoraLinearConfig peft_config = + peft_model_name.empty() + ? LoraLinearConfig::EmptyConfig + : LoraLinearConfig(file_paths.cache_folder_path, peft_model_name); GenerationConfig generationConfig(do_sample, temperature, topp); RequestManager *rm = RequestManager::get_request_manager(); diff --git a/inference/models/llama.cc b/inference/models/llama.cc index fd788fa904..4be232e81b 100644 --- a/inference/models/llama.cc +++ b/inference/models/llama.cc @@ -224,7 +224,8 @@ void LLAMA::create_llama_model(FFModel &ff, 0.0f, std::string("layers." + std::to_string(i) + ".mlp.down_proj").c_str()); // Low-Rank Adapter (LoRA) for the second linear layer - // ff.lora_linear(std::string("down_proj"), std::string("layers." + std::to_string(i) + ".mlp.down_proj.lora").c_str()); + // ff.lora_linear(std::string("down_proj"), std::string("layers." + + // std::to_string(i) + ".mlp.down_proj.lora").c_str()); } // final normalization and linear Tensor final_rms_norm_output[2] = {nullptr, nullptr}; diff --git a/inference/models/opt.cc b/inference/models/opt.cc index bc22e1a8b7..b3f2ef4e17 100644 --- a/inference/models/opt.cc +++ b/inference/models/opt.cc @@ -220,7 +220,8 @@ void OPT::create_opt_model(FFModel &ff, 0.0f, std::string("layers." + std::to_string(i) + ".fc2").c_str()); // Low-Rank Adapter (LoRA) for the second linear layer - // ff.lora_linear(std::string("fc2"), std::string("layers." + std::to_string(i) + ".fc2.lora").c_str()); + // ff.lora_linear(std::string("fc2"), std::string("layers." 
+ + // std::to_string(i) + ".fc2.lora").c_str()); } // final diff --git a/src/ops/lora_linear.cc b/src/ops/lora_linear.cc index 3f8bdb98ba..8a54709df6 100644 --- a/src/ops/lora_linear.cc +++ b/src/ops/lora_linear.cc @@ -39,7 +39,8 @@ using Legion::TaskLauncher; using namespace FlexFlow::Kernels::LoraLinear; PEFTModelID FFModel::add_lora_layer(LoraLinearConfig const peft_config) { - assert(config.enable_peft && "Cannot add a LoRA layer if PEFT mode is not enabled"); + assert(config.enable_peft && + "Cannot add a LoRA layer if PEFT mode is not enabled"); if (peft_config.target_modules.size() == 0) { printf("PEFT config does not contain any target module\n"); return PEFTModelID::NO_ID; @@ -48,19 +49,26 @@ PEFTModelID FFModel::add_lora_layer(LoraLinearConfig const peft_config) { peft_configs[peft_model_id] = peft_config; for (std::string target_module_name : peft_config.target_modules) { - assert(target_module_name.length() > 0 && "LoRA target module name is empty"); + assert(target_module_name.length() > 0 && + "LoRA target module name is empty"); // find target layer, and ensure uniqueness. - // if the target layer already has a LoRA layer, no need to add it again (keep track of layers with lora) + // if the target layer already has a LoRA layer, no need to add it again + // (keep track of layers with lora) Layer *target_module = nullptr; int idx; for (Layer *it : layers) { - if (it->op_type == OP_LINEAR && it->name != nullptr && strlen(it->name) > 0) { + if (it->op_type == OP_LINEAR && it->name != nullptr && + strlen(it->name) > 0) { std::string s(it->name); if (s.find(target_module_name) != std::string::npos) { // Check that this is the only layer with target name if (target_module != nullptr) { - fprintf(stderr, "Error, found two layers containing LoRA target module name '%s'. Layer 1: %s, Layer 2: %s\n", - target_module_name.c_str(), target_module->name, it->name); + fprintf(stderr, + "Error, found two layers containing LoRA target module " + "name '%s'. Layer 1: %s, Layer 2: %s\n", + target_module_name.c_str(), + target_module->name, + it->name); assert(false); } target_module = it; @@ -69,7 +77,8 @@ PEFTModelID FFModel::add_lora_layer(LoraLinearConfig const peft_config) { idx++; } Layer *peft_layer = nullptr; - if (base_layer_to_peft_layer.find(target_module) != base_layer_to_peft_layer.end()) { + if (base_layer_to_peft_layer.find(target_module) != + base_layer_to_peft_layer.end()) { // lora linear layer already added, no need to add again peft_layer = base_layer_to_peft_layer[target_module]; peft_layer_to_peft_id[peft_layer].push_back(peft_model_id); @@ -77,25 +86,31 @@ PEFTModelID FFModel::add_lora_layer(LoraLinearConfig const peft_config) { Tensor const input = target_module->inputs[0]; Tensor const output = target_module->outputs[0]; assert(input->data_type == output->data_type); - std::string name_ = target_module->name ? std::string(target_module->name) : std::string(""); + std::string name_ = target_module->name ? 
std::string(target_module->name) + : std::string(""); name_ += ".lora"; Layer *peft_layer = new Layer(this, - OP_LORA, - output->data_type, - name_.c_str(), - 2 /*inputs*/, - 0 /*weights*/, - 1 /*outputs*/, - input, - output); + OP_LORA, + output->data_type, + name_.c_str(), + 2 /*inputs*/, + 0 /*weights*/, + 1 /*outputs*/, + input, + output); { int numdims = output->num_dims; int dims[MAX_TENSOR_DIM]; for (int i = 0; i < numdims; i++) { dims[i] = output->dims[i]; } - peft_layer->outputs[0] = create_tensor_legion_ordering( - numdims, dims, output->data_type, peft_layer, 0, true /*create_grad*/); + peft_layer->outputs[0] = + create_tensor_legion_ordering(numdims, + dims, + output->data_type, + peft_layer, + 0, + true /*create_grad*/); } layers.insert(layers.begin() + idx + 1, peft_layer); base_layer_to_peft_layer[target_module] = peft_layer; @@ -103,7 +118,7 @@ PEFTModelID FFModel::add_lora_layer(LoraLinearConfig const peft_config) { peft_layer_to_peft_id[peft_layer].push_back(peft_model_id); } } - + return peft_model_id; } @@ -112,9 +127,11 @@ Op *LoraLinear::create_operator_from_layer( Layer const *layer, std::vector const &inputs) { std::unordered_map _peft_configs; - std::vector const &peft_ids = model.peft_layer_to_peft_id[(Layer*)layer]; - for (int i=0; i const &peft_ids = + model.peft_layer_to_peft_id[(Layer *)layer]; + for (int i = 0; i < peft_ids.size(); i++) { + _peft_configs.emplace( + std::make_pair(peft_ids[i], model.peft_configs[peft_ids[i]])); } return new LoraLinear(model, layer->layer_guid, @@ -122,15 +139,21 @@ Op *LoraLinear::create_operator_from_layer( inputs[0], inputs[1], _peft_configs, - layer->name);; + layer->name); + ; } LoraLinear::LoraLinear(FFModel &model, LoraLinear const &other, ParallelTensor const input, ParallelTensor const output) - : LoraLinear( - model, other.layer_guid, other.op_type, input, output, other.peft_configs, other.name) {} + : LoraLinear(model, + other.layer_guid, + other.op_type, + input, + output, + other.peft_configs, + other.name) {} LoraLinear::LoraLinear(FFModel &model, Params const ¶ms, @@ -144,13 +167,14 @@ LoraLinear::LoraLinear(FFModel &model, params.peft_configs, params.name) {} -LoraLinear::LoraLinear(FFModel &model, - LayerID const &_layer_guid, - OperatorType _op_type, - ParallelTensor const _input, - ParallelTensor const _output, - std::unordered_map const &_peft_configs, - char const *name) +LoraLinear::LoraLinear( + FFModel &model, + LayerID const &_layer_guid, + OperatorType _op_type, + ParallelTensor const _input, + ParallelTensor const _output, + std::unordered_map const &_peft_configs, + char const *name) : Op(model, _op_type, _output->data_type, @@ -179,8 +203,8 @@ LoraLinear::LoraLinear(FFModel &model, outputs[0] = model.create_parallel_tensor_legion_ordering( numdim, dims, inputs[1]->data_type, this); } - for (const auto& kv : _peft_configs) { - peft_configs.insert(kv); + for (auto const &kv : _peft_configs) { + peft_configs.insert(kv); } // assert(check_output_input_weight_parallel_dims(allocate_weights)); } @@ -300,8 +324,10 @@ OpMeta *LoraLinear::init_task(Task const *task, int shard_id = task->index_point.point_data[0]; int num_dims = lora->inputs[0]->num_dims; - assert(in_dim == lora->inputs[0]->dims[0].size / lora->inputs[0]->dims[0].degree); - assert(out_dim == lora->inputs[1]->dims[0].size / lora->inputs[1]->dims[0].degree); + assert(in_dim == + lora->inputs[0]->dims[0].size / lora->inputs[0]->dims[0].degree); + assert(out_dim == + lora->inputs[1]->dims[0].size / lora->inputs[1]->dims[0].degree); DataType dt = 
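// Caller-side sketch of the API being wired up here, as the C++ drivers use it
// (the cache path variable and the adapter name below are placeholders):
//   LoraLinearConfig peft_config(file_paths.cache_folder_path,
//                                "goliaro/llama-2-7b-lora-half");
//   PEFTModelID peft_model_id = model.add_lora_layer(peft_config);
// add_lora_layer returns PEFTModelID::NO_ID when the config lists no target
// modules; otherwise the returned id is later attached to individual
// inference or fine-tuning requests.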
m->input_type[0]; assert(dt == m->input_type[1]); @@ -311,25 +337,26 @@ OpMeta *LoraLinear::init_task(Task const *task, assert(dt == lora->outputs[0]->data_type); // get layer name - assert(lora->name != nullptr && "Layer name is not set, cannot determine weights location"); + assert(lora->name != nullptr && + "Layer name is not set, cannot determine weights location"); std::string lora_layername = std::string(lora->name); std::string searchString = "lora"; size_t found = lora_layername.find(searchString); if (found == std::string::npos) { std::cout << "LoraLinear layer name not in the right format (does not " - "contain word 'lora')" + "contain word 'lora')" << std::endl; assert(false); } std::string lora_layername_substr = lora_layername.substr(0, found + searchString.length()); - for (const auto& kv : lora->peft_configs) { + for (auto const &kv : lora->peft_configs) { PEFTModelID const &model_id = kv.first; LoraLinearConfig const &lora_config = kv.second; - + int rank = lora_config.rank; - + int w0_num_elements = rank * in_dim; int w1_num_elements = rank * out_dim; @@ -338,8 +365,10 @@ OpMeta *LoraLinear::init_task(Task const *task, weight.out_dim = out_dim; weight.rank = rank; PEFTWeightAllocator *allocator = m->handle.peft_weight_allocator; - weight.w0_ptr = allocator->allocate_local_weights_untyped(model_id, w0_num_elements * data_type_size(dt)); - weight.w1_ptr = allocator->allocate_local_weights_untyped(model_id, w1_num_elements * data_type_size(dt)); + weight.w0_ptr = allocator->allocate_local_weights_untyped( + model_id, w0_num_elements * data_type_size(dt)); + weight.w1_ptr = allocator->allocate_local_weights_untyped( + model_id, w1_num_elements * data_type_size(dt)); // load weights from file std::string weights_folder_filepath = join_path({ @@ -347,10 +376,10 @@ OpMeta *LoraLinear::init_task(Task const *task, lora_config.peft_model_id, dt == DT_FLOAT ? 
"full-precision" : "half-precision", }); - std::string w0_filepath = - join_path({weights_folder_filepath, lora_layername_substr + "_A_weight"}); - std::string w1_filepath = - join_path({weights_folder_filepath, lora_layername_substr + "_B_weight"}); + std::string w0_filepath = join_path( + {weights_folder_filepath, lora_layername_substr + "_A_weight"}); + std::string w1_filepath = join_path( + {weights_folder_filepath, lora_layername_substr + "_B_weight"}); if (dt == DT_FLOAT) { std::cout << "Loading LORA weight " << lora_layername_substr + "_A_weight" << ", size: " << w0_num_elements << ", shard: " << shard_id @@ -360,8 +389,11 @@ OpMeta *LoraLinear::init_task(Task const *task, std::cout << "Loading LORA weight " << lora_layername_substr + "_B_weight" << ", size: " << w1_num_elements << ", shard: " << shard_id << std::endl; - load_peft_from_file( - (float *)weight.w1_ptr, w1_num_elements, false, shard_id, w1_filepath); + load_peft_from_file((float *)weight.w1_ptr, + w1_num_elements, + false, + shard_id, + w1_filepath); } else if (dt == DT_HALF) { std::cout << "Loading LORA weight " << lora_layername_substr + "_A_weight" << ", size: " << w0_num_elements << ", shard: " << shard_id @@ -380,15 +412,19 @@ OpMeta *LoraLinear::init_task(Task const *task, if (lora->inputs[0]->dims[num_dims - 1].degree == 1) { // Input is partitioned (no replication) // w0_grad is local weight gradients - weight.w0_grad_ptr = allocator->allocate_local_weights_untyped(model_id, w0_num_elements * data_type_size(dt)); + weight.w0_grad_ptr = allocator->allocate_local_weights_untyped( + model_id, w0_num_elements * data_type_size(dt)); // w1_grad is sync weight gradients - weight.w1_grad_ptr = allocator->allocate_sync_weights_untyped(model_id, w1_num_elements * data_type_size(dt)); + weight.w1_grad_ptr = allocator->allocate_sync_weights_untyped( + model_id, w1_num_elements * data_type_size(dt)); } else { // Input is replicated // w0_grad is sync weight gradients - weight.w0_grad_ptr = allocator->allocate_sync_weights_untyped(model_id, w0_num_elements * data_type_size(dt)); + weight.w0_grad_ptr = allocator->allocate_sync_weights_untyped( + model_id, w0_num_elements * data_type_size(dt)); // w1_grad is local weight gradients - weight.w1_grad_ptr = allocator->allocate_local_weights_untyped(model_id, w1_num_elements * data_type_size(dt)); + weight.w1_grad_ptr = allocator->allocate_local_weights_untyped( + model_id, w1_num_elements * data_type_size(dt)); } assert(m->model_weights.find(model_id) == m->model_weights.end()); m->model_weights[model_id] = weight; @@ -757,8 +793,9 @@ bool LoraLinear::measure_operator_cost(Simulator *sim, } bool operator==(LoraLinearParams const &lhs, LoraLinearParams const &rhs) { - if (lhs.layer_guid == rhs.layer_guid && lhs.type == rhs.type && lhs.peft_configs.size() == rhs.peft_configs.size()) { - for (const auto& kv : lhs.peft_configs) { + if (lhs.layer_guid == rhs.layer_guid && lhs.type == rhs.type && + lhs.peft_configs.size() == rhs.peft_configs.size()) { + for (auto const &kv : lhs.peft_configs) { auto it = rhs.peft_configs.find(kv.first); if (it == rhs.peft_configs.end() || !(it->second == kv.second)) { return false; @@ -775,9 +812,28 @@ void LoraLinear::serialize(Legion::Serializer &sez) const { sez.serialize(this->layer_guid.model_id); sez.serialize(this->op_type); sez.serialize(this->peft_configs.size()); - for (const auto& kv : this->peft_configs) { - sez.serialize(kv.first); - sez.serialize(kv.second); + for (auto const &kv : this->peft_configs) { + // Serialize PEFTModelID + 
sez.serialize(kv.first.id); + // Serialize LoraLinearConfig + sez.serialize(kv.second.rank); + sez.serialize(kv.second.optimizer_type); + sez.serialize(kv.second.learning_rate); + sez.serialize(kv.second.config_folder.length()); + sez.serialize(kv.second.config_folder.c_str(), + kv.second.config_folder.length()); + sez.serialize(kv.second.peft_model_id.length()); + sez.serialize(kv.second.peft_model_id.c_str(), + kv.second.peft_model_id.length()); + sez.serialize(kv.second.lora_alpha); + sez.serialize(kv.second.lora_dropout); + sez.serialize(kv.second.target_modules.size()); + sez.serialize(kv.second.load_weights_from_file); + for (int i = 0; i < kv.second.target_modules.size(); i++) { + sez.serialize(kv.second.target_modules[i].length()); + sez.serialize(kv.second.target_modules[i].c_str(), + kv.second.target_modules[i].length()); + } } sez.serialize(strlen(this->name)); sez.serialize(this->name, strlen(this->name)); @@ -793,22 +849,61 @@ Node LoraLinear::deserialize(FFModel &ff, size_t id, transformer_layer_id, deserialized_model_id; OperatorType op_type; size_t num_pefts; - PEFTModelID peft_model_id; - LoraLinearConfig peft_config; size_t name_len; char name[MAX_OPNAME] = {0}; - + LoraLinearParams params; - + dez.deserialize(id); dez.deserialize(transformer_layer_id); dez.deserialize(deserialized_model_id); dez.deserialize(op_type); dez.deserialize(num_pefts); - for (int i=0; i::operator()( hash_combine(key, params.layer_guid.id); hash_combine(key, params.layer_guid.transformer_layer_id); hash_combine(key, params.layer_guid.model_id); - for (const auto& kv : params.peft_configs) { - hash_combine(key, kv.first); - hash_combine(key, kv.second); + for (auto const &kv : params.peft_configs) { + hash_combine(key, kv.first.id); + hash_combine(key, kv.second.rank); + hash_combine(key, kv.second.optimizer_type); + hash_combine(key, kv.second.learning_rate); + hash_combine(key, kv.second.config_folder); + hash_combine(key, kv.second.peft_model_id); + hash_combine(key, kv.second.lora_alpha); + hash_combine(key, kv.second.lora_dropout); + hash_combine(key, kv.second.target_modules); + hash_combine(key, kv.second.load_weights_from_file); } return key; } diff --git a/src/ops/lora_linear_params.cc b/src/ops/lora_linear_params.cc index 595743ac33..771cf94906 100644 --- a/src/ops/lora_linear_params.cc +++ b/src/ops/lora_linear_params.cc @@ -32,7 +32,7 @@ LoraLinearConfig::LoraLinearConfig(std::string const &config_folder_, lora_alpha = model_config["lora_alpha"]; lora_dropout = model_config["lora_dropout"]; for (auto &s : model_config["target_modules"]) { - target_modules.push_back(s); + target_modules.push_back(s); } } catch (json::exception const &e) { std::cerr << "Error parsing PEFT config from JSON file: " << e.what() @@ -51,11 +51,14 @@ LoraLinearConfig::LoraLinearConfig(std::string const &config_folder_, bool operator==(LoraLinearConfig const &lhs, LoraLinearConfig const &rhs) { if (lhs.rank == rhs.rank && lhs.optimizer_type == rhs.optimizer_type && - lhs.learning_rate == rhs.learning_rate && lhs.config_folder == rhs.config_folder && - lhs.peft_model_id == rhs.peft_model_id && lhs.lora_alpha == rhs.lora_alpha && - lhs.lora_dropout == rhs.lora_dropout && lhs.target_modules.size() == rhs.target_modules.size() && + lhs.learning_rate == rhs.learning_rate && + lhs.config_folder == rhs.config_folder && + lhs.peft_model_id == rhs.peft_model_id && + lhs.lora_alpha == rhs.lora_alpha && + lhs.lora_dropout == rhs.lora_dropout && + lhs.target_modules.size() == rhs.target_modules.size() && 
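// The strings above travel through the Legion serializer as a length followed
// by the raw characters, and are read back into a fixed-size buffer on the
// other end. Two hypothetical helpers (not part of FlexFlow) that capture the
// convention used by serialize()/deserialize() in this file:
void serialize_string(Legion::Serializer &sez, std::string const &s) {
  sez.serialize(s.length());
  sez.serialize(s.c_str(), s.length());
}

std::string deserialize_string(Legion::Deserializer &dez) {
  size_t len = 0;
  char buffer[4096] = {0}; // same fixed bound as the deserializer in this file
  dez.deserialize(len);
  dez.deserialize(buffer, len);
  return std::string(buffer, len);
}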
lhs.load_weights_from_file == rhs.load_weights_from_file) { - for (int i=0; i Date: Sun, 3 Mar 2024 02:14:31 +0000 Subject: [PATCH 13/32] fix --- inference/incr_decoding/incr_decoding.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc index e7d4cf16fb..61d56e62e1 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -231,7 +231,7 @@ void FlexFlow::top_level_task(Task const *task, LoraLinearConfig peft_config = peft_model_name.empty() ? LoraLinearConfig::EmptyConfig - : LoraLinearConfig(file_paths.cache_folder_path, peft_model_name); + : LoraLinearConfig(join_path({file_paths.cache_folder_path, "configs"}), peft_model_name); GenerationConfig generationConfig(do_sample, temperature, topp); RequestManager *rm = RequestManager::get_request_manager(); From 6d41b29d99b112b3251dceb22c1700c9b85f415c Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sun, 3 Mar 2024 03:10:32 +0000 Subject: [PATCH 14/32] fix --- config/config.linux | 2 +- src/ops/lora_linear.cc | 125 +++++++++++++++++++++-------------------- 2 files changed, 65 insertions(+), 62 deletions(-) diff --git a/config/config.linux b/config/config.linux index 30edfa7dfe..4c70f95d3f 100755 --- a/config/config.linux +++ b/config/config.linux @@ -83,7 +83,7 @@ FF_MAX_DIM=${FF_MAX_DIM:-5} BUILD_LEGION_ONLY=${BUILD_LEGION_ONLY:-OFF} # set LEGION_MAX_RETURN_SIZE -LEGION_MAX_RETURN_SIZE=${LEGION_MAX_RETURN_SIZE:-262144} +LEGION_MAX_RETURN_SIZE=${LEGION_MAX_RETURN_SIZE:-2097152} # set ROCM path ROCM_PATH=${ROCM_PATH:-"/opt/rocm"} diff --git a/src/ops/lora_linear.cc b/src/ops/lora_linear.cc index 8a54709df6..3a597ad540 100644 --- a/src/ops/lora_linear.cc +++ b/src/ops/lora_linear.cc @@ -38,6 +38,16 @@ using Legion::TaskLauncher; using namespace FlexFlow::Kernels::LoraLinear; +bool check_lora_layer_match(Layer *potential_target, std::string target_module_name) { + if (potential_target->op_type == OP_LINEAR && potential_target->name != nullptr && strlen(potential_target->name) > 0) { + std::string s(potential_target->name); + if (s.find(target_module_name) != std::string::npos && s.find("lora") == std::string::npos) { + return true; + } + } + return false; +} + PEFTModelID FFModel::add_lora_layer(LoraLinearConfig const peft_config) { assert(config.enable_peft && "Cannot add a LoRA layer if PEFT mode is not enabled"); @@ -51,71 +61,64 @@ PEFTModelID FFModel::add_lora_layer(LoraLinearConfig const peft_config) { for (std::string target_module_name : peft_config.target_modules) { assert(target_module_name.length() > 0 && "LoRA target module name is empty"); - // find target layer, and ensure uniqueness. - // if the target layer already has a LoRA layer, no need to add it again - // (keep track of layers with lora) - Layer *target_module = nullptr; - int idx; - for (Layer *it : layers) { - if (it->op_type == OP_LINEAR && it->name != nullptr && - strlen(it->name) > 0) { - std::string s(it->name); - if (s.find(target_module_name) != std::string::npos) { - // Check that this is the only layer with target name - if (target_module != nullptr) { - fprintf(stderr, - "Error, found two layers containing LoRA target module " - "name '%s'. 
Layer 1: %s, Layer 2: %s\n", - target_module_name.c_str(), - target_module->name, - it->name); - assert(false); + // find target layer + for (auto it = layers.begin(); it != layers.end(); ++it) { + Layer *target_module = *it; + bool match = check_lora_layer_match(target_module, target_module_name); + if (!match) continue; + + if (base_layer_to_peft_layer.find(target_module) != + base_layer_to_peft_layer.end()) { + // lora linear layer already added, no need to add again + Layer *peft_layer = base_layer_to_peft_layer[target_module]; + peft_layer_to_peft_id[peft_layer].push_back(peft_model_id); + } else { + Tensor const input = target_module->inputs[0]; + Tensor const output = target_module->outputs[0]; + assert(input->data_type == output->data_type); + std::string name_ = target_module->name ? std::string(target_module->name) + : std::string(""); + size_t last_underscore = name_.length() - 1; + for (int i = name_.length() - 1; i > 0; i--) { + if (!(std::isdigit(target_module->name[i]) || target_module->name[i] == '_')) { + break; + } else if (target_module->name[i] == '_') { + last_underscore = i; } - target_module = it; } - } - idx++; - } - Layer *peft_layer = nullptr; - if (base_layer_to_peft_layer.find(target_module) != - base_layer_to_peft_layer.end()) { - // lora linear layer already added, no need to add again - peft_layer = base_layer_to_peft_layer[target_module]; - peft_layer_to_peft_id[peft_layer].push_back(peft_model_id); - } else { - Tensor const input = target_module->inputs[0]; - Tensor const output = target_module->outputs[0]; - assert(input->data_type == output->data_type); - std::string name_ = target_module->name ? std::string(target_module->name) - : std::string(""); - name_ += ".lora"; - Layer *peft_layer = new Layer(this, - OP_LORA, - output->data_type, - name_.c_str(), - 2 /*inputs*/, - 0 /*weights*/, - 1 /*outputs*/, - input, - output); - { - int numdims = output->num_dims; - int dims[MAX_TENSOR_DIM]; - for (int i = 0; i < numdims; i++) { - dims[i] = output->dims[i]; + name_.erase(last_underscore); + + name_ += ".lora"; + std::cout << "Adding layer " << name_ << std::endl; + Layer *peft_layer = new Layer(this, + OP_LORA, + output->data_type, + name_.c_str(), + 2 /*inputs*/, + 0 /*weights*/, + 1 /*outputs*/, + input, + output); + { + int numdims = output->num_dims; + int dims[MAX_TENSOR_DIM]; + for (int i = 0; i < numdims; i++) { + dims[i] = output->dims[i]; + } + peft_layer->outputs[0] = + create_tensor_legion_ordering(numdims, + dims, + output->data_type, + peft_layer, + 0, + true /*create_grad*/); } - peft_layer->outputs[0] = - create_tensor_legion_ordering(numdims, - dims, - output->data_type, - peft_layer, - 0, - true /*create_grad*/); + layers.insert(it + 1, peft_layer); + ++it; + base_layer_to_peft_layer[target_module] = peft_layer; + peft_layer_to_peft_id[peft_layer] = std::vector(); + peft_layer_to_peft_id[peft_layer].push_back(peft_model_id); } - layers.insert(layers.begin() + idx + 1, peft_layer); - base_layer_to_peft_layer[target_module] = peft_layer; - peft_layer_to_peft_id[peft_layer] = std::vector(); - peft_layer_to_peft_id[peft_layer].push_back(peft_model_id); } } From c5d735fe56efc556da33d8882c817bb6858894d3 Mon Sep 17 00:00:00 2001 From: april-yyt Date: Tue, 12 Mar 2024 19:30:16 +0000 Subject: [PATCH 15/32] fix issues for downloading peft model --- inference/utils/download_peft_model.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/inference/utils/download_peft_model.py b/inference/utils/download_peft_model.py index 
5c7704b6f0..ea4c96a05f 100644 --- a/inference/utils/download_peft_model.py +++ b/inference/utils/download_peft_model.py @@ -5,6 +5,9 @@ def parse_args(): parser = argparse.ArgumentParser() + parser.add_argument( + "--base_model_name", type=str, help="Name of the model to download" + ) parser.add_argument( "peft_model_ids", type=str, nargs="+", help="Name of the model(s) to download" ) @@ -44,7 +47,14 @@ def main(args): for peft_model_id in args.peft_model_ids: for data_type in data_types: + llm = ff.LLM( + args.base_model_name, + data_type=data_type, + cache_path=args.cache_folder, + refresh_cache=args.refresh_cache, + ) peft = ff.PEFT( + llm, peft_model_id, data_type=data_type, cache_path=args.cache_folder, From 5883eb635736a5f3684822aa221e15d84f6b34e3 Mon Sep 17 00:00:00 2001 From: april-yyt Date: Fri, 15 Mar 2024 04:35:25 +0000 Subject: [PATCH 16/32] solved issues for download peft model --- python/flexflow/serve/serve.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py index 7e63b5055c..4c0502a2e7 100644 --- a/python/flexflow/serve/serve.py +++ b/python/flexflow/serve/serve.py @@ -154,7 +154,7 @@ def download_hf_config(self): print(f"Saving {self.model_name} configs to file {self.config_path}...") self.hf_config.to_json_file(self.config_path) - def __get_revision_hashes(self, model_name: str, weights: bool): + def _get_revision_hashes(self, model_name: str, weights: bool): ff_revision = None ff_revision_file = ( os.path.join(self.weights_path, "rev_sha.txt") @@ -201,7 +201,7 @@ def download_hf_weights_if_needed(self): os.makedirs(self.weights_path, exist_ok=True) #print(f"Creating directory {self.weights_path} (if it doesn't exist)...") - ff_revision, ff_revision_file, latest_revision = self.__get_revision_hashes( + ff_revision, ff_revision_file, latest_revision = self._get_revision_hashes( self.model_name, weights=True ) @@ -541,6 +541,7 @@ def __init__( refresh_cache: bool = False, ): self.peft_model_id = peft_model_id + self.model_name = peft_model_id self.hf_config = config if config is not None else PeftConfig.from_pretrained(peft_model_id) self.peft_type = self.hf_config.peft_type if self.peft_type != "LORA": @@ -581,8 +582,9 @@ def default(self, obj): self.base_model.download_hf_config() - def __get_revision_hashes(self, peft_model_id: str): - return super().__get_revision_hashes(peft_model_id, weights=True) + def _get_revision_hashes(self, peft_model_id: str, weights: bool): + model_name = self.peft_model_id + return super()._get_revision_hashes(model_name, weights) def convert_peft_model(self, hf_peft_model, weights_path): for name, params in hf_peft_model.named_parameters(): @@ -619,8 +621,9 @@ def download_hf_weights_if_needed(self): os.makedirs(self.weights_path, exist_ok=True) #print(f"Creating directory {self.weights_path} (if it doesn't exist)...") - ff_revision, ff_revision_file, latest_revision = self.__get_revision_hashes( - self.peft_model_id + ff_revision, ff_revision_file, latest_revision = self._get_revision_hashes( + self.peft_model_id, + True ) # Download if needed From ce44ff95bde86606731dfcd07b2eea5ca7ac44fb Mon Sep 17 00:00:00 2001 From: april-yyt Date: Wed, 20 Mar 2024 22:40:32 +0000 Subject: [PATCH 17/32] added printouts for debugging --- src/runtime/inference_manager.cc | 33 ++++++++++++++++++++++++++++++++ src/runtime/request_manager.cc | 28 +++++++++++++++++++++++++++ 2 files changed, 61 insertions(+) diff --git a/src/runtime/inference_manager.cc 
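# With this change a PEFT adapter is no longer downloaded in isolation: the
# script first constructs the base ff.LLM and hands it to ff.PEFT. Roughly, the
# per-adapter flow looks like this (model names below are placeholders, and the
# optional cache/refresh arguments follow the script above):
import flexflow.serve as ff

llm = ff.LLM("meta-llama/Llama-2-7b-hf", data_type=ff.DataType.DT_HALF)
peft = ff.PEFT(llm, "goliaro/llama-2-7b-lora-half", data_type=ff.DataType.DT_HALF)
peft.download_hf_weights_if_needed()
peft.download_hf_config()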
b/src/runtime/inference_manager.cc index 91a6dab9b5..638ded2823 100644 --- a/src/runtime/inference_manager.cc +++ b/src/runtime/inference_manager.cc @@ -54,10 +54,28 @@ bool parallel_tensor_list_overlaps(std::vector const &list1, } void InferenceManager::compile_model_and_allocate_buffer(FFModel *model) { + + // Check if the model object exists + if (model == nullptr) { + std::cout << "###PEFT DEBUGGING### Model object does not exist." << std::endl; + return; // Early return to prevent further operations on a nullptr + } else { + std::cout << "###PEFT DEBUGGING### Model object exists." << std::endl; + } + // TODO: currently assume there is a single data-parallel pipeline // (i.e., data-parallel-degree == 1) assert(model->config.data_parallelism_degree == 1); model->config.batchSize = BatchConfig::max_tokens_per_batch(); + + // Check if the model object exists after importing config + if (model == nullptr) { + std::cout << "###PEFT DEBUGGING### Model object does not exist after setting config and batch size." << std::endl; + return; // Early return to prevent further operations on a nullptr + } else { + std::cout << "###PEFT DEBUGGING### Model object still exists." << std::endl; + } + model->compile_inference(); Context ctx = model->config.lg_ctx; Runtime *runtime = model->config.lg_hlr; @@ -609,17 +627,23 @@ void FFModel::set_position_offset(int offset) { } void FFModel::compile_inference() { + std::cout << "###PEFT DEBUGGING### Entering compile_inference." << std::endl; + // Request at least four CPU processors for inference runs assert( config.cpusPerNode >= 4 && "FlexFlow Serve requires at least four CPU cores per node, please add " "`-ll:cpu 4` in the command line if you are using the C++ interface or " "set `num_cpus` in `ff.init` if you are using the Python interface"); + + std::cout << "###PEFT DEBUGGING### Configuration check passed: At least four CPU cores per node." << std::endl; Context ctx = config.lg_ctx; Runtime *runtime = config.lg_hlr; config.computationMode = COMP_MODE_INFERENCE; create_operators_from_layers(); + // Launch the graph optimize task + std::cout << "###PEFT DEBUGGING### Launching graph optimization task." << std::endl; { FFModel *model = this; TaskLauncher launcher(GRAPH_OPTIMIZE_TASK_ID, @@ -670,6 +694,11 @@ void FFModel::compile_inference() { } } } + + std::cout << "###PEFT DEBUGGING### Operators reconstructed from optimized graph." << std::endl; + // Perform inplace optimizations + std::cout << "###PEFT DEBUGGING### Starting inplace optimizations." << std::endl; + loss_op = nullptr; metrics_op = nullptr; // Perform inplace optimizations @@ -709,6 +738,8 @@ void FFModel::compile_inference() { } } + // Output tensor mapping + std::cout << "###PEFT DEBUGGING### Mapping output tensors." << std::endl; for (size_t l = 0; l < operators.size(); l++) { Op *op = operators[l]; @@ -734,6 +765,7 @@ void FFModel::compile_inference() { } #ifdef FF_USE_NCCL + std::cout << "###PEFT DEBUGGING### Setting up NCCL communications." << std::endl; for (size_t l = 0; l < operators.size(); l++) { // Only create nccl for allreduce and fusedop for inference // (fusedop may include allreduces) @@ -770,6 +802,7 @@ void FFModel::compile_inference() { } } #endif + std::cout << "###PEFT DEBUGGING### compile_inference completed successfully." 
<< std::endl; } std::string join_path(std::vector const &paths) { diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index e9e21df52a..535278a3d9 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -2450,6 +2450,16 @@ void RequestManager::background_serving_task( std::vector const ®ions, Context ctx, Runtime *runtime) { + + + auto print_timestamped_message = [](const std::string& message) { + auto now = std::chrono::system_clock::to_time_t(std::chrono::system_clock::now()); + std::cout << std::put_time(std::localtime(&now), "%Y-%m-%d %X") << " - " << message << std::endl; + }; + + // Print at the start of the task + print_timestamped_message("###PEFT DEBUGGING### Starting background serving task."); + RequestManager *rm = RequestManager::get_request_manager(); FFModel *llm = *(FFModel **)task->args; { @@ -2466,6 +2476,11 @@ void RequestManager::background_serving_task( ssm->config.lg_ctx = ctx; } } + + // Checkpoint print + print_timestamped_message("###PEFT DEBUGGING### Updated models' configuration."); + + if (rm->get_num_ssms() == 0) { // No SSMs: perform incremental decoding rm->serve_incr_decoding(llm); @@ -2473,6 +2488,10 @@ void RequestManager::background_serving_task( // Registered SSMs: perform speculative inference rm->serve_spec_infer(llm); } + + // Print at the end of the task + print_timestamped_message("###PEFT DEBUGGING### Background serving task completed."); + } std::string find_layer_name_from_guid(FFModel *model, LayerID guid) { @@ -2497,6 +2516,15 @@ bool is_peft_operator_type(OperatorType type) { /*static*/ void RequestManager::serve_incr_decoding(FFModel *llm) { + + // Check if the model object exists + if (llm == nullptr) { + std::cout << "###PEFT DEBUGGING### LLM Model object does not exist." << std::endl; + return; // Early return to prevent further operations on a nullptr + } else { + std::cout << "###PEFT DEBUGGING### LLM Model object exists." 
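// A self-contained version of the timestamped-print helper introduced above,
// with the headers it relies on spelled out (the original lambda presumably
// picks them up transitively through the existing includes):
#include <chrono>
#include <ctime>
#include <iomanip>
#include <iostream>
#include <string>

void print_timestamped_message(std::string const &message) {
  auto now =
      std::chrono::system_clock::to_time_t(std::chrono::system_clock::now());
  std::cout << std::put_time(std::localtime(&now), "%Y-%m-%d %X") << " - "
            << message << std::endl;
}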
<< std::endl; + } + Context ctx = llm->config.lg_ctx; Runtime *runtime = llm->config.lg_hlr; // Compile the llm From 4adf6ea8ac6ea5ec4f96498446c88f7cbbbc0f76 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Thu, 21 Mar 2024 05:13:41 +0000 Subject: [PATCH 18/32] fix --- inference/utils/download_peft_model.py | 1 + python/flexflow/serve/serve.py | 10 +++++----- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/inference/utils/download_peft_model.py b/inference/utils/download_peft_model.py index ea4c96a05f..bc2ba59b30 100644 --- a/inference/utils/download_peft_model.py +++ b/inference/utils/download_peft_model.py @@ -62,6 +62,7 @@ def main(args): ) peft.download_hf_weights_if_needed() peft.download_hf_config() + peft.download_hf_tokenizer_if_needed() if __name__ == "__main__": diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py index 4c0502a2e7..9997527f0d 100644 --- a/python/flexflow/serve/serve.py +++ b/python/flexflow/serve/serve.py @@ -154,7 +154,7 @@ def download_hf_config(self): print(f"Saving {self.model_name} configs to file {self.config_path}...") self.hf_config.to_json_file(self.config_path) - def _get_revision_hashes(self, model_name: str, weights: bool): + def __get_revision_hashes(self, model_name: str, weights: bool): ff_revision = None ff_revision_file = ( os.path.join(self.weights_path, "rev_sha.txt") @@ -201,7 +201,7 @@ def download_hf_weights_if_needed(self): os.makedirs(self.weights_path, exist_ok=True) #print(f"Creating directory {self.weights_path} (if it doesn't exist)...") - ff_revision, ff_revision_file, latest_revision = self._get_revision_hashes( + ff_revision, ff_revision_file, latest_revision = self.__get_revision_hashes( self.model_name, weights=True ) @@ -582,9 +582,9 @@ def default(self, obj): self.base_model.download_hf_config() - def _get_revision_hashes(self, peft_model_id: str, weights: bool): + def __get_revision_hashes(self, peft_model_id: str, weights: bool): model_name = self.peft_model_id - return super()._get_revision_hashes(model_name, weights) + return self._LLM__get_revision_hashes(model_name, weights) def convert_peft_model(self, hf_peft_model, weights_path): for name, params in hf_peft_model.named_parameters(): @@ -621,7 +621,7 @@ def download_hf_weights_if_needed(self): os.makedirs(self.weights_path, exist_ok=True) #print(f"Creating directory {self.weights_path} (if it doesn't exist)...") - ff_revision, ff_revision_file, latest_revision = self._get_revision_hashes( + ff_revision, ff_revision_file, latest_revision = self.__get_revision_hashes( self.peft_model_id, True ) From 53e8919dc65b2e4735cbc609a23813d55fda3f36 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 22 Mar 2024 19:45:40 +0000 Subject: [PATCH 19/32] fix seg fault --- config/config.linux | 2 +- include/flexflow/ops/lora_linear_params.h | 4 +- inference/incr_decoding/incr_decoding.cc | 2 +- src/ops/lora_linear.cc | 97 +++++++++-------------- src/ops/lora_linear_params.cc | 14 ++-- src/runtime/inference_manager.cc | 31 +++++--- src/runtime/request_manager.cc | 23 +++--- 7 files changed, 81 insertions(+), 92 deletions(-) diff --git a/config/config.linux b/config/config.linux index 4c70f95d3f..30edfa7dfe 100755 --- a/config/config.linux +++ b/config/config.linux @@ -83,7 +83,7 @@ FF_MAX_DIM=${FF_MAX_DIM:-5} BUILD_LEGION_ONLY=${BUILD_LEGION_ONLY:-OFF} # set LEGION_MAX_RETURN_SIZE -LEGION_MAX_RETURN_SIZE=${LEGION_MAX_RETURN_SIZE:-2097152} +LEGION_MAX_RETURN_SIZE=${LEGION_MAX_RETURN_SIZE:-262144} # set ROCM path 
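# The serve.py change above works around Python name mangling: a method whose
# name starts with two underscores is stored as _<ClassName>__<method>, so the
# PEFT subclass cannot reach LLM.__get_revision_hashes through a plain
# attribute lookup and calls self._LLM__get_revision_hashes instead. A tiny
# illustration of the rule (the classes here are hypothetical):
class Base:
    def __hidden(self):
        return "base"

class Child(Base):
    def call(self):
        # Base.__hidden is mangled to _Base__hidden, which is the name to use.
        return self._Base__hidden()

assert Child().call() == "base"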
ROCM_PATH=${ROCM_PATH:-"/opt/rocm"} diff --git a/include/flexflow/ops/lora_linear_params.h b/include/flexflow/ops/lora_linear_params.h index dfc78d0683..ff041334f1 100644 --- a/include/flexflow/ops/lora_linear_params.h +++ b/include/flexflow/ops/lora_linear_params.h @@ -17,7 +17,7 @@ class LoraLinearConfig { LoraLinearConfig(int rank, OptimizerType type = OPTIMIZER_TYPE_SGD, float learning_rate = 1e-4); - LoraLinearConfig(std::string const &config_folder_, + LoraLinearConfig(std::string const &cache_folder_, std::string const &peft_model_id_); friend bool operator==(LoraLinearConfig const &lhs, LoraLinearConfig const &rhs); @@ -28,7 +28,7 @@ class LoraLinearConfig { int rank; OptimizerType optimizer_type; float learning_rate; - std::string config_folder; + std::string cache_folder; // Huggingface std::string peft_model_id; int lora_alpha; diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc index 61d56e62e1..e7d4cf16fb 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -231,7 +231,7 @@ void FlexFlow::top_level_task(Task const *task, LoraLinearConfig peft_config = peft_model_name.empty() ? LoraLinearConfig::EmptyConfig - : LoraLinearConfig(join_path({file_paths.cache_folder_path, "configs"}), peft_model_name); + : LoraLinearConfig(file_paths.cache_folder_path, peft_model_name); GenerationConfig generationConfig(do_sample, temperature, topp); RequestManager *rm = RequestManager::get_request_manager(); diff --git a/src/ops/lora_linear.cc b/src/ops/lora_linear.cc index 3a597ad540..39934f4cce 100644 --- a/src/ops/lora_linear.cc +++ b/src/ops/lora_linear.cc @@ -38,10 +38,13 @@ using Legion::TaskLauncher; using namespace FlexFlow::Kernels::LoraLinear; -bool check_lora_layer_match(Layer *potential_target, std::string target_module_name) { - if (potential_target->op_type == OP_LINEAR && potential_target->name != nullptr && strlen(potential_target->name) > 0) { +bool check_lora_layer_match(Layer *potential_target, + std::string target_module_name) { + if (potential_target->op_type == OP_LINEAR && + potential_target->name != nullptr && strlen(potential_target->name) > 0) { std::string s(potential_target->name); - if (s.find(target_module_name) != std::string::npos && s.find("lora") == std::string::npos) { + if (s.find(target_module_name) != std::string::npos && + s.find("lora") == std::string::npos) { return true; } } @@ -65,7 +68,9 @@ PEFTModelID FFModel::add_lora_layer(LoraLinearConfig const peft_config) { for (auto it = layers.begin(); it != layers.end(); ++it) { Layer *target_module = *it; bool match = check_lora_layer_match(target_module, target_module_name); - if (!match) continue; + if (!match) { + continue; + } if (base_layer_to_peft_layer.find(target_module) != base_layer_to_peft_layer.end()) { @@ -76,11 +81,13 @@ PEFTModelID FFModel::add_lora_layer(LoraLinearConfig const peft_config) { Tensor const input = target_module->inputs[0]; Tensor const output = target_module->outputs[0]; assert(input->data_type == output->data_type); - std::string name_ = target_module->name ? std::string(target_module->name) - : std::string(""); + std::string name_ = target_module->name + ? 
std::string(target_module->name) + : std::string(""); size_t last_underscore = name_.length() - 1; for (int i = name_.length() - 1; i > 0; i--) { - if (!(std::isdigit(target_module->name[i]) || target_module->name[i] == '_')) { + if (!(std::isdigit(target_module->name[i]) || + target_module->name[i] == '_')) { break; } else if (target_module->name[i] == '_') { last_underscore = i; @@ -375,21 +382,22 @@ OpMeta *LoraLinear::init_task(Task const *task, // load weights from file std::string weights_folder_filepath = join_path({ - lora_config.config_folder, + lora_config.cache_folder, + "weights", lora_config.peft_model_id, dt == DT_FLOAT ? "full-precision" : "half-precision", }); std::string w0_filepath = join_path( - {weights_folder_filepath, lora_layername_substr + "_A_weight"}); + {weights_folder_filepath, lora_layername_substr + "_A.weight"}); std::string w1_filepath = join_path( - {weights_folder_filepath, lora_layername_substr + "_B_weight"}); + {weights_folder_filepath, lora_layername_substr + "_B.weight"}); if (dt == DT_FLOAT) { - std::cout << "Loading LORA weight " << lora_layername_substr + "_A_weight" + std::cout << "Loading LORA weight " << lora_layername_substr + "_A.weight" << ", size: " << w0_num_elements << ", shard: " << shard_id << std::endl; load_peft_from_file( (float *)weight.w0_ptr, w0_num_elements, true, shard_id, w0_filepath); - std::cout << "Loading LORA weight " << lora_layername_substr + "_B_weight" + std::cout << "Loading LORA weight " << lora_layername_substr + "_B.weight" << ", size: " << w1_num_elements << ", shard: " << shard_id << std::endl; load_peft_from_file((float *)weight.w1_ptr, @@ -398,12 +406,12 @@ OpMeta *LoraLinear::init_task(Task const *task, shard_id, w1_filepath); } else if (dt == DT_HALF) { - std::cout << "Loading LORA weight " << lora_layername_substr + "_A_weight" + std::cout << "Loading LORA weight " << lora_layername_substr + "_A.weight" << ", size: " << w0_num_elements << ", shard: " << shard_id << std::endl; load_peft_from_file( (half *)weight.w0_ptr, w0_num_elements, true, shard_id, w0_filepath); - std::cout << "Loading LORA weight " << lora_layername_substr + "_B_weight" + std::cout << "Loading LORA weight " << lora_layername_substr + "_B.weight" << ", size: " << w1_num_elements << ", shard: " << shard_id << std::endl; load_peft_from_file( @@ -818,25 +826,14 @@ void LoraLinear::serialize(Legion::Serializer &sez) const { for (auto const &kv : this->peft_configs) { // Serialize PEFTModelID sez.serialize(kv.first.id); - // Serialize LoraLinearConfig - sez.serialize(kv.second.rank); - sez.serialize(kv.second.optimizer_type); - sez.serialize(kv.second.learning_rate); - sez.serialize(kv.second.config_folder.length()); - sez.serialize(kv.second.config_folder.c_str(), - kv.second.config_folder.length()); + // Serialize LoraConfig's cache folder + sez.serialize(kv.second.cache_folder.length()); + sez.serialize(kv.second.cache_folder.c_str(), + kv.second.cache_folder.length()); + // Serialize LoraConfig's peft model id sez.serialize(kv.second.peft_model_id.length()); sez.serialize(kv.second.peft_model_id.c_str(), kv.second.peft_model_id.length()); - sez.serialize(kv.second.lora_alpha); - sez.serialize(kv.second.lora_dropout); - sez.serialize(kv.second.target_modules.size()); - sez.serialize(kv.second.load_weights_from_file); - for (int i = 0; i < kv.second.target_modules.size(); i++) { - sez.serialize(kv.second.target_modules[i].length()); - sez.serialize(kv.second.target_modules[i].c_str(), - kv.second.target_modules[i].length()); - } } 
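// Note that only cache_folder and peft_model_id are serialized per adapter
// here; the deserializer below reconstructs the full LoraLinearConfig by
// re-reading config.json from the cache, so rank, lora_alpha, lora_dropout and
// target_modules never travel through Legion. The smaller payload is
// presumably what lets LEGION_MAX_RETURN_SIZE go back to its default in this
// same patch.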
sez.serialize(strlen(this->name)); sez.serialize(this->name, strlen(this->name)); @@ -867,44 +864,22 @@ Node LoraLinear::deserialize(FFModel &ff, size_t pid; dez.deserialize(pid); PEFTModelID peft_model_id(pid); - // Deserialize LoraLinearConfig - int rank; - OptimizerType optimizer_type; - float learning_rate; - dez.deserialize(rank); - dez.deserialize(optimizer_type); - dez.deserialize(learning_rate); - LoraLinearConfig lora_linear_config(rank, optimizer_type, learning_rate); + + // Deserialize LoraConfig's cache folder size_t string_size; char buffer[4096] = {0}; - // deserialize config_folder dez.deserialize(string_size); dez.deserialize(buffer, string_size); - lora_linear_config.config_folder = std::string(buffer); + std::string cache_folder = std::string(buffer); + + // Deserialize LoraConfig's peft model id string_size = 0; memset(buffer, 0, 4096); - // deserialize peft_model_id dez.deserialize(string_size); dez.deserialize(buffer, string_size); - lora_linear_config.peft_model_id = std::string(buffer); - string_size = 0; - memset(buffer, 0, 4096); - // deserialize lora_alpha and lora_dropout - dez.deserialize(lora_linear_config.lora_alpha); - dez.deserialize(lora_linear_config.lora_dropout); - // deserialize target_modules - size_t num_target_modules = 0; - dez.deserialize(num_target_modules); - for (int i = 0; i < num_target_modules; i++) { - dez.deserialize(string_size); - dez.deserialize(buffer, string_size); - lora_linear_config.target_modules.push_back(std::string(buffer)); - string_size = 0; - memset(buffer, 0, 4096); - } - // deserialize load_weights_from_file - dez.deserialize(lora_linear_config.load_weights_from_file); - // Append entry to list + std::string peft_model_name = std::string(buffer); + + LoraLinearConfig lora_linear_config(cache_folder, peft_model_name); params.peft_configs.emplace( std::make_pair(peft_model_id, lora_linear_config)); } @@ -956,7 +931,7 @@ size_t hash::operator()( hash_combine(key, kv.second.rank); hash_combine(key, kv.second.optimizer_type); hash_combine(key, kv.second.learning_rate); - hash_combine(key, kv.second.config_folder); + hash_combine(key, kv.second.cache_folder); hash_combine(key, kv.second.peft_model_id); hash_combine(key, kv.second.lora_alpha); hash_combine(key, kv.second.lora_dropout); diff --git a/src/ops/lora_linear_params.cc b/src/ops/lora_linear_params.cc index 771cf94906..1b142d5577 100644 --- a/src/ops/lora_linear_params.cc +++ b/src/ops/lora_linear_params.cc @@ -9,20 +9,20 @@ const LoraLinearConfig LoraLinearConfig::EmptyConfig = LoraLinearConfig(); LoraLinearConfig::LoraLinearConfig() : rank(0), optimizer_type(OPTIMIZER_TYPE_NONE), learning_rate(0.0f), - config_folder(""), peft_model_id(""), lora_alpha(0), lora_dropout(0.0f), + cache_folder(""), peft_model_id(""), lora_alpha(0), lora_dropout(0.0f), load_weights_from_file(false) {} LoraLinearConfig::LoraLinearConfig(int _rank, OptimizerType _type, float _lr) - : rank(_rank), optimizer_type(_type), learning_rate(_lr), config_folder(""), + : rank(_rank), optimizer_type(_type), learning_rate(_lr), cache_folder(""), peft_model_id(""), lora_alpha(0), lora_dropout(0.0f), load_weights_from_file(false) {} -LoraLinearConfig::LoraLinearConfig(std::string const &config_folder_, +LoraLinearConfig::LoraLinearConfig(std::string const &cache_folder_, std::string const &peft_model_id_) { - config_folder = config_folder_; + cache_folder = cache_folder_; peft_model_id = peft_model_id_; std::string peft_inference_config_file_path = - join_path({config_folder, peft_model_id, "config.json"}); + 
join_path({cache_folder, "configs", peft_model_id, "config.json"}); std::ifstream config_file(peft_inference_config_file_path); if (config_file.is_open()) { try { @@ -52,7 +52,7 @@ LoraLinearConfig::LoraLinearConfig(std::string const &config_folder_, bool operator==(LoraLinearConfig const &lhs, LoraLinearConfig const &rhs) { if (lhs.rank == rhs.rank && lhs.optimizer_type == rhs.optimizer_type && lhs.learning_rate == rhs.learning_rate && - lhs.config_folder == rhs.config_folder && + lhs.cache_folder == rhs.cache_folder && lhs.peft_model_id == rhs.peft_model_id && lhs.lora_alpha == rhs.lora_alpha && lhs.lora_dropout == rhs.lora_dropout && @@ -73,7 +73,7 @@ std::ostream &operator<<(std::ostream &os, LoraLinearConfig const &llc) { os << "rank: " << llc.rank << ", "; os << "optimizer_type: " << llc.optimizer_type << ", "; os << "learning_rate: " << llc.learning_rate << ", "; - os << "config_folder: " << llc.config_folder << ", "; + os << "cache_folder: " << llc.cache_folder << ", "; os << "peft_model_id: " << llc.peft_model_id << ", "; os << "lora_alpha: " << llc.lora_alpha << ", "; os << "lora_dropout: " << llc.lora_dropout << ", "; diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc index 638ded2823..212d0ebf6b 100644 --- a/src/runtime/inference_manager.cc +++ b/src/runtime/inference_manager.cc @@ -57,7 +57,8 @@ void InferenceManager::compile_model_and_allocate_buffer(FFModel *model) { // Check if the model object exists if (model == nullptr) { - std::cout << "###PEFT DEBUGGING### Model object does not exist." << std::endl; + std::cout << "###PEFT DEBUGGING### Model object does not exist." + << std::endl; return; // Early return to prevent further operations on a nullptr } else { std::cout << "###PEFT DEBUGGING### Model object exists." << std::endl; @@ -70,12 +71,14 @@ void InferenceManager::compile_model_and_allocate_buffer(FFModel *model) { // Check if the model object exists after importing config if (model == nullptr) { - std::cout << "###PEFT DEBUGGING### Model object does not exist after setting config and batch size." << std::endl; + std::cout << "###PEFT DEBUGGING### Model object does not exist after " + "setting config and batch size." + << std::endl; return; // Early return to prevent further operations on a nullptr } else { std::cout << "###PEFT DEBUGGING### Model object still exists." << std::endl; } - + model->compile_inference(); Context ctx = model->config.lg_ctx; Runtime *runtime = model->config.lg_hlr; @@ -628,7 +631,7 @@ void FFModel::set_position_offset(int offset) { void FFModel::compile_inference() { std::cout << "###PEFT DEBUGGING### Entering compile_inference." << std::endl; - + // Request at least four CPU processors for inference runs assert( config.cpusPerNode >= 4 && @@ -636,14 +639,17 @@ void FFModel::compile_inference() { "`-ll:cpu 4` in the command line if you are using the C++ interface or " "set `num_cpus` in `ff.init` if you are using the Python interface"); - std::cout << "###PEFT DEBUGGING### Configuration check passed: At least four CPU cores per node." << std::endl; + std::cout << "###PEFT DEBUGGING### Configuration check passed: At least four " + "CPU cores per node." + << std::endl; Context ctx = config.lg_ctx; Runtime *runtime = config.lg_hlr; config.computationMode = COMP_MODE_INFERENCE; create_operators_from_layers(); // Launch the graph optimize task - std::cout << "###PEFT DEBUGGING### Launching graph optimization task." << std::endl; + std::cout << "###PEFT DEBUGGING### Launching graph optimization task." 
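// Taken together with the weight paths used in lora_linear.cc, the cache
// folder passed to LoraLinearConfig is expected to be laid out roughly as
// follows (layer and model names are illustrative):
//   <cache_folder>/configs/<peft_model_id>/config.json
//   <cache_folder>/weights/<peft_model_id>/full-precision/  (or half-precision/)
//       layers.0.mlp.down_proj.lora_A.weight, layers.0.mlp.down_proj.lora_B.weight, ...
// where each file name is the matched base layer name plus the ".lora" suffix
// and the "_A"/"_B" adapter matrix tag.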
+ << std::endl; { FFModel *model = this; TaskLauncher launcher(GRAPH_OPTIMIZE_TASK_ID, @@ -695,9 +701,12 @@ void FFModel::compile_inference() { } } - std::cout << "###PEFT DEBUGGING### Operators reconstructed from optimized graph." << std::endl; + std::cout + << "###PEFT DEBUGGING### Operators reconstructed from optimized graph." + << std::endl; // Perform inplace optimizations - std::cout << "###PEFT DEBUGGING### Starting inplace optimizations." << std::endl; + std::cout << "###PEFT DEBUGGING### Starting inplace optimizations." + << std::endl; loss_op = nullptr; metrics_op = nullptr; @@ -765,7 +774,8 @@ void FFModel::compile_inference() { } #ifdef FF_USE_NCCL - std::cout << "###PEFT DEBUGGING### Setting up NCCL communications." << std::endl; + std::cout << "###PEFT DEBUGGING### Setting up NCCL communications." + << std::endl; for (size_t l = 0; l < operators.size(); l++) { // Only create nccl for allreduce and fusedop for inference // (fusedop may include allreduces) @@ -802,7 +812,8 @@ void FFModel::compile_inference() { } } #endif - std::cout << "###PEFT DEBUGGING### compile_inference completed successfully." << std::endl; + std::cout << "###PEFT DEBUGGING### compile_inference completed successfully." + << std::endl; } std::string join_path(std::vector const &paths) { diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 535278a3d9..31742bd826 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -2451,14 +2451,16 @@ void RequestManager::background_serving_task( Context ctx, Runtime *runtime) { - - auto print_timestamped_message = [](const std::string& message) { - auto now = std::chrono::system_clock::to_time_t(std::chrono::system_clock::now()); - std::cout << std::put_time(std::localtime(&now), "%Y-%m-%d %X") << " - " << message << std::endl; + auto print_timestamped_message = [](std::string const &message) { + auto now = + std::chrono::system_clock::to_time_t(std::chrono::system_clock::now()); + std::cout << std::put_time(std::localtime(&now), "%Y-%m-%d %X") << " - " + << message << std::endl; }; // Print at the start of the task - print_timestamped_message("###PEFT DEBUGGING### Starting background serving task."); + print_timestamped_message( + "###PEFT DEBUGGING### Starting background serving task."); RequestManager *rm = RequestManager::get_request_manager(); FFModel *llm = *(FFModel **)task->args; @@ -2478,8 +2480,8 @@ void RequestManager::background_serving_task( } // Checkpoint print - print_timestamped_message("###PEFT DEBUGGING### Updated models' configuration."); - + print_timestamped_message( + "###PEFT DEBUGGING### Updated models' configuration."); if (rm->get_num_ssms() == 0) { // No SSMs: perform incremental decoding @@ -2490,8 +2492,8 @@ void RequestManager::background_serving_task( } // Print at the end of the task - print_timestamped_message("###PEFT DEBUGGING### Background serving task completed."); - + print_timestamped_message( + "###PEFT DEBUGGING### Background serving task completed."); } std::string find_layer_name_from_guid(FFModel *model, LayerID guid) { @@ -2519,7 +2521,8 @@ void RequestManager::serve_incr_decoding(FFModel *llm) { // Check if the model object exists if (llm == nullptr) { - std::cout << "###PEFT DEBUGGING### LLM Model object does not exist." << std::endl; + std::cout << "###PEFT DEBUGGING### LLM Model object does not exist." + << std::endl; return; // Early return to prevent further operations on a nullptr } else { std::cout << "###PEFT DEBUGGING### LLM Model object exists." 
<< std::endl; From ebf8bd95340b224e5306ee33021406add10251ea Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 22 Mar 2024 20:30:24 +0000 Subject: [PATCH 20/32] add test, separate peft script in cpp --- CMakeLists.txt | 1 + inference/incr_decoding/incr_decoding.cc | 41 +-- inference/peft/CMakeLists.txt | 38 +++ inference/peft/Makefile | 37 +++ inference/peft/peft.cc | 325 +++++++++++++++++++++++ tests/peft_test.sh | 28 ++ 6 files changed, 435 insertions(+), 35 deletions(-) create mode 100644 inference/peft/CMakeLists.txt create mode 100644 inference/peft/Makefile create mode 100644 inference/peft/peft.cc create mode 100755 tests/peft_test.sh diff --git a/CMakeLists.txt b/CMakeLists.txt index 43ce4f7044..22770b6c28 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -558,6 +558,7 @@ if(NOT BUILD_LEGION_ONLY) if(FF_BUILD_ALL_INFERENCE_EXAMPLES OR FF_BUILD_ALL_EXAMPLES) add_subdirectory(inference/spec_infer) add_subdirectory(inference/incr_decoding) + add_subdirectory(inference/peft) endif() diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc index e7d4cf16fb..177a4dd156 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -40,7 +40,6 @@ void parse_input_args(char **argv, int argc, FilePaths &paths, std::string &llm_model_name, - std::string &peft_model_name, bool &use_full_precision, bool &verbose, bool &do_sample, @@ -59,17 +58,6 @@ void parse_input_args(char **argv, } continue; } - if (!strcmp(argv[i], "-enable-peft")) { - enable_peft = true; - continue; - } - if (!strcmp(argv[i], "-peft-model")) { - peft_model_name = std::string(argv[++i]); - for (char &c : peft_model_name) { - c = std::tolower(c); - } - continue; - } // cache folder if (!strcmp(argv[i], "-cache-folder")) { paths.cache_folder_path = std::string(argv[++i]); @@ -138,7 +126,7 @@ void FlexFlow::top_level_task(Task const *task, assert(false && "Doesn't support quantization in non-offload mode"); } FilePaths file_paths; - std::string llm_model_name, peft_model_name; + std::string llm_model_name; bool use_full_precision = false; bool verbose = false; bool do_sample = false; @@ -156,7 +144,6 @@ void FlexFlow::top_level_task(Task const *task, argc, file_paths, llm_model_name, - peft_model_name, use_full_precision, verbose, do_sample, @@ -166,6 +153,7 @@ void FlexFlow::top_level_task(Task const *task, max_requests_per_batch, max_tokens_per_batch, max_sequence_length); + assert(ffconfig.data_parallelism_degree * ffconfig.tensor_parallelism_degree * ffconfig.pipeline_parallelism_degree == ffconfig.numNodes * ffconfig.workersPerNode); @@ -280,13 +268,6 @@ void FlexFlow::top_level_task(Task const *task, assert(false && "unknow model type"); } - // Add PEFT layer - PEFTModelID peft_model_id = PEFTModelID::NO_ID; - if (!peft_model_name.empty()) { - peft_model_id = model.add_lora_layer(peft_config); - } - - // Start background server rm->start_background_server(&model); int total_num_requests = 0; @@ -303,20 +284,10 @@ void FlexFlow::top_level_task(Task const *task, for (auto &prompt : prompt_json) { std::string text = prompt.get(); printf("Prompt[%d]: %s\n", total_num_requests, text.c_str()); - // Add inference request - // Request inference_req; - // inference_req.prompt = text; - // inference_req.max_sequence_length = 128; - // inference_req.peft_model_id = peft_model_id; - // requests.push_back(inference_req); - // total_num_requests++; - // Add fine-tuning request - Request fine_tuning_req; - fine_tuning_req.req_type = 
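// With this patch the incr_decoding driver goes back to issuing plain
// inference requests (prompt + max_sequence_length); the fine-tuning flow
// being deleted below, with REQ_FINETUNING, peft_model_id and dataset_text, is
// meant to live in the dedicated inference/peft/peft.cc driver that this
// commit adds.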
Request::RequestType::REQ_FINETUNING; - fine_tuning_req.max_sequence_length = 128; - fine_tuning_req.peft_model_id = peft_model_id; - fine_tuning_req.dataset_text.push_back(std::make_pair(text, "")); - requests.push_back(fine_tuning_req); + Request inference_req; + inference_req.prompt = text; + inference_req.max_sequence_length = 128; + requests.push_back(inference_req); total_num_requests++; } std::vector result = model.generate(requests); diff --git a/inference/peft/CMakeLists.txt b/inference/peft/CMakeLists.txt new file mode 100644 index 0000000000..4547907176 --- /dev/null +++ b/inference/peft/CMakeLists.txt @@ -0,0 +1,38 @@ +cmake_minimum_required(VERSION 3.10) + +project(FlexFlow_Peft) +set(project_target peft) + + +set(CPU_SRC + ${FLEXFLOW_CPP_DRV_SRC} + peft.cc + ../models/llama.cc + ../models/opt.cc + ../models/falcon.cc + ../models/starcoder.cc + ../models/mpt.cc) + +if (FF_GPU_BACKEND STREQUAL "cuda" OR FF_GPU_BACKEND STREQUAL "hip_cuda") + cuda_add_executable(${project_target} ${CPU_SRC}) + if (FF_GPU_BACKEND STREQUAL "hip_cuda") + target_compile_definitions(${project_target} PRIVATE __HIP_PLATFORM_NVIDIA__) + endif() +elseif(FF_GPU_BACKEND STREQUAL "hip_rocm") + set_source_files_properties(${CPU_SRC} PROPERTIES LANGUAGE HIP) + hip_add_executable(${project_target} ${CPU_SRC}) + if (FF_HIP_ARCH STREQUAL "") + message(FATAL_ERROR "FF_HIP_ARCH is empty!") + endif() + set_property(TARGET ${project_target} PROPERTY HIP_ARCHITECTURES "${FF_HIP_ARCH}") + target_compile_definitions(${project_target} PRIVATE __HIP_PLATFORM_AMD__) +else() + message(FATAL_ERROR "Compilation of ${project_target} for ${FF_GPU_BACKEND} backend not yet supported") +endif() + +target_include_directories(${project_target} PRIVATE ${FLEXFLOW_INCLUDE_DIRS} ${CMAKE_INSTALL_INCLUDEDIR}) +target_include_directories(${project_target} PRIVATE ${CMAKE_SOURCE_DIR}/inference) +target_link_libraries(${project_target} -Wl,--whole-archive flexflow -Wl,--no-whole-archive ${FLEXFLOW_EXT_LIBRARIES}) + +set(BIN_DEST "bin") +install(TARGETS ${project_target} DESTINATION ${BIN_DEST}) diff --git a/inference/peft/Makefile b/inference/peft/Makefile new file mode 100644 index 0000000000..0e4b79f51f --- /dev/null +++ b/inference/peft/Makefile @@ -0,0 +1,37 @@ +# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +# Flags for directing the runtime makefile what to include +DEBUG ?= 0 # Include debugging symbols +MAX_DIM ?= 4 # Maximum number of dimensions +OUTPUT_LEVEL ?= LEVEL_DEBUG # Compile time logging level +USE_CUDA ?= 1 # Include CUDA support (requires CUDA) +USE_GASNET ?= 0 # Include GASNet support (requires GASNet) +USE_HDF ?= 1 # Include HDF5 support (requires HDF5) +ALT_MAPPERS ?= 0 # Include alternative mappers (not recommended) + +# Put the binary file name here +OUTFILE ?= llama_pipeline +# List all the application source files here +ifndef CUDA_HOME +CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc | head -1)) +endif + + +ifndef FF_HOME +$(error FF_HOME variable is not defined, aborting build) +endif + +include $(FF_HOME)/FlexFlow.mk diff --git a/inference/peft/peft.cc b/inference/peft/peft.cc new file mode 100644 index 0000000000..d376c3e39c --- /dev/null +++ b/inference/peft/peft.cc @@ -0,0 +1,325 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "flexflow/inference.h" +#include "flexflow/request_manager.h" +#include "models/falcon.h" +#include "models/llama.h" +#include "models/mpt.h" +#include "models/opt.h" +#include "models/starcoder.h" +#include + +#include + +using namespace FlexFlow; +using namespace Legion; +using json = nlohmann::json; + +LegionRuntime::Logger::Category log_app("llama"); + +struct FilePaths { + std::string cache_folder_path; + std::string prompt_file_path; + std::string output_file_path; +}; + +void parse_input_args(char **argv, + int argc, + FilePaths &paths, + std::string &llm_model_name, + std::string &peft_model_name, + bool &use_full_precision, + bool &verbose, + bool &do_sample, + float &temperature, + float &topp, + int &max_requests_per_batch, + int &max_tokens_per_batch, + int &max_sequence_length) { + for (int i = 1; i < argc; i++) { + // llm model type + if (!strcmp(argv[i], "-llm-model")) { + llm_model_name = std::string(argv[++i]); + for (char &c : llm_model_name) { + c = std::tolower(c); + } + continue; + } + if (!strcmp(argv[i], "-peft-model")) { + peft_model_name = std::string(argv[++i]); + for (char &c : peft_model_name) { + c = std::tolower(c); + } + continue; + } + // cache folder + if (!strcmp(argv[i], "-cache-folder")) { + paths.cache_folder_path = std::string(argv[++i]); + continue; + } + // prompts + if (!strcmp(argv[i], "-prompt")) { + paths.prompt_file_path = std::string(argv[++i]); + continue; + } + // output file + if (!strcmp(argv[i], "-output-file")) { + paths.output_file_path = std::string(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--use-full-precision")) { + use_full_precision = true; + continue; + } + // verbose logging to stdout + if (!strcmp(argv[i], "--verbose")) { + verbose = true; + continue; + } + if (!strcmp(argv[i], "--do-sample")) { + do_sample = true; + continue; + } + if (!strcmp(argv[i], "--temperature")) { + temperature = std::stof(argv[++i]); + continue; + } + if (!strcmp(argv[i], 
"--topp")) { + topp = std::stof(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-requests-per-batch")) { + max_requests_per_batch = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-tokens-per-batch")) { + max_tokens_per_batch = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-sequence-length")) { + max_sequence_length = std::stoi(argv[++i]); + continue; + } + } + if (paths.cache_folder_path.empty()) { + paths.cache_folder_path = "~/.cache/flexflow"; + } + // Expand ~ to the home directory if needed + wordexp_t p; + wordexp(paths.cache_folder_path.c_str(), &p, 0); + paths.cache_folder_path = p.we_wordv[0]; + wordfree(&p); +} + +void FlexFlow::top_level_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + FFConfig ffconfig; + if (ffconfig.cpu_offload == false && ffconfig.quantization_type != DT_NONE) { + assert(false && "Doesn't support quantization in non-offload mode"); + } + FilePaths file_paths; + std::string llm_model_name, peft_model_name; + bool use_full_precision = false; + bool verbose = false; + bool do_sample = false; + float temperature = 0.0f; + float topp = 0.0f; + int max_requests_per_batch = 8; + int max_tokens_per_batch = 128; + int max_sequence_length = 256; + + InputArgs const &command_args = HighLevelRuntime::get_input_args(); + char **argv = command_args.argv; + int argc = command_args.argc; + parse_input_args(argv, + argc, + file_paths, + llm_model_name, + peft_model_name, + use_full_precision, + verbose, + do_sample, + temperature, + topp, + max_requests_per_batch, + max_tokens_per_batch, + max_sequence_length); + assert(ffconfig.data_parallelism_degree * ffconfig.tensor_parallelism_degree * + ffconfig.pipeline_parallelism_degree == + ffconfig.numNodes * ffconfig.workersPerNode); + + std::string config_filepath = join_path( + {file_paths.cache_folder_path, "configs", llm_model_name, "config.json"}); + std::string tokenizer_filepath = + join_path({file_paths.cache_folder_path, "tokenizers", llm_model_name}); + std::string weights_filepath = + join_path({file_paths.cache_folder_path, + "weights", + llm_model_name, + use_full_precision ? "full-precision" : "half-precision"}); + std::ifstream config_file_handle(config_filepath); + if (!config_file_handle.good()) { + std::cout << "Model config file " << config_filepath << " not found." + << std::endl; + assert(false); + } + json model_config = json::parse(config_file_handle, + /*parser_callback_t */ nullptr, + /*allow_exceptions */ true, + /*ignore_comments */ true); + ModelType model_type = ModelType::UNKNOWN; + auto architectures = model_config["architectures"]; + for (auto const &str : architectures) { + if (str == "LlamaForCausalLM" || str == "LLaMAForCausalLM") { + model_type = ModelType::LLAMA; + break; + } else if (str == "OPTForCausalLM") { + model_type = ModelType::OPT; + break; + } else if (str == "RWForCausalLM" || str == "FalconForCausalLM") { + model_type = ModelType::FALCON; + break; + } else if (str == "GPTBigCodeForCausalLM") { + model_type = ModelType::STARCODER; + break; + } else if (str == "MPTForCausalLM") { + model_type = ModelType::MPT; + break; + } + } + int bos_token_id = model_config.find("bos_token_id") == model_config.end() + ? -1 + : (int)model_config.at("bos_token_id"); + int eos_token_id = model_config.find("eos_token_id") == model_config.end() + ? 
-1 + : (int)model_config.at("eos_token_id"); + + assert(model_type != ModelType::UNKNOWN && + "Invalid LLM model type passed (or no type was passed)."); + + GenerationConfig generationConfig(do_sample, temperature, topp); + RequestManager *rm = RequestManager::get_request_manager(); + rm->set_max_requests_per_batch(max_requests_per_batch); + rm->set_max_tokens_per_batch(max_tokens_per_batch); + rm->set_max_sequence_length(max_sequence_length); + rm->register_tokenizer( + model_type, bos_token_id, eos_token_id, tokenizer_filepath); + rm->register_output_filepath(file_paths.output_file_path); + + FFModel model(ffconfig, ffconfig.cpu_offload); + if (model_type == ModelType::LLAMA) { + LLAMA::create_llama_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + generationConfig, + use_full_precision); + } else if (model_type == ModelType::OPT) { + OPT::create_opt_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + use_full_precision); + } else if (model_type == ModelType::FALCON) { + FALCON::create_falcon_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + use_full_precision); + } else if (model_type == ModelType::STARCODER) { + STARCODER::create_starcoder_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + generationConfig, + use_full_precision); + } else if (model_type == ModelType::MPT) { + MPT::create_mpt_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + generationConfig, + use_full_precision); + } else { + assert(false && "unknow model type"); + } + + // Register PEFT layer + LoraLinearConfig mlp_second = + peft_model_name.empty() + ? LoraLinearConfig::DefaultConfig + : LoraLinearConfig(file_paths.cache_folder_path, peft_model_name); + PEFTModelID peft_model_id = + peft_model_name.empty() + ? 
PEFTModelID::NO_ID + : model.register_peft_model( + LoraLinearConfig::DefaultConfig /*mlp_first*/, + mlp_second /*mlp_second*/); + + // Start background server + rm->start_background_server(&model); + + int total_num_requests = 0; + { + using json = nlohmann::json; + std::ifstream file_handle(file_paths.prompt_file_path); + assert(file_handle.good() && "Prompt file does not exist."); + json prompt_json = json::parse(file_handle, + /*parser_callback_t */ nullptr, + /*allow_exceptions */ true, + /*ignore_comments */ true); + + std::vector requests; + for (auto &prompt : prompt_json) { + std::string text = prompt.get(); + printf("Prompt[%d]: %s\n", total_num_requests, text.c_str()); + // Add inference request + // Request inference_req; + // inference_req.prompt = text; + // inference_req.max_sequence_length = 128; + // inference_req.peft_model_id = peft_model_id; + // requests.push_back(inference_req); + // total_num_requests++; + // Add fine-tuning request + Request fine_tuning_req; + fine_tuning_req.req_type = Request::RequestType::REQ_FINETUNING; + fine_tuning_req.max_sequence_length = 128; + fine_tuning_req.peft_model_id = peft_model_id; + fine_tuning_req.dataset_text.push_back(std::make_pair(text, "")); + requests.push_back(fine_tuning_req); + total_num_requests++; + } + std::vector result = model.generate(requests); + } + + // terminate the request manager by stopping the background thread + rm->terminate_background_server(); + + // Execution fence + { + Future future = runtime->issue_execution_fence(ctx); + future.get_void_result(); + } + + // float* data + std::cout << "----------inference finished--------------" << std::endl; + + // free tokenizer space in memory +} + +void FlexFlow::register_custom_tasks() {} diff --git a/tests/peft_test.sh b/tests/peft_test.sh new file mode 100755 index 0000000000..8f6d53725b --- /dev/null +++ b/tests/peft_test.sh @@ -0,0 +1,28 @@ +#! /usr/bin/env bash +set -x +set -e + +# Cd into directory holding this script +cd "${BASH_SOURCE[0]%/*}" + +# Token to access private huggingface models (e.g. 
LLAMA-2) +HUGGINGFACE_TOKEN=${HUGGINGFACE_TOKEN:-none} +if [[ "$HUGGINGFACE_TOKEN" != "none" ]]; then + huggingface-cli login --token "$HUGGINGFACE_TOKEN" +fi + +# Create test prompt file +mkdir -p ../inference/prompt +echo '["Two things are infinite: "]' > ../inference/prompt/peft.json + +# Create output folder +mkdir -p ../inference/output + +# Enable backtrace in case we run into a segfault or assertion failure +export LEGION_BACKTRACE=1 + +# Download test model +python ../inference/utils/download_peft_model.py goliaro/llama-160m-lora-full --base_model_name JackFram/llama-160m +# if first time, add: --refresh-cache + +./inference/peft/peft -ll:gpu 1 -ll:cpu 4 -ll:fsize 8192 -ll:zsize 12000 -ll:util 4 -llm-model JackFram/llama-160m -prompt ../inference/prompt/peft.json -peft-model goliaro/llama-160m-lora-full --use-full-precision --inference-debugging --fusion -enable-peft From acef0067ade7d50e144cceabcd17caf534133622 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 22 Mar 2024 20:36:19 +0000 Subject: [PATCH 21/32] fix --- inference/incr_decoding/incr_decoding.cc | 17 ----------- inference/peft/peft.cc | 37 +++++++++++++++++------- tests/peft_test.sh | 2 +- 3 files changed, 27 insertions(+), 29 deletions(-) diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc index 177a4dd156..c3993b1ad4 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -43,7 +43,6 @@ void parse_input_args(char **argv, bool &use_full_precision, bool &verbose, bool &do_sample, - bool &enable_peft, float &temperature, float &topp, int &max_requests_per_batch, @@ -130,7 +129,6 @@ void FlexFlow::top_level_task(Task const *task, bool use_full_precision = false; bool verbose = false; bool do_sample = false; - bool enable_peft = false; float temperature = 0.0f; float topp = 0.0f; int max_requests_per_batch = 8; @@ -147,7 +145,6 @@ void FlexFlow::top_level_task(Task const *task, use_full_precision, verbose, do_sample, - enable_peft, temperature, topp, max_requests_per_batch, @@ -173,14 +170,6 @@ void FlexFlow::top_level_task(Task const *task, << std::endl; assert(false); } - if (enable_peft && peft_model_name.empty()) { - std::cout << "PEFT enabled, but no PEFT model id passed" << std::endl; - assert(false); - } else if (!enable_peft && !peft_model_name.empty()) { - std::cout << "PEFT model id passed, but PEFT is not enabled" << std::endl; - assert(false); - } - json model_config = json::parse(config_file_handle, /*parser_callback_t */ nullptr, /*allow_exceptions */ true, @@ -215,12 +204,6 @@ void FlexFlow::top_level_task(Task const *task, assert(model_type != ModelType::UNKNOWN && "Invalid LLM model type passed (or no type was passed)."); - // load PEFT config - LoraLinearConfig peft_config = - peft_model_name.empty() - ? 
LoraLinearConfig::EmptyConfig - : LoraLinearConfig(file_paths.cache_folder_path, peft_model_name); - GenerationConfig generationConfig(do_sample, temperature, topp); RequestManager *rm = RequestManager::get_request_manager(); rm->set_max_requests_per_batch(max_requests_per_batch); diff --git a/inference/peft/peft.cc b/inference/peft/peft.cc index d376c3e39c..e7d4cf16fb 100644 --- a/inference/peft/peft.cc +++ b/inference/peft/peft.cc @@ -44,6 +44,7 @@ void parse_input_args(char **argv, bool &use_full_precision, bool &verbose, bool &do_sample, + bool &enable_peft, float &temperature, float &topp, int &max_requests_per_batch, @@ -58,6 +59,10 @@ void parse_input_args(char **argv, } continue; } + if (!strcmp(argv[i], "-enable-peft")) { + enable_peft = true; + continue; + } if (!strcmp(argv[i], "-peft-model")) { peft_model_name = std::string(argv[++i]); for (char &c : peft_model_name) { @@ -137,6 +142,7 @@ void FlexFlow::top_level_task(Task const *task, bool use_full_precision = false; bool verbose = false; bool do_sample = false; + bool enable_peft = false; float temperature = 0.0f; float topp = 0.0f; int max_requests_per_batch = 8; @@ -154,6 +160,7 @@ void FlexFlow::top_level_task(Task const *task, use_full_precision, verbose, do_sample, + enable_peft, temperature, topp, max_requests_per_batch, @@ -178,6 +185,14 @@ void FlexFlow::top_level_task(Task const *task, << std::endl; assert(false); } + if (enable_peft && peft_model_name.empty()) { + std::cout << "PEFT enabled, but no PEFT model id passed" << std::endl; + assert(false); + } else if (!enable_peft && !peft_model_name.empty()) { + std::cout << "PEFT model id passed, but PEFT is not enabled" << std::endl; + assert(false); + } + json model_config = json::parse(config_file_handle, /*parser_callback_t */ nullptr, /*allow_exceptions */ true, @@ -212,6 +227,12 @@ void FlexFlow::top_level_task(Task const *task, assert(model_type != ModelType::UNKNOWN && "Invalid LLM model type passed (or no type was passed)."); + // load PEFT config + LoraLinearConfig peft_config = + peft_model_name.empty() + ? LoraLinearConfig::EmptyConfig + : LoraLinearConfig(file_paths.cache_folder_path, peft_model_name); + GenerationConfig generationConfig(do_sample, temperature, topp); RequestManager *rm = RequestManager::get_request_manager(); rm->set_max_requests_per_batch(max_requests_per_batch); @@ -259,17 +280,11 @@ void FlexFlow::top_level_task(Task const *task, assert(false && "unknow model type"); } - // Register PEFT layer - LoraLinearConfig mlp_second = - peft_model_name.empty() - ? LoraLinearConfig::DefaultConfig - : LoraLinearConfig(file_paths.cache_folder_path, peft_model_name); - PEFTModelID peft_model_id = - peft_model_name.empty() - ? 
PEFTModelID::NO_ID - : model.register_peft_model( - LoraLinearConfig::DefaultConfig /*mlp_first*/, - mlp_second /*mlp_second*/); + // Add PEFT layer + PEFTModelID peft_model_id = PEFTModelID::NO_ID; + if (!peft_model_name.empty()) { + peft_model_id = model.add_lora_layer(peft_config); + } // Start background server rm->start_background_server(&model); diff --git a/tests/peft_test.sh b/tests/peft_test.sh index 8f6d53725b..778b225a26 100755 --- a/tests/peft_test.sh +++ b/tests/peft_test.sh @@ -25,4 +25,4 @@ export LEGION_BACKTRACE=1 python ../inference/utils/download_peft_model.py goliaro/llama-160m-lora-full --base_model_name JackFram/llama-160m # if first time, add: --refresh-cache -./inference/peft/peft -ll:gpu 1 -ll:cpu 4 -ll:fsize 8192 -ll:zsize 12000 -ll:util 4 -llm-model JackFram/llama-160m -prompt ../inference/prompt/peft.json -peft-model goliaro/llama-160m-lora-full --use-full-precision --inference-debugging --fusion -enable-peft +../build/inference/peft/peft -ll:gpu 1 -ll:cpu 4 -ll:fsize 8192 -ll:zsize 12000 -ll:util 4 -llm-model JackFram/llama-160m -prompt ../inference/prompt/peft.json -peft-model goliaro/llama-160m-lora-full --use-full-precision --inference-debugging --fusion -enable-peft From 660bf732f068d1427ceab666a8b2b7fada399d20 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 22 Mar 2024 22:24:35 +0000 Subject: [PATCH 22/32] fixes --- src/ops/inc_multihead_self_attention.cu | 3 ++- src/runtime/request_manager.cc | 7 ++++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index 83fdbaf927..8b0776fde4 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -1488,7 +1488,8 @@ void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta *m, assert(m->qProjSize == m->kProjSize); for (int i = 0; i < bc->max_requests_per_batch(); i++) { - if (bc->request_completed[i] || (!bc->requestsInfo[i].prompt_phase)) { + if (bc->request_completed[i] || + (!bc->requestsInfo[i].prompt_phase && !bc->requestsInfo[i].peft_bwd)) { continue; } int num_new_tokens = bc->requestsInfo[i].num_tokens_in_batch; diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 31742bd826..03157bcbbe 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -2410,7 +2410,12 @@ std::vector RequestManager *rm = RequestManager::get_request_manager(); std::vector guids; for (int i = 0; i < requests.size(); i++) { - RequestManager::RequestGuid guid = rm->register_new_request(requests.at(i)); + RequestManager::RequestGuid guid; + if (requests.at(i).req_type == Request::REQ_INFERENCE) { + guid = rm->register_new_request(requests.at(i)); + } else { + guid = rm->register_new_peft_request(requests.at(i)); + } if (guid != RequestManager::INVALID_GUID) { guids.push_back(guid); } From 02985cef3af5ba5e55ee2f02e859b70c71f1569e Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sat, 23 Mar 2024 02:51:53 +0000 Subject: [PATCH 23/32] fix --- src/runtime/request_manager.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 03157bcbbe..c335cd246b 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -412,6 +412,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, if (request.completed_training_steps == request.max_training_steps) { // check if the fine tuning request has completed request.status = 
Request::COMPLETED; + trigger_request_completion_future(request.guid); log_req_mgr.print("[Done] guid(%zu) completed_training_steps(%d)", old_bc.requestsInfo[i].request_guid, request.completed_training_steps); From 084732e9e4390d251fca54273d8e6c4f4c52684f Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sat, 23 Mar 2024 23:26:37 +0000 Subject: [PATCH 24/32] update peft python interface --- include/flexflow/flexflow_c.h | 22 ++ include/flexflow/model.h | 2 +- inference/peft/peft.cc | 9 +- inference/utils/download_peft_model.py | 33 +- python/flexflow/core/flexflow_cffi.py | 34 ++ python/flexflow/serve/serve.py | 484 +++++++------------------ src/c/flexflow_c.cc | 64 ++++ src/ops/lora_linear.cc | 12 +- 8 files changed, 284 insertions(+), 376 deletions(-) diff --git a/include/flexflow/flexflow_c.h b/include/flexflow/flexflow_c.h index b7b20f2d2f..1ceea59839 100644 --- a/include/flexflow/flexflow_c.h +++ b/include/flexflow/flexflow_c.h @@ -55,6 +55,8 @@ FF_NEW_OPAQUE_TYPE(flexflow_inference_manager_t); FF_NEW_OPAQUE_TYPE(flexflow_request_manager_t); FF_NEW_OPAQUE_TYPE(flexflow_file_data_loader_t); FF_NEW_OPAQUE_TYPE(flexflow_generation_result_t); +FF_NEW_OPAQUE_TYPE(flexflow_lora_linear_config_t); +FF_NEW_OPAQUE_TYPE(flexflow_peft_model_id_t); // ----------------------------------------------------------------------- // FFConfig @@ -1036,6 +1038,26 @@ void flexflow_file_data_loader_destroy(flexflow_file_data_loader_t handle_); void flexflow_file_data_loader_load_weights(flexflow_file_data_loader_t handle_, flexflow_model_t model_handle_); +// ----------------------------------------------------------------------- +// LoraLinearConfig +// ----------------------------------------------------------------------- + +flexflow_lora_linear_config_t + flexflow_lora_linear_config_create(char const *cache_folder_, + char const *peft_model_id_); + +void flexflow_lora_linear_config_destroy(flexflow_lora_linear_config_t handle_); + +// ----------------------------------------------------------------------- +// PEFTModelID +// ----------------------------------------------------------------------- + +flexflow_peft_model_id_t flexflow_peft_model_id_create(); + +flexflow_peft_model_id_t flexflow_peft_model_id_create_id(unsigned long id); + +void flexflow_peft_model_id_destroy(flexflow_peft_model_id_t handle_); + #ifdef __cplusplus } #endif diff --git a/include/flexflow/model.h b/include/flexflow/model.h index 74421ffc92..099e2209e4 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -837,7 +837,7 @@ class FFModel { // ======================================== // PEFT Layers // ======================================== - PEFTModelID add_lora_layer(LoraLinearConfig const peft_config); + PEFTModelID *add_lora_layer(LoraLinearConfig const peft_config); // ======================================== // Inference APIs // ======================================== diff --git a/inference/peft/peft.cc b/inference/peft/peft.cc index e7d4cf16fb..aa5581ca87 100644 --- a/inference/peft/peft.cc +++ b/inference/peft/peft.cc @@ -281,7 +281,7 @@ void FlexFlow::top_level_task(Task const *task, } // Add PEFT layer - PEFTModelID peft_model_id = PEFTModelID::NO_ID; + PEFTModelID *peft_model_id = nullptr; if (!peft_model_name.empty()) { peft_model_id = model.add_lora_layer(peft_config); } @@ -314,7 +314,8 @@ void FlexFlow::top_level_task(Task const *task, Request fine_tuning_req; fine_tuning_req.req_type = Request::RequestType::REQ_FINETUNING; fine_tuning_req.max_sequence_length = 128; - 
fine_tuning_req.peft_model_id = peft_model_id; + fine_tuning_req.peft_model_id = + (peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID; fine_tuning_req.dataset_text.push_back(std::make_pair(text, "")); requests.push_back(fine_tuning_req); total_num_requests++; @@ -331,6 +332,10 @@ void FlexFlow::top_level_task(Task const *task, future.get_void_result(); } + if (peft_model_id != nullptr) { + free(peft_model_id); + } + // float* data std::cout << "----------inference finished--------------" << std::endl; diff --git a/inference/utils/download_peft_model.py b/inference/utils/download_peft_model.py index bc2ba59b30..ad79816f84 100644 --- a/inference/utils/download_peft_model.py +++ b/inference/utils/download_peft_model.py @@ -9,7 +9,7 @@ def parse_args(): "--base_model_name", type=str, help="Name of the model to download" ) parser.add_argument( - "peft_model_ids", type=str, nargs="+", help="Name of the model(s) to download" + "peft_model_ids", type=str, nargs="+", help="Name of the PEFT model(s) to download" ) parser.add_argument( "--cache-folder", @@ -45,24 +45,19 @@ def main(args): else: data_types = (ff.DataType.DT_FLOAT, ff.DataType.DT_HALF) - for peft_model_id in args.peft_model_ids: - for data_type in data_types: - llm = ff.LLM( - args.base_model_name, - data_type=data_type, - cache_path=args.cache_folder, - refresh_cache=args.refresh_cache, - ) - peft = ff.PEFT( - llm, - peft_model_id, - data_type=data_type, - cache_path=args.cache_folder, - refresh_cache=args.refresh_cache, - ) - peft.download_hf_weights_if_needed() - peft.download_hf_config() - peft.download_hf_tokenizer_if_needed() + + for data_type in data_types: + llm = ff.LLM( + args.base_model_name, + data_type=data_type, + cache_path=args.cache_folder, + refresh_cache=args.refresh_cache, + ) + for peft_model_id in args.peft_model_ids: + llm.add_peft(peft_model_id) + llm.download_hf_weights_if_needed() + llm.download_hf_config() + llm.download_hf_tokenizer_if_needed() if __name__ == "__main__": diff --git a/python/flexflow/core/flexflow_cffi.py b/python/flexflow/core/flexflow_cffi.py index b92a0a92af..ef0ee0e378 100644 --- a/python/flexflow/core/flexflow_cffi.py +++ b/python/flexflow/core/flexflow_cffi.py @@ -4287,3 +4287,37 @@ def load_weights(self, model): ffc().flexflow_file_data_loader_load_weights( self.handle, model.handle ) + +# ----------------------------------------------------------------------- +# LoraLinearConfig +# ----------------------------------------------------------------------- + +class LoraLinearConfig(object): + __slots__ = ["handle", "_handle"] + + def __init__( + self, + cache_folder, + peft_model_id, + ): + c_cache_folder = get_c_name(cache_folder) + peft_model_id = get_c_name(peft_model_id) + self.handle = ffc().flexflow_lora_linear_config_create( + c_cache_folder, + peft_model_id, + ) + self._handle = ffi.gc(self.handle, ffc().flexflow_lora_linear_config_destroy) + +# ----------------------------------------------------------------------- +# PEFTModelID +# ----------------------------------------------------------------------- + +class PEFTModelID(object): + __slots__ = ["handle", "_handle"] + + def __init__(self, id=None): + if id is None: + self.handle = ffc().flexflow_peft_model_id_create() + else: + self.handle = ffc().flexflow_peft_model_id_create_id(id) + self._handle = ffi.gc(self.handle, ffc().flexflow_peft_model_id_destroy) diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py index 9997527f0d..bc7a796315 100644 --- a/python/flexflow/serve/serve.py +++ 
b/python/flexflow/serve/serve.py @@ -137,30 +137,61 @@ def __init__( self.refresh_cache = refresh_cache self.output_file = output_file self.rm = None + self.pefts = [] def __del__(self): # Stop the background server before deleting the object if type(self) == LLM and self.rm is not None: self.rm.stop_server() + def add_peft(self, peft_model_id: str): + """Add a previously created PEFT adapter to the LLM. The PEFT model should already exist locally or be available on HuggingFace""" + peft_config = PeftConfig.from_pretrained(peft_model_id) + peft_type = peft_config.peft_type + if peft_type != "LORA": + raise RuntimeError(f"PEFT type {peft_type} not yet supported in FlexFlow") + if "base_model_name_or_path" not in peft_config.to_dict(): + raise ValueError( + f"PEFT model {peft_model_id} does not have an associated base model" + ) + if peft_config.base_model_name_or_path != self.model_name: + raise RuntimeError(f"Attempting to add PEFT with base model name {peft_config.base_model_name_or_path} to LLM {self.model_name}") + ff_peft_config = LoraLinearConfig(self.cache_path, peft_model_id) + peft_dict = { + "peft_config": peft_config, + "peft_type": peft_type, + "ff_peft_config": ff_peft_config, + } + self.pefts[peft_model_id] = peft_dict + def download_hf_config(self): """Save the HuggingFace model configs to a json file. Useful mainly to run the C++ inference code.""" - self.config_dir = os.path.join( + config_dir = os.path.join( os.path.expanduser(self.cache_path), "configs", self.model_name.lower() ) - self.config_path = os.path.join(self.config_dir, "config.json") - os.makedirs(self.config_dir, exist_ok=True) - print(f"Creating directory {self.config_dir} (if it doesn't exist)...") - print(f"Saving {self.model_name} configs to file {self.config_path}...") - self.hf_config.to_json_file(self.config_path) - - def __get_revision_hashes(self, model_name: str, weights: bool): + config_path = os.path.join(config_dir, "config.json") + os.makedirs(config_dir, exist_ok=True) + print(f"Creating directory {config_dir} (if it doesn't exist)...") + print(f"Saving {self.model_name} configs to file {config_path}...") + self.hf_config.to_json_file(config_path) + + # Save PEFT configs if the LLM has any registered PEFTs + for peft_model_id, peft_dict in self.pefts.items(): + peft_config = peft_dict["hf_config"] + peft_config_path = os.path.join(os.path.expanduser(self.cache_path), "configs", self.peft_model_id.lower()) + print(f"Saving {peft_model_id} configs to file {peft_config_path}...") + with open(peft_config_path, "w") as json_file: + class SetEncoder(json.JSONEncoder): + def default(self, obj): + if isinstance(obj, set): + return list(obj) + return super().default(obj) + json.dump(peft_config.to_dict(), json_file, indent=2, cls=SetEncoder) + + def __get_revision_hashes(self, model_name: str, folder: str): ff_revision = None - ff_revision_file = ( - os.path.join(self.weights_path, "rev_sha.txt") - if weights - else os.path.join(self.tokenizer_path, "rev_sha.txt") - ) + ff_revision_file = os.path.join(folder, "rev_sha.txt") + if os.path.exists(ff_revision_file): ff_revision = "".join(open(ff_revision_file).read().split()) @@ -180,46 +211,31 @@ def __get_revision_hashes(self, model_name: str, weights: bool): def download_hf_weights_if_needed(self): """Check in the folder specified by the cache_path whether the LLM's model weights are available and up to date. If not, or if the refresh_cache parameter is set to True, download new weights. 
+ + If any PEFT adapter is registered, perform the same operation for PEFT. """ - # Use local cache, or download new version - self.weights_path = os.path.join( - os.path.expanduser(self.cache_path), - "weights", - self.model_name.lower(), - ( - "full-precision" - if self.data_type == DataType.DT_FLOAT - else "half-precision" - ), - ) - if self.refresh_cache: - print( - f"Refreshing weights in cache for model {self.model_name} at path {self.weights_path} ..." + def get_weights_path(model_name): + return os.path.join(os.path.expanduser(self.cache_path), "weights", model_name.lower(), + ( + "full-precision" + if self.data_type == DataType.DT_FLOAT + else "half-precision" + ), ) - if os.path.exists(self.weights_path): - shutil.rmtree(self.weights_path) - os.makedirs(self.weights_path, exist_ok=True) - #print(f"Creating directory {self.weights_path} (if it doesn't exist)...") - - ff_revision, ff_revision_file, latest_revision = self.__get_revision_hashes( - self.model_name, weights=True - ) - # Download if needed - if ff_revision != latest_revision: - if not os.path.exists(self.model_name) or os.path.isdir(self.model_name): - # Local model + def refresh_cache_if_needed(model_name): + weights_path = get_weights_path(model_name) + if self.refresh_cache: print( - f"'{self.model_name}' model weights not found in cache or outdated. Downloading from huggingface.co ..." + f"Refreshing weights in cache for model {model_name} at path {weights_path} ..." ) - else: - # Remote model - print( - f"'{self.model_name}' local model weights were updated! Converting new weights now..." - ) - # Download model from HuggingFace, or load it from the local folder - hf_model = AutoModelForCausalLM.from_pretrained( - self.model_name, + if os.path.exists(weights_path): + shutil.rmtree(weights_path) + os.makedirs(weights_path, exist_ok=True) + + def get_hf_llm(model_name): + return AutoModelForCausalLM.from_pretrained( + model_name, trust_remote_code=True, torch_dtype=( torch.float32 @@ -227,21 +243,61 @@ def download_hf_weights_if_needed(self): else torch.float16 ), ) - # Print log message to notify user download of model has finished - if not os.path.exists(self.model_name) or os.path.isdir(self.model_name): - print("Done downloading HF weights. Converting them now...") - # Convert the model to FlexFlow format - self.model_class.convert_hf_model(hf_model, self.weights_path) - # Save new revision hash to file - with open(ff_revision_file, "w+") as f: - f.write(latest_revision) - print("Done converting the weights...") - # Deallocate hf model - del hf_model - gc.collect() - torch.cuda.empty_cache() - else: - print(f"Loading '{self.model_name}' model weights from the cache...") + + def download_llm_weights(): + weights_path = get_weights_path(self.model_name) + refresh_cache_if_needed(self.model_name) + ff_revision, ff_revision_file, latest_revision = self.__get_revision_hashes(self.model_name, weights_path) + if ff_revision != latest_revision: + print(f"'{self.model_name}' local model weights need updating! 
Downloading/converting new weights now...") + hf_model = get_hf_llm(self.model_name) + # Convert the model to FlexFlow format + self.model_class.convert_hf_model(hf_model, weights_path) + # Save new revision hash to file + with open(ff_revision_file, "w+") as f: + f.write(latest_revision) + print(f"Done converting the weights for model {self.model_name}") + # Deallocate hf model + del hf_model + gc.collect() + torch.cuda.empty_cache() + + def convert_peft_model(hf_peft_model, peft_type, weights_path): + for name, params in hf_peft_model.named_parameters(): + if peft_type.lower() in name: + name = name.replace("base_model.model.model.", "").replace( + ".default", "" + ) + name = self.model_class.convert_hf_weight_name(name) + params.detach().cpu().numpy().tofile(f"{weights_path}/{name}") + + def download_peft_weights(): + for peft_model_id, peft_dict in self.pefts.items(): + peft_config = peft_dict["peft_config"] + peft_type = peft_config["peft_type"] + + weights_path = get_weights_path(peft_model_id) + refresh_cache_if_needed(peft_model_id) + ff_revision, ff_revision_file, latest_revision = self.__get_revision_hashes(peft_model_id, weights_path) + + if ff_revision != latest_revision: + print(f"'{peft_model_id}' local model weights need updating! Downloading/converting new weights now...") + hf_model = get_hf_llm(peft_model_id) + hf_peft_model = PeftModel.from_pretrained(hf_model, peft_model_id, config=peft_config) + # Convert the model to FlexFlow format + convert_peft_model(hf_peft_model, peft_type, weights_path) + # Save new revision hash to file + with open(ff_revision_file, "w+") as f: + f.write(latest_revision) + print(f"Done converting the weights for model {peft_model_id}") + # Deallocate hf model + del hf_peft_model + del hf_model + gc.collect() + torch.cuda.empty_cache() + + download_llm_weights() + download_peft_weights() def download_hf_tokenizer_if_needed(self): """Check in the folder specified by the cache_path whether the LLM's tokenizer files are available and up to date. @@ -250,37 +306,24 @@ def download_hf_tokenizer_if_needed(self): print("Loading tokenizer...") # Use local cache, or download new version - self.tokenizer_path = os.path.join( + tokenizer_path = os.path.join( os.path.expanduser(self.cache_path), "tokenizers", self.model_name.lower(), ) if self.refresh_cache: - print( - f"Discarding cached tokenizer files (if they exist) for model {self.model_name}..." - ) - if os.path.exists(self.tokenizer_path): - shutil.rmtree(self.tokenizer_path) - if not os.path.exists(self.tokenizer_path): - print(f"Creating directory {self.tokenizer_path} (if it doesn't exist)...") - os.makedirs(self.tokenizer_path, exist_ok=True) + print(f"Refreshing cached tokenizer for model {self.model_name} at path {tokenizer_path} ...") + if os.path.exists(tokenizer_path): + shutil.rmtree(tokenizer_path) + if not os.path.exists(tokenizer_path): + print(f"Creating directory {tokenizer_path} (if it doesn't exist)...") + os.makedirs(tokenizer_path, exist_ok=True) # Get local revision SHA, check if it matches latest one on huggingface - ff_revision, ff_revision_file, latest_revision = self.__get_revision_hashes( - self.model_name, weights=False - ) + ff_revision, ff_revision_file, latest_revision = self.__get_revision_hashes(self.model_name, tokenizer_path) if ff_revision != latest_revision: - if not os.path.exists(self.model_name) or os.path.isdir(self.model_name): - # Local model - print( - f"'{self.model_name}' tokenizer not found in cache or outdated. Downloading from huggingface.co ..." 
- ) - else: - # Remote model - print( - f"'{self.model_name}' local tokenizer was updated! Saving new tokenizer now..." - ) + print(f"'{self.model_name}' tokenizer needs updating! Downloading tokenizer now...") # Download tokenizer from HuggingFace, or load it from the local folder if self.model_type == ModelType.LLAMA: hf_tokenizer = LlamaTokenizer.from_pretrained( @@ -288,19 +331,13 @@ def download_hf_tokenizer_if_needed(self): ) else: hf_tokenizer = AutoTokenizer.from_pretrained(self.model_name) - # Print log message to notify user download of tokenizer has finished - if not os.path.exists(self.model_name) or os.path.isdir(self.model_name): - print("Done downloading tokenizer. Saving it now...") # Save tokenizer - hf_tokenizer.save_pretrained(self.tokenizer_path) - print("Done saving HF tokenizer.") + hf_tokenizer.save_pretrained(tokenizer_path) + print("Done updating HF tokenizer.") # Save new revision hash to file with open(ff_revision_file, "w+") as f: f.write(latest_revision) - else: - print(f"Loading '{self.model_name}' tokenizer from the cache...") - def compile( self, generation_config: GenerationConfig = GenerationConfig(), @@ -378,6 +415,12 @@ def compile( max_tokens_per_batch, ) + # Add PEFT layer if registered + for _, peft_dict in self.pefts.items(): + ff_peft_config = peft_dict["ff_peft_config"] + ff_peft_model_id = self.model.add_lora_layer(ff_peft_config) + peft_dict["ff_peft_model_id"] = ff_peft_model_id + # Download the weights from huggingface (if needed) self.download_hf_weights_if_needed() @@ -526,258 +569,3 @@ def compile( model_specific_pipeline_parallelism_degree, ssms, ) - - -class PEFT(LLM): - """This class creates a PEFT (parameter-efficient transformer) object to be used in concert with a LLM or SSM""" - - def __init__( - self, - base_model: LLM, - peft_model_id: str, - config: PeftConfig = None, - data_type: DataType = DataType.DT_HALF, - cache_path: str = "", - refresh_cache: bool = False, - ): - self.peft_model_id = peft_model_id - self.model_name = peft_model_id - self.hf_config = config if config is not None else PeftConfig.from_pretrained(peft_model_id) - self.peft_type = self.hf_config.peft_type - if self.peft_type != "LORA": - raise RuntimeError( - f"PEFT type {self.peft_type} not yet supported in FlexFlow" - ) - self.data_type = data_type - assert self.data_type == DataType.DT_HALF or self.data_type == DataType.DT_FLOAT - self.cache_path = cache_path if len(cache_path) > 0 else "~/.cache/flexflow" - self.refresh_cache = refresh_cache - # Base model related - if "base_model_name_or_path" not in self.hf_config.to_dict(): - raise ValueError( - f"PEFT model {peft_model_id} does not have an associated based model" - ) - self.base_model = base_model - if refresh_cache: - self.base_model.refresh_cache = True - - def download_hf_config(self): - """Save the HuggingFace model configs to a json file. 
Useful mainly to run the C++ inference code.""" - self.config_dir = os.path.join( - os.path.expanduser(self.cache_path), "configs", self.peft_model_id.lower() - ) - self.config_path = os.path.join(self.config_dir, "config.json") - os.makedirs(self.config_dir, exist_ok=True) - print(f"Creating directory {self.config_dir} (if it doesn't exist)...") - print(f"Saving {self.peft_model_id} configs to file {self.config_path}...") - with open(self.config_path, "w") as json_file: - - class SetEncoder(json.JSONEncoder): - def default(self, obj): - if isinstance(obj, set): - return list(obj) - return super().default(obj) - - json.dump(self.hf_config.to_dict(), json_file, indent=2, cls=SetEncoder) - - self.base_model.download_hf_config() - - def __get_revision_hashes(self, peft_model_id: str, weights: bool): - model_name = self.peft_model_id - return self._LLM__get_revision_hashes(model_name, weights) - - def convert_peft_model(self, hf_peft_model, weights_path): - for name, params in hf_peft_model.named_parameters(): - if self.peft_type.lower() in name: - name = name.replace("base_model.model.model.", "").replace( - ".default", "" - ) - name = self.base_model.model_class.convert_hf_weight_name(name) - params.detach().cpu().numpy().tofile(f"{weights_path}/{name}") - - def download_hf_weights_if_needed(self): - """Check in the folder specified by the cache_path whether the PEFT's model weights are available and up to date. - If not, or if the refresh_cache parameter is set to True, download new weights. - """ - self.base_model.download_hf_weights_if_needed() - - # Use local cache, or download new version - self.weights_path = os.path.join( - os.path.expanduser(self.cache_path), - "weights", - self.peft_model_id.lower(), - ( - "full-precision" - if self.data_type == DataType.DT_FLOAT - else "half-precision" - ), - ) - if self.refresh_cache: - print( - f"Refreshing weights in cache for model {self.peft_model_id} at path {self.weights_path} ..." - ) - if os.path.exists(self.weights_path): - shutil.rmtree(self.weights_path) - os.makedirs(self.weights_path, exist_ok=True) - #print(f"Creating directory {self.weights_path} (if it doesn't exist)...") - - ff_revision, ff_revision_file, latest_revision = self.__get_revision_hashes( - self.peft_model_id, - True - ) - - # Download if needed - if ff_revision != latest_revision: - if not os.path.exists(self.peft_model_id) or os.path.isdir( - self.peft_model_id - ): - print( - f"'{self.peft_model_id}' model weights not found in cache or outdated. Downloading from huggingface.co ..." - ) - else: - print( - f"'{self.peft_model_id}' local model weights were updated! Converting new weights now..." - ) - hf_base_model = AutoModelForCausalLM.from_pretrained( - self.base_model.model_name, - return_dict=True, - trust_remote_code=True, - torch_dtype=( - torch.float32 - if self.data_type == DataType.DT_FLOAT - else torch.float16 - ), - # device_map="auto", - ) - hf_peft_model = PeftModel.from_pretrained(hf_base_model, self.peft_model_id, config=self.hf_config) - # Print log message to notify user download of model has finished - if not os.path.exists(self.peft_model_id) or os.path.isdir( - self.peft_model_id - ): - print("Done downloading HF weights. 
Converting them now...") - # Convert the model to FlexFlow format - self.convert_peft_model(hf_peft_model, self.weights_path) - # Save new revision hash to file - with open(ff_revision_file, "w+") as f: - f.write(latest_revision) - print("Done converting the weights...") - # Deallocate hf model - del hf_peft_model - del hf_base_model - gc.collect() - torch.cuda.empty_cache() - else: - print(f"Loading '{self.peft_model_id}' model weights from the cache...") - - def download_hf_tokenizer_if_needed(self): - self.base_model.download_hf_tokenizer_if_needed() - - def compile( - self, - generation_config: GenerationConfig = GenerationConfig(), - max_requests_per_batch: int = 1, - max_seq_length: int = 256, - max_tokens_per_batch: int = 64, - model_specific_data_parallelism_degree: int = None, - model_specific_tensor_parallelism_degree: int = None, - model_specific_pipeline_parallelism_degree: int = None, - ssms: list = [], - ): - self.base_model.ssms = ssms - self.base_model.generation_config = GenerationConfig() - self.base_model.ffconfig = FFConfig() - if len(ssms) > 0: - assert type(self.base_model) == LLM - mode = InferenceMode.TREE_VERIFY_MODE - elif type(self.base_model) == SSM: - mode = InferenceMode.BEAM_SEARCH_MODE - else: - assert type(self.base_model) == LLM - mode = InferenceMode.INC_DECODING_MODE - - # Apply model-specific parallelism degrees, if needed - if model_specific_data_parallelism_degree: - self.base_model.ffconfig.data_parallelism_degree = ( - model_specific_data_parallelism_degree - ) - if model_specific_tensor_parallelism_degree: - self.base_model.ffconfig.tensor_parallelism_degree = ( - model_specific_tensor_parallelism_degree - ) - if model_specific_pipeline_parallelism_degree: - self.base_model.ffconfig.pipeline_parallelism_degree = ( - model_specific_pipeline_parallelism_degree - ) - - # Create request manager and set serving configuration - self.base_model.rm = RequestManager() - self.base_model.rm.set_max_requests_per_batch(max_requests_per_batch) - self.base_model.rm.set_max_tokens_per_batch(max_tokens_per_batch) - self.base_model.rm.set_max_sequence_length(max_seq_length) - - # Instantiate the relevant model - self.base_model.model = self.model_class( - mode, - generation_config, - self.base_model.ffconfig, - self.base_model.hf_config, - self.base_model.data_type, - max_tokens_per_batch, - ) - - # TODO: add peft layers - - # Download the weights from huggingface (if needed) - self.download_hf_weights_if_needed() - - # Create file data loader, load weights into tensors - model_configs = self.base_model.config_class(self.base_model.hf_config) - - self.fileloader = FileDataLoader( - self.weights_path, - model_configs.num_attention_heads, - model_configs.num_key_value_heads, - model_configs.hidden_size, - model_configs.hidden_size // model_configs.num_attention_heads, - self.ffconfig.tensor_parallelism_degree, - self.data_type == DataType.DT_FLOAT, - ) - - # Register weights file loader - self.im = InferenceManager() - self.im.register_model_weights_loader(self.model.ffmodel, self.fileloader) - - # Download the tokenizer from huggingface (if needed) and load them - self.download_hf_tokenizer_if_needed() - - # Create tokenizer (this must be done after we have downloaded the tokenizer - bos_token_id = ( - -1 if self.hf_config.bos_token_id is None else self.hf_config.bos_token_id - ) - eos_token_id = ( - -1 if self.hf_config.eos_token_id is None else self.hf_config.eos_token_id - ) - self.rm.register_tokenizer( - self.model_type, bos_token_id, eos_token_id, 
self.tokenizer_path - ) - self.rm.register_output_filepath(self.output_file) - - for ssm in self.ssms: - self.rm.register_ssm_model(ssm.model.ffmodel) - - # start background server - if (mode == InferenceMode.TREE_VERIFY_MODE) or ( - mode == InferenceMode.INC_DECODING_MODE - ): - import atexit - - atexit.register(self.rm.stop_server) - - def generate(self, prompts: Union[str, List[str]], max_length: int = 128): - super().generate(prompts, max_length) - - def start_server(self): - self.base_model.start_server() - - def stop_server(self): - self.base_model.stop_server() diff --git a/src/c/flexflow_c.cc b/src/c/flexflow_c.cc index 58acf3d010..60e33beb5e 100644 --- a/src/c/flexflow_c.cc +++ b/src/c/flexflow_c.cc @@ -67,6 +67,8 @@ class FFCObjectWrapper { FF_NEW_OPAQUE_WRAPPER(flexflow_request_manager_t, RequestManager *); FF_NEW_OPAQUE_WRAPPER(flexflow_file_data_loader_t, FileDataLoader *); FF_NEW_OPAQUE_WRAPPER(flexflow_generation_result_t, GenerationResult *); + FF_NEW_OPAQUE_WRAPPER(flexflow_lora_linear_config_t, LoraLinearConfig *); + FF_NEW_OPAQUE_WRAPPER(flexflow_peft_model_id_t, PEFTModelID *); }; Logger ffc_log("flexflow_c"); @@ -1542,6 +1544,21 @@ flexflow_tensor_t flexflow_model_add_argmax(flexflow_model_t handle_, return FFCObjectWrapper::wrap(tensor); } +flexflow_peft_model_id_t flexflow_model_add_lora_layer( + flexflow_model_t handle_, + const flexflow_lora_linear_config_t peft_config_) { + FFModel *handle = FFCObjectWrapper::unwrap(handle_); + LoraLinearConfig const *peft_config = FFCObjectWrapper::unwrap(peft_config_); + PEFTModelID *peft_model_id = handle->add_lora_layer(*peft_config); + + DEBUG_PRINT("[Add Lora Layer] model handle: %p, peft_config handle %p, " + "peft_model_id: %p", + handle, + peft_config, + peft_model_id); + return FFCObjectWrapper::wrap(peft_model_id); +} + void flexflow_model_set_sgd_optimizer(flexflow_model_t handle_, flexflow_sgd_optimizer_t optimizer_) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); @@ -2739,3 +2756,50 @@ void flexflow_file_data_loader_load_weights(flexflow_file_data_loader_t handle_, FFModel *model = FFCObjectWrapper::unwrap(model_handle_); handle->load_weights(model); } + +// ----------------------------------------------------------------------- +// LoraLinearConfig +// ----------------------------------------------------------------------- + +flexflow_lora_linear_config_t + flexflow_lora_linear_config_create(char const *cache_folder_, + char const *peft_model_id_) { + assert(cache_folder_ != nullptr && + "Cannot convert nullptr char * to std::string"); + assert(peft_model_id_ != nullptr && + "Cannot convert nullptr char * to std::string"); + std::string const cache_folder(cache_folder_); + std::string const peft_model_id(peft_model_id_); + LoraLinearConfig *handle = new LoraLinearConfig(cache_folder, peft_model_id); + DEBUG_PRINT("[LoraLinearConfig] new %p", handle); + return FFCObjectWrapper::wrap(handle); +} + +void flexflow_lora_linear_config_destroy( + flexflow_lora_linear_config_t handle_) { + LoraLinearConfig *peft_config = FFCObjectWrapper::unwrap(handle_); + DEBUG_PRINT("[LoraLinearConfig] delete %p", peft_config); + delete peft_config; +} + +// ----------------------------------------------------------------------- +// PEFTModelID +// ----------------------------------------------------------------------- + +flexflow_peft_model_id_t flexflow_peft_model_id_create() { + PEFTModelID *handle = new PEFTModelID(); + DEBUG_PRINT("[PEFTModelID] new %p", handle); + return FFCObjectWrapper::wrap(handle); +} + 
+flexflow_peft_model_id_t flexflow_peft_model_id_create_id(size_t id) { + PEFTModelID *handle = new PEFTModelID(id); + DEBUG_PRINT("[PEFTModelID] new %p", handle); + return FFCObjectWrapper::wrap(handle); +} + +void flexflow_peft_model_id_destroy(flexflow_peft_model_id_t handle_) { + PEFTModelID *peft_model_id = FFCObjectWrapper::unwrap(handle_); + DEBUG_PRINT("[PEFTModelID] delete %p", peft_model_id); + delete peft_model_id; +} diff --git a/src/ops/lora_linear.cc b/src/ops/lora_linear.cc index 39934f4cce..170e087226 100644 --- a/src/ops/lora_linear.cc +++ b/src/ops/lora_linear.cc @@ -51,15 +51,15 @@ bool check_lora_layer_match(Layer *potential_target, return false; } -PEFTModelID FFModel::add_lora_layer(LoraLinearConfig const peft_config) { +PEFTModelID *FFModel::add_lora_layer(LoraLinearConfig const peft_config) { assert(config.enable_peft && "Cannot add a LoRA layer if PEFT mode is not enabled"); if (peft_config.target_modules.size() == 0) { printf("PEFT config does not contain any target module\n"); - return PEFTModelID::NO_ID; + return nullptr; } - PEFTModelID peft_model_id(peft_model_global_guid++); - peft_configs[peft_model_id] = peft_config; + PEFTModelID *peft_model_id = new PEFTModelID(peft_model_global_guid++); + peft_configs[*peft_model_id] = peft_config; for (std::string target_module_name : peft_config.target_modules) { assert(target_module_name.length() > 0 && @@ -76,7 +76,7 @@ PEFTModelID FFModel::add_lora_layer(LoraLinearConfig const peft_config) { base_layer_to_peft_layer.end()) { // lora linear layer already added, no need to add again Layer *peft_layer = base_layer_to_peft_layer[target_module]; - peft_layer_to_peft_id[peft_layer].push_back(peft_model_id); + peft_layer_to_peft_id[peft_layer].push_back(*peft_model_id); } else { Tensor const input = target_module->inputs[0]; Tensor const output = target_module->outputs[0]; @@ -124,7 +124,7 @@ PEFTModelID FFModel::add_lora_layer(LoraLinearConfig const peft_config) { ++it; base_layer_to_peft_layer[target_module] = peft_layer; peft_layer_to_peft_id[peft_layer] = std::vector(); - peft_layer_to_peft_id[peft_layer].push_back(peft_model_id); + peft_layer_to_peft_id[peft_layer].push_back(*peft_model_id); } } } From 66573788643a0c669aff167ea8456c3f6099038c Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sun, 24 Mar 2024 22:50:49 +0000 Subject: [PATCH 25/32] update --- include/flexflow/ffconst.h | 5 ++ include/flexflow/request_manager.h | 3 +- inference/peft/peft.cc | 47 ++++++------ python/flexflow/core/flexflow_cffi.py | 103 ++++++++++++++++++++++++-- python/flexflow/serve/serve.py | 45 ++--------- python/flexflow/type.py | 3 + src/c/flexflow_c.cc | 80 ++++++++++++++------ src/runtime/request_manager.cc | 22 ++++-- 8 files changed, 207 insertions(+), 101 deletions(-) diff --git a/include/flexflow/ffconst.h b/include/flexflow/ffconst.h index 66e252db46..b16b9f9230 100644 --- a/include/flexflow/ffconst.h +++ b/include/flexflow/ffconst.h @@ -78,6 +78,11 @@ enum InferenceMode { TREE_VERIFY_MODE = 2003, }; +enum RequestType { + REQ_INFERENCE = 4001, + REQ_FINETUNING = 4002, +}; + // This is consistent with TASO's OpType // https://github.com/jiazhihao/TASO/blob/master/include/taso/ops.h#L75-L138 enum OperatorType { diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index 0e59888888..0ef5efcf27 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -65,7 +65,6 @@ struct Request { COMPLETED = 103, // finished and verified FINISHING = 104, // finishing request, 
but not yet verified }; - enum RequestType { REQ_INFERENCE = 201, REQ_FINETUNING = 202 }; BatchConfig::RequestGuid guid; PEFTModelID peft_model_id = PEFTModelID::NO_ID; int max_sequence_length = 128; @@ -81,7 +80,7 @@ struct Request { RequestType req_type = REQ_INFERENCE; int completed_training_steps = 0; int max_training_steps = 1; - std::vector> dataset_text; + std::string dataset_filepath; std::vector, std::vector>> dataset; diff --git a/inference/peft/peft.cc b/inference/peft/peft.cc index aa5581ca87..687cd92699 100644 --- a/inference/peft/peft.cc +++ b/inference/peft/peft.cc @@ -291,6 +291,9 @@ void FlexFlow::top_level_task(Task const *task, int total_num_requests = 0; { + std::vector requests; + + // Add inference requests using json = nlohmann::json; std::ifstream file_handle(file_paths.prompt_file_path); assert(file_handle.good() && "Prompt file does not exist."); @@ -298,28 +301,28 @@ void FlexFlow::top_level_task(Task const *task, /*parser_callback_t */ nullptr, /*allow_exceptions */ true, /*ignore_comments */ true); - - std::vector requests; - for (auto &prompt : prompt_json) { - std::string text = prompt.get(); - printf("Prompt[%d]: %s\n", total_num_requests, text.c_str()); - // Add inference request - // Request inference_req; - // inference_req.prompt = text; - // inference_req.max_sequence_length = 128; - // inference_req.peft_model_id = peft_model_id; - // requests.push_back(inference_req); - // total_num_requests++; - // Add fine-tuning request - Request fine_tuning_req; - fine_tuning_req.req_type = Request::RequestType::REQ_FINETUNING; - fine_tuning_req.max_sequence_length = 128; - fine_tuning_req.peft_model_id = - (peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID; - fine_tuning_req.dataset_text.push_back(std::make_pair(text, "")); - requests.push_back(fine_tuning_req); - total_num_requests++; - } + // for (auto &prompt : prompt_json) { + // std::string text = prompt.get(); + // printf("Prompt[%d]: %s\n", total_num_requests, text.c_str()); + // Request inference_req; + // inference_req.prompt = text; + // inference_req.max_sequence_length = 128; + // inference_req.peft_model_id = peft_model_id; + // requests.push_back(inference_req); + // total_num_requests++; + // } + + // Add fine-tuning request + Request fine_tuning_req; + fine_tuning_req.req_type = RequestType::REQ_FINETUNING; + fine_tuning_req.max_sequence_length = 128; + fine_tuning_req.peft_model_id = + (peft_model_id != nullptr) ? 
*peft_model_id : PEFTModelID::NO_ID; + fine_tuning_req.dataset_filepath = file_paths.prompt_file_path; + fine_tuning_req.max_training_steps = 1; + requests.push_back(fine_tuning_req); + total_num_requests++; + std::vector result = model.generate(requests); } diff --git a/python/flexflow/core/flexflow_cffi.py b/python/flexflow/core/flexflow_cffi.py index ef0ee0e378..ccb50dd566 100644 --- a/python/flexflow/core/flexflow_cffi.py +++ b/python/flexflow/core/flexflow_cffi.py @@ -28,6 +28,7 @@ CompMode, MetricsType, InferenceMode, + RequestType, ModelType, OpType, ParameterSyncType, @@ -36,7 +37,7 @@ ) from flexflow.config import * from .flexflowlib import ffi, flexflow_library - +from typing import Union, List def ffc(): if not flexflow_already_initialized(): @@ -3823,27 +3824,57 @@ def get_output_tensor(self, ffmodel, data_type): assert ret_val == True return np_array - def generate(self, prompt_list, max_sequence_length): + def generate_inf_only(self, prompt_list: List[str], max_sequence_length: int = 128): assert isinstance(prompt_list, list) c_input_texts = [get_c_name(prompt) for prompt in prompt_list] max_num_chars = 5 * (max_sequence_length + 100) c_output_texts = [ffi.new("char[]", max_num_chars) for prompt in prompt_list] c_output_length_and_tokens = [ffi.new("int[]", max_sequence_length + 100) for prompt in prompt_list] + c_request_types = [enum_to_int(RequestType, RequestType.REQ_INFERENCE) for prompt in prompt_list] + max_sequence_lengths = [max_sequence_length for prompt in prompt_list] + peft_model_ids = [None for prompt in prompt_list] + dataset_filepaths = [None for prompt in prompt_list] + training_steps = [0 for prompt in prompt_list] ffc().flexflow_model_generate( self.handle, len(prompt_list), + c_request_types, c_input_texts, max_num_chars, c_output_texts, - max_sequence_length, + max_sequence_lengths, + peft_model_ids, + dataset_filepaths, + training_steps, c_output_length_and_tokens, ) - #output_length = c_output_length_and_tokens[0] - #output_tokens = [] - #for i in range(output_length): - # output_tokens.append(c_output_length_and_tokens[i + 1]) from flexflow.serve import GenerationResult - + return [GenerationResult(ffi.string(c_output_text), []) for c_output_text in c_output_texts] + + def generate(self, requests_list: List[Request]): + assert isinstance(requests_list, list) + c_input_texts = [get_c_name(request.prompt) for request in requests_list] + max_num_chars = 5 * (max_sequence_length + 100) + c_output_texts = [ffi.new("char[]", max_num_chars) for prompt in prompt_list] + c_output_length_and_tokens = [ffi.new("int[]", max_sequence_length + 100) for prompt in prompt_list] + c_request_types = [enum_to_int(RequestType, RequestType.REQ_INFERENCE) for prompt in prompt_list] + max_sequence_lengths = [max_sequence_length for prompt in prompt_list] + peft_model_ids = [None for prompt in prompt_list] + dataset_filepaths = [None for prompt in prompt_list] + training_steps = [0 for prompt in prompt_list] + ffc().flexflow_model_generate( + self.handle, + len(prompt_list), + c_request_types, + c_input_texts, + max_num_chars, + c_output_texts, + max_sequence_lengths, + peft_model_ids, + dataset_filepaths, + training_steps, + c_output_length_and_tokens, + ) return [GenerationResult(ffi.string(c_output_text), []) for c_output_text in c_output_texts] def set_position_offset(self, offset): @@ -4288,6 +4319,47 @@ def load_weights(self, model): self.handle, model.handle ) +# ----------------------------------------------------------------------- +# GenerationConfig +# 
----------------------------------------------------------------------- + +class GenerationConfig(object): + """A class to store the sampling configs.""" + + def __init__( + self, + do_sample: bool = False, + temperature: float = 0.9, + topp: float = 0.8, + topk: int = 1, + ): + """Initialize the sampling configs + + :param do_sample: Whether to perform sampling, or use greedy decoding, defaults to False + :type do_sample: bool, optional + :param temperature: The temperature setting, defaults to 0.9 + :type temperature: float, optional + :param topp: The top probabilities (top-p) setting, defaults to 0.8 + :type topp: float, optional + :param topk: The top-k setting, defaults to 1 + :type topk: int, optional + """ + self.do_sample = do_sample + self.temperature = temperature + self.topp = topp + self.topk = topk + +# ----------------------------------------------------------------------- +# GenerationResult +# ----------------------------------------------------------------------- + +class GenerationResult(object): + """A class to store the output of a generation request.""" + + def __init__(self, text: str = None, tokens: list = None): + self.output_text = text + self.output_tokens = tokens + # ----------------------------------------------------------------------- # LoraLinearConfig # ----------------------------------------------------------------------- @@ -4321,3 +4393,18 @@ def __init__(self, id=None): else: self.handle = ffc().flexflow_peft_model_id_create_id(id) self._handle = ffi.gc(self.handle, ffc().flexflow_peft_model_id_destroy) + +# ----------------------------------------------------------------------- +# Request +# ----------------------------------------------------------------------- + +class Request: + """A class to record the metadata of an inference or finetuning request.""" + + def __init__(self, req_type: RequestType, prompt: str = None, max_sequence_length: int = None, peft_model_id: PEFTModelID = None, dataset_filepath: str = None, max_training_steps: int = None): + self.req_type = req_type + self.prompt = prompt + self.max_sequence_length = max_sequence_length + self.peft_model_id = peft_model_id + self.dataset_filepath = dataset_filepath + self.max_training_steps = max_training_steps \ No newline at end of file diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py index bc7a796315..b38f0b574f 100644 --- a/python/flexflow/serve/serve.py +++ b/python/flexflow/serve/serve.py @@ -34,41 +34,6 @@ from typing import Union, List -class GenerationConfig: - """A class to store the sampling configs.""" - - def __init__( - self, - do_sample: bool = False, - temperature: float = 0.9, - topp: float = 0.8, - topk: int = 1, - ): - """Initialize the sampling configs - - :param do_sample: Whether to perform sampling, or use greedy decoding, defaults to False - :type do_sample: bool, optional - :param temperature: The temperature setting, defaults to 0.9 - :type temperature: float, optional - :param topp: The top probabilities (top-p) setting, defaults to 0.8 - :type topp: float, optional - :param topk: The top-k setting, defaults to 1 - :type topk: int, optional - """ - self.do_sample = do_sample - self.temperature = temperature - self.topp = topp - self.topk = topk - - -class GenerationResult: - """A class to store the output of a generation request.""" - - def __init__(self, text: str = None, tokens: list = None): - self.output_text = text - self.output_tokens = tokens - - class _SupportedModels: def __init__( self, @@ -467,22 +432,22 @@ def compile( 
         atexit.register(self.rm.stop_server)
 
-    def generate(self, prompts: Union[str, List[str]], max_length: int = 128):
+    def generate(self, prompts: Union[str, List[str], Request, List[Request]], max_length: int = 128):
         """Generate tokens based on the input prompt(s)
 
-        :param prompts: The generation prompt(s) in the form of a string, or list of strings
-        :type prompts: Union[str, List[str]]
+        :param prompts: The generation prompt(s) in the form of a string, a list of strings, a Request, or list of Requests
+        :type prompts: Union[str, List[str], Request, List[Request]]
         :return: the generation results
         :rtype: GenerationResult
         """
         if type(prompts) == str:
             if len(prompts) == 0:
                 return None
-            return self.model.ffmodel.generate([prompts], max_length)
+            return self.model.ffmodel.generate_inf_only([prompts], max_length)
         elif type(prompts) == list:
             if len(prompts) == 0:
                 return []
-            return self.model.ffmodel.generate(prompts, max_length)
+            return self.model.ffmodel.generate_inf_only(prompts, max_length)
         else:
             assert False, "Please pass a non-empty string or list of strings"
diff --git a/python/flexflow/type.py b/python/flexflow/type.py
index 994a85f57e..ac6975b4fd 100644
--- a/python/flexflow/type.py
+++ b/python/flexflow/type.py
@@ -152,6 +152,9 @@ class OpType(Enum):
     RESIDUAL_RMS_NORM = 2305
     RESIDUAL_LAYERNORM = 2306
 
+class RequestType(Enum):
+    REQ_INFERENCE = 4001
+    REQ_FINETUNING = 4002
 
 def enum_to_int(enum, enum_item):
     for item in enum:
diff --git a/src/c/flexflow_c.cc b/src/c/flexflow_c.cc
index 60e33beb5e..d592cdd3ee 100644
--- a/src/c/flexflow_c.cc
+++ b/src/c/flexflow_c.cc
@@ -1614,43 +1614,75 @@ void flexflow_model_set_transformer_layer_id(flexflow_model_t handle_, int id) {
 
 void flexflow_model_generate(flexflow_model_t handle_,
                              int num_requests,
+                             enum RequestType *request_types,
                              char const **input_texts,
                              int max_num_chars,
                              char **output_texts,
-                             int max_seq_length,
+                             int *max_seq_lengths,
+                             flexflow_peft_model_id_t *peft_model_ids,
+                             char const **dataset_filepaths,
+                             int *training_steps,
                              int **output_length_and_tokens) {
   FFModel *handle = FFCObjectWrapper::unwrap(handle_);
   std::vector<Request> requests;
+
+  int finetuning_req_idx = 0;
   for (int i = 0; i < num_requests; i++) {
-    std::string const text_str(input_texts[i]);
-    Request inference_req;
-    inference_req.prompt = text_str;
-    inference_req.max_sequence_length = max_seq_length;
-    requests.push_back(inference_req);
-    DEBUG_PRINT("[Model] generate[%d] %p %s %i",
-                i,
-                handle,
-                text_str.c_str(),
-                max_seq_length);
+    if (request_types[i] == RequestType::REQ_INFERENCE) {
+      std::string const text_str(input_texts[i]);
+      Request inference_req;
+      inference_req.prompt = text_str;
+      inference_req.max_sequence_length = max_seq_lengths[i];
+      if (peft_model_ids[i] != nullptr) {
+        PEFTModelID *peft_model_id = FFCObjectWrapper::unwrap(peft_model_ids[i]);
+        inference_req.peft_model_id = *peft_model_id;
+      }
+      requests.push_back(inference_req);
+      DEBUG_PRINT("[Model] generate[%d] %p %s %i",
+                  i,
+                  handle,
+                  text_str.c_str(),
+                  max_seq_lengths[i]);
+    } else {
+      Request fine_tuning_req;
+      fine_tuning_req.req_type = RequestType::REQ_FINETUNING;
+      fine_tuning_req.max_sequence_length = max_seq_lengths[i];
+      if (peft_model_ids[i] != nullptr) {
+        PEFTModelID *peft_model_id = FFCObjectWrapper::unwrap(peft_model_ids[i]);
+        fine_tuning_req.peft_model_id = *peft_model_id;
+      }
+      std::string const dataset_fp(dataset_filepaths[finetuning_req_idx]);
+      fine_tuning_req.dataset_filepath = dataset_fp;
+      fine_tuning_req.max_training_steps = training_steps[finetuning_req_idx];
+      requests.push_back(finetuning_req_idx);
+      DEBUG_PRINT("[Model] generate[%d] %p %s %i %i",
+                  i,
+                  handle,
+                  dataset_fp.c_str(),
+                  max_seq_lengths[i],
+                  training_steps[finetuning_req_idx]);
+      finetuning_req_idx++;
+    }
   }
   std::vector<GenerationResult> results = handle->generate(requests);
-  // If the prompt exceeds max seq len, check that we return the prompt with no
-  // additional token. Otherwise, check that the output does not exceed the max
-  // sequence length.
   for (int i = 0; i < num_requests; i++) {
-    assert(results[i].output_tokens.size() <= max_seq_length ||
-           results[i].output_tokens.size() == results[i].input_tokens.size());
-    output_length_and_tokens[i][0] = results[i].output_tokens.size();
-    std::copy(results[i].output_tokens.begin(),
-              results[i].output_tokens.end(),
-              output_length_and_tokens[i] + 1);
-    std::memcpy(output_texts[i],
-                results[i].output_text.c_str(),
-                results[i].output_text.length());
+    if (request_types[i] == RequestType::REQ_INFERENCE) {
+      // If the prompt exceeds max seq len, check that we return the prompt with no
+      // additional token. Otherwise, check that the output does not exceed the max
+      // sequence length.
+      assert(results[i].output_tokens.size() <= max_seq_length ||
+             results[i].output_tokens.size() == results[i].input_tokens.size());
+      output_length_and_tokens[i][0] = results[i].output_tokens.size();
+      std::copy(results[i].output_tokens.begin(),
+                results[i].output_tokens.end(),
+                output_length_and_tokens[i] + 1);
+      std::memcpy(output_texts[i],
+                  results[i].output_text.c_str(),
+                  results[i].output_text.length());
+    }
   }
-  // return FFCObjectWrapper::wrap(&results[0]);
 }
 
 void flexflow_model_set_position_offset(flexflow_model_t handle_,
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index c335cd246b..8fb040fb6d 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -244,15 +244,27 @@ RequestManager::RequestGuid
   request.peft_model_id = request_.peft_model_id;
   request.req_type = Request::REQ_FINETUNING;
   request.completed_training_steps = 0;
-  request.max_training_steps = 1; // TODO: let user set this
-  for (auto const &sample : request_.dataset_text) {
+  request.max_training_steps = request_.max_training_steps;
+  request.dataset_filepath = request_.dataset_filepath;
+
+  // Load dataset
+  using json = nlohmann::json;
+  std::ifstream file_handle(request.dataset_filepath);
+  assert(file_handle.good() && "Dataset file does not exist.");
+  json dataset_json = json::parse(file_handle,
+                                  /*parser_callback_t */ nullptr,
+                                  /*allow_exceptions */ true,
+                                  /*ignore_comments */ true);
+
+  for (auto &prompt : dataset_json) {
+    std::string text = prompt.get<std::string>();
+    std::string output_text("");
     std::vector input_tokens;
-    input_tokens = this->tokenizer_->Encode(sample.first);
+    input_tokens = this->tokenizer_->Encode(text);
     if (bos_token_id >= 0 && model_type != ModelType::FALCON) {
       input_tokens.insert(input_tokens.begin(), bos_token_id);
     }
-    std::vector output_tokens =
-        this->tokenizer_->Encode(sample.second);
+    std::vector output_tokens = this->tokenizer_->Encode(output_text);
     if (input_tokens.size() + output_tokens.size() >
         get_max_sequence_length()) {
       std::cout << "Warning: too many tokens in sample, only load up to "
From 7b8a9ee31cd0e51150a13251269689307feb925c Mon Sep 17 00:00:00 2001
From: Gabriele Oliaro
Date: Mon, 25 Mar 2024 00:51:27 +0000
Subject: [PATCH 26/32] update

---
 python/flexflow/core/flexflow_cffi.py | 23 ++++++++++-------------
 src/c/flexflow_c.cc                   |  1 -
 2 files changed, 10 insertions(+), 14 deletions(-)

diff 
--git a/python/flexflow/core/flexflow_cffi.py b/python/flexflow/core/flexflow_cffi.py index ccb50dd566..981f2be9ef 100644 --- a/python/flexflow/core/flexflow_cffi.py +++ b/python/flexflow/core/flexflow_cffi.py @@ -3840,7 +3840,6 @@ def generate_inf_only(self, prompt_list: List[str], max_sequence_length: int = 1 len(prompt_list), c_request_types, c_input_texts, - max_num_chars, c_output_texts, max_sequence_lengths, peft_model_ids, @@ -3853,21 +3852,19 @@ def generate_inf_only(self, prompt_list: List[str], max_sequence_length: int = 1 def generate(self, requests_list: List[Request]): assert isinstance(requests_list, list) - c_input_texts = [get_c_name(request.prompt) for request in requests_list] - max_num_chars = 5 * (max_sequence_length + 100) - c_output_texts = [ffi.new("char[]", max_num_chars) for prompt in prompt_list] - c_output_length_and_tokens = [ffi.new("int[]", max_sequence_length + 100) for prompt in prompt_list] - c_request_types = [enum_to_int(RequestType, RequestType.REQ_INFERENCE) for prompt in prompt_list] - max_sequence_lengths = [max_sequence_length for prompt in prompt_list] - peft_model_ids = [None for prompt in prompt_list] - dataset_filepaths = [None for prompt in prompt_list] - training_steps = [0 for prompt in prompt_list] + c_input_texts = [get_c_name(request.prompt) for request in requests_list] # entry will be None for finetuning requests + c_output_texts = [ffi.new("char[]", 5 * (request.max_sequence_length + 100)) if request.req_type == RequestType.REQ_INFERENCE else ffi.NULL for request in requests_list] + c_output_length_and_tokens = [ffi.new("int[]", request.max_sequence_length + 100) for request in requests_list] + c_request_types = [enum_to_int(RequestType, request.req_type) for request in requests_list] + max_sequence_lengths = [request.max_sequence_length for request in requests_list] + peft_model_ids = [request.peft_model_id for request in requests_list] + dataset_filepaths = [request.dataset_filepath for request in requests_list] + training_steps = [request.max_training_steps for request in requests_list] ffc().flexflow_model_generate( self.handle, - len(prompt_list), + len(requests_list), c_request_types, c_input_texts, - max_num_chars, c_output_texts, max_sequence_lengths, peft_model_ids, @@ -3875,7 +3872,7 @@ def generate(self, requests_list: List[Request]): training_steps, c_output_length_and_tokens, ) - return [GenerationResult(ffi.string(c_output_text), []) for c_output_text in c_output_texts] + return [GenerationResult(ffi.string(c_output_text), []) if c_output_text != ffi.NULL else None for c_output_text in c_output_texts] def set_position_offset(self, offset): ffc().flexflow_model_set_position_offset(self.handle, offset) diff --git a/src/c/flexflow_c.cc b/src/c/flexflow_c.cc index d592cdd3ee..44fdd5af4e 100644 --- a/src/c/flexflow_c.cc +++ b/src/c/flexflow_c.cc @@ -1616,7 +1616,6 @@ void flexflow_model_generate(flexflow_model_t handle_, int num_requests, enum RequestType *request_types, char const **input_texts, - int max_num_chars, char **output_texts, int *max_seq_lengths, flexflow_peft_model_id_t *peft_model_ids, From 22d4d8ef112ef5291fb78fb2e523027c541df84b Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 25 Mar 2024 03:43:40 +0000 Subject: [PATCH 27/32] update --- python/flexflow/serve/serve.py | 99 ++++++++++++++++++++++++---------- 1 file changed, 70 insertions(+), 29 deletions(-) diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py index b38f0b574f..a02facc356 100644 --- a/python/flexflow/serve/serve.py 
+++ b/python/flexflow/serve/serve.py @@ -120,7 +120,9 @@ def add_peft(self, peft_model_id: str): f"PEFT model {peft_model_id} does not have an associated base model" ) if peft_config.base_model_name_or_path != self.model_name: - raise RuntimeError(f"Attempting to add PEFT with base model name {peft_config.base_model_name_or_path} to LLM {self.model_name}") + raise RuntimeError( + f"Attempting to add PEFT with base model name {peft_config.base_model_name_or_path} to LLM {self.model_name}" + ) ff_peft_config = LoraLinearConfig(self.cache_path, peft_model_id) peft_dict = { "peft_config": peft_config, @@ -139,24 +141,30 @@ def download_hf_config(self): print(f"Creating directory {config_dir} (if it doesn't exist)...") print(f"Saving {self.model_name} configs to file {config_path}...") self.hf_config.to_json_file(config_path) - + # Save PEFT configs if the LLM has any registered PEFTs for peft_model_id, peft_dict in self.pefts.items(): peft_config = peft_dict["hf_config"] - peft_config_path = os.path.join(os.path.expanduser(self.cache_path), "configs", self.peft_model_id.lower()) + peft_config_path = os.path.join( + os.path.expanduser(self.cache_path), + "configs", + self.peft_model_id.lower(), + ) print(f"Saving {peft_model_id} configs to file {peft_config_path}...") with open(peft_config_path, "w") as json_file: + class SetEncoder(json.JSONEncoder): def default(self, obj): if isinstance(obj, set): return list(obj) return super().default(obj) + json.dump(peft_config.to_dict(), json_file, indent=2, cls=SetEncoder) def __get_revision_hashes(self, model_name: str, folder: str): ff_revision = None ff_revision_file = os.path.join(folder, "rev_sha.txt") - + if os.path.exists(ff_revision_file): ff_revision = "".join(open(ff_revision_file).read().split()) @@ -179,8 +187,12 @@ def download_hf_weights_if_needed(self): If any PEFT adapter is registered, perform the same operation for PEFT. """ + def get_weights_path(model_name): - return os.path.join(os.path.expanduser(self.cache_path), "weights", model_name.lower(), + return os.path.join( + os.path.expanduser(self.cache_path), + "weights", + model_name.lower(), ( "full-precision" if self.data_type == DataType.DT_FLOAT @@ -197,7 +209,7 @@ def refresh_cache_if_needed(model_name): if os.path.exists(weights_path): shutil.rmtree(weights_path) os.makedirs(weights_path, exist_ok=True) - + def get_hf_llm(model_name): return AutoModelForCausalLM.from_pretrained( model_name, @@ -208,13 +220,17 @@ def get_hf_llm(model_name): else torch.float16 ), ) - + def download_llm_weights(): weights_path = get_weights_path(self.model_name) refresh_cache_if_needed(self.model_name) - ff_revision, ff_revision_file, latest_revision = self.__get_revision_hashes(self.model_name, weights_path) + ff_revision, ff_revision_file, latest_revision = self.__get_revision_hashes( + self.model_name, weights_path + ) if ff_revision != latest_revision: - print(f"'{self.model_name}' local model weights need updating! Downloading/converting new weights now...") + print( + f"'{self.model_name}' local model weights need updating! Downloading/converting new weights now..." 
+ ) hf_model = get_hf_llm(self.model_name) # Convert the model to FlexFlow format self.model_class.convert_hf_model(hf_model, weights_path) @@ -226,7 +242,7 @@ def download_llm_weights(): del hf_model gc.collect() torch.cuda.empty_cache() - + def convert_peft_model(hf_peft_model, peft_type, weights_path): for name, params in hf_peft_model.named_parameters(): if peft_type.lower() in name: @@ -235,20 +251,26 @@ def convert_peft_model(hf_peft_model, peft_type, weights_path): ) name = self.model_class.convert_hf_weight_name(name) params.detach().cpu().numpy().tofile(f"{weights_path}/{name}") - + def download_peft_weights(): for peft_model_id, peft_dict in self.pefts.items(): peft_config = peft_dict["peft_config"] peft_type = peft_config["peft_type"] - + weights_path = get_weights_path(peft_model_id) refresh_cache_if_needed(peft_model_id) - ff_revision, ff_revision_file, latest_revision = self.__get_revision_hashes(peft_model_id, weights_path) - + ff_revision, ff_revision_file, latest_revision = ( + self.__get_revision_hashes(peft_model_id, weights_path) + ) + if ff_revision != latest_revision: - print(f"'{peft_model_id}' local model weights need updating! Downloading/converting new weights now...") + print( + f"'{peft_model_id}' local model weights need updating! Downloading/converting new weights now..." + ) hf_model = get_hf_llm(peft_model_id) - hf_peft_model = PeftModel.from_pretrained(hf_model, peft_model_id, config=peft_config) + hf_peft_model = PeftModel.from_pretrained( + hf_model, peft_model_id, config=peft_config + ) # Convert the model to FlexFlow format convert_peft_model(hf_peft_model, peft_type, weights_path) # Save new revision hash to file @@ -260,7 +282,7 @@ def download_peft_weights(): del hf_model gc.collect() torch.cuda.empty_cache() - + download_llm_weights() download_peft_weights() @@ -277,7 +299,9 @@ def download_hf_tokenizer_if_needed(self): self.model_name.lower(), ) if self.refresh_cache: - print(f"Refreshing cached tokenizer for model {self.model_name} at path {tokenizer_path} ...") + print( + f"Refreshing cached tokenizer for model {self.model_name} at path {tokenizer_path} ..." + ) if os.path.exists(tokenizer_path): shutil.rmtree(tokenizer_path) if not os.path.exists(tokenizer_path): @@ -285,10 +309,14 @@ def download_hf_tokenizer_if_needed(self): os.makedirs(tokenizer_path, exist_ok=True) # Get local revision SHA, check if it matches latest one on huggingface - ff_revision, ff_revision_file, latest_revision = self.__get_revision_hashes(self.model_name, tokenizer_path) + ff_revision, ff_revision_file, latest_revision = self.__get_revision_hashes( + self.model_name, tokenizer_path + ) if ff_revision != latest_revision: - print(f"'{self.model_name}' tokenizer needs updating! Downloading tokenizer now...") + print( + f"'{self.model_name}' tokenizer needs updating! Downloading tokenizer now..." 
+ ) # Download tokenizer from HuggingFace, or load it from the local folder if self.model_type == ModelType.LLAMA: hf_tokenizer = LlamaTokenizer.from_pretrained( @@ -432,22 +460,35 @@ def compile( atexit.register(self.rm.stop_server) - def generate(self, prompts: Union[str, List[str], Request, List[Request]], max_length: int = 128): + def generate( + self, + requests_or_prompts: Union[str, List[str], Request, List[Request]], + max_length: int = 128, + ): """Generate tokens based on the input prompt(s) - :param prompts: The generation prompt(s) in the form of a string, a list of strings, a Request, or list of Requests - :type prompts: Union[str, List[str], Request, List[Request]] + :param requests_or_prompts: The generation prompt(s) in the form of a string, a list of strings, a Request, or list of Requests + :type requests_or_prompts: Union[str, List[str], Request, List[Request]] :return: the generation results :rtype: GenerationResult """ - if type(prompts) == str: - if len(prompts) == 0: + if type(requests_or_prompts) == str: + if len(requests_or_prompts) == 0: return None - return self.model.ffmodel.generate_inf_only([prompts], max_length) - elif type(prompts) == list: - if len(prompts) == 0: + return self.model.ffmodel.generate_inf_only( + [requests_or_prompts], max_length + ) + elif type(requests_or_prompts) == Request: + return self.model.ffmodel.generate(requests_or_prompts) + elif type(requests_or_prompts) == list: + if len(requests_or_prompts) == 0: return [] - return self.model.ffmodel.generate_inf_only(prompts, max_length) + if type(requests_or_prompts[0]) == str: + return self.model.ffmodel.generate_inf_only( + requests_or_prompts, max_length + ) + else: + return self.model.ffmodel.generate(requests_or_prompts) else: assert False, "Please pass a non-empty string or list of strings" From dd971e778c766651d1c3bd76a44a4845811145a3 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 25 Mar 2024 19:19:30 +0000 Subject: [PATCH 28/32] updates --- include/flexflow/ffconst.h | 2 +- include/flexflow/flexflow_c.h | 11 +- inference/peft/peft.cc | 6 +- inference/python/ff_peft.py | 146 + python/flexflow/core/flexflow_cffi.py | 4862 +++++++++++++------------ python/flexflow/serve/__init__.py | 11 +- python/flexflow/serve/serve.py | 6 +- src/c/flexflow_c.cc | 24 +- src/runtime/request_manager.cc | 20 +- tests/peft_test.sh | 4 + 10 files changed, 2660 insertions(+), 2432 deletions(-) create mode 100644 inference/python/ff_peft.py diff --git a/include/flexflow/ffconst.h b/include/flexflow/ffconst.h index b16b9f9230..016dd7bdd1 100644 --- a/include/flexflow/ffconst.h +++ b/include/flexflow/ffconst.h @@ -78,7 +78,7 @@ enum InferenceMode { TREE_VERIFY_MODE = 2003, }; -enum RequestType { +enum RequestType { REQ_INFERENCE = 4001, REQ_FINETUNING = 4002, }; diff --git a/include/flexflow/flexflow_c.h b/include/flexflow/flexflow_c.h index 1ceea59839..8150e05dd1 100644 --- a/include/flexflow/flexflow_c.h +++ b/include/flexflow/flexflow_c.h @@ -618,10 +618,13 @@ void flexflow_model_set_transformer_layer_id(flexflow_model_t handle, int id); void flexflow_model_generate(flexflow_model_t handle_, int num_requests, - char const **input_text, - int max_num_chars, - char **output_text, - int max_seq_length, + enum RequestType *request_types, + char const **input_texts, + char **output_texts, + int *max_seq_lengths, + flexflow_peft_model_id_t *peft_model_ids, + char const **dataset_filepaths, + int *training_steps, int **output_length_and_tokens); void flexflow_model_set_position_offset(flexflow_model_t 
handle, int offset); diff --git a/inference/peft/peft.cc b/inference/peft/peft.cc index 687cd92699..eade2eaeeb 100644 --- a/inference/peft/peft.cc +++ b/inference/peft/peft.cc @@ -292,7 +292,7 @@ void FlexFlow::top_level_task(Task const *task, int total_num_requests = 0; { std::vector requests; - + // Add inference requests using json = nlohmann::json; std::ifstream file_handle(file_paths.prompt_file_path); @@ -311,7 +311,7 @@ void FlexFlow::top_level_task(Task const *task, // requests.push_back(inference_req); // total_num_requests++; // } - + // Add fine-tuning request Request fine_tuning_req; fine_tuning_req.req_type = RequestType::REQ_FINETUNING; @@ -322,7 +322,7 @@ void FlexFlow::top_level_task(Task const *task, fine_tuning_req.max_training_steps = 1; requests.push_back(fine_tuning_req); total_num_requests++; - + std::vector result = model.generate(requests); } diff --git a/inference/python/ff_peft.py b/inference/python/ff_peft.py new file mode 100644 index 0000000000..18ef8bbf33 --- /dev/null +++ b/inference/python/ff_peft.py @@ -0,0 +1,146 @@ +# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import flexflow.serve as ff +import argparse, json, os +from types import SimpleNamespace + + +def get_configs(): + parser = argparse.ArgumentParser() + parser.add_argument( + "-config-file", + help="The path to a JSON file with the configs. 
If omitted, a sample model and configs will be used instead.", + type=str, + default="", + ) + args = parser.parse_args() + + # Load configs from JSON file (if specified) + if len(args.config_file) > 0: + if not os.path.isfile(args.config_file): + raise FileNotFoundError(f"Config file {args.config_file} not found.") + try: + with open(args.config_file) as f: + return json.load(f) + except json.JSONDecodeError as e: + print("JSON format error:") + print(e) + else: + # Define sample configs + ff_init_configs = { + # required parameters + "num_gpus": 2, + "memory_per_gpu": 14000, + "zero_copy_memory_per_node": 40000, + # optional parameters + "num_cpus": 4, + "legion_utility_processors": 4, + "data_parallelism_degree": 1, + "tensor_parallelism_degree": 1, + "pipeline_parallelism_degree": 2, + "offload": False, + "offload_reserve_space_size": 8 * 1024, # 8GB + "use_4bit_quantization": False, + "use_8bit_quantization": False, + "enable_peft": False, + "peft_activation_reserve_space_size": 1024, # 1GB + "peft_weight_reserve_space_size": 1024, # 1GB + "profiling": False, + "inference_debugging": False, + "fusion": True, + } + model_configs = { + # required parameters + "base_model": "JackFram/llama-160m", + "peft_model_ids": [ + "goliaro/llama-160m-lora-full", + ], + # optional parameters + "cache_path": "", + "refresh_cache": False, + "full_precision": False, + "prompt": "", + "finetuning_dataset": os.path.join( + os.path.dirname(os.path.abspath(__file__)), "../prompt/peft.json" + ), + "output_file": "", + } + # Merge dictionaries + ff_init_configs.update(model_configs) + return ff_init_configs + + +def main(): + configs_dict = get_configs() + configs = SimpleNamespace(**configs_dict) + + # Initialize the FlexFlow runtime. ff.init() takes a dictionary or the path to a JSON file with the configs + ff.init(configs_dict) + + # Create the FlexFlow LLM + ff_data_type = ( + ff.DataType.DT_FLOAT if configs.full_precision else ff.DataType.DT_HALF + ) + llm = ff.LLM( + configs.base_model, + data_type=ff_data_type, + cache_path=configs.cache_path, + refresh_cache=configs.refresh_cache, + output_file=configs.output_file, + ) + for peft_model_id in configs.peft_model_ids: + llm.add_peft(peft_model_id) + + # Compile the LLM for inference and load the weights into memory + generation_config = ff.GenerationConfig( + do_sample=False, temperature=0.9, topp=0.8, topk=1 + ) + llm.compile( + generation_config, + max_requests_per_batch=1, + max_seq_length=256, + max_tokens_per_batch=64, + ) + + llm.start_server() + + requests = [] + # Serving + if len(configs.prompt) > 0: + prompts = [s for s in json.load(open(configs.prompt))] + inference_requests = [ + Request(RequestType.REQ_INFERENCE, prompt=prompt, max_sequence_length=128) + for prompt in prompts + ] + requests += inference_requests + # Finetuning + if len(configs.finetuning_dataset) > 0: + for peft_model_id in configs.peft_model_ids: + finetuning_request = Request( + RequestType.REQ_FINETUNING, + max_sequence_length=128, + peft_model_id=peft_model_id, + dataset_filepath=configs.finetuning_dataset, + ) + requests.append(finetuning_request) + + llm.generate(requests) + + llm.stop_server() + + +if __name__ == "__main__": + print("flexflow PEFT example") + main() diff --git a/python/flexflow/core/flexflow_cffi.py b/python/flexflow/core/flexflow_cffi.py index 981f2be9ef..aa762fc1af 100644 --- a/python/flexflow/core/flexflow_cffi.py +++ b/python/flexflow/core/flexflow_cffi.py @@ -39,6 +39,7 @@ from .flexflowlib import ffi, flexflow_library from typing import Union, 
List + def ffc(): if not flexflow_already_initialized(): raise RuntimeError("Cannot use FlexFlow library before initializing FlexFlow") @@ -1244,650 +1245,646 @@ def get_weights(self, ffmodel): # ----------------------------------------------------------------------- -# FFModel +# SGDOptimizer # ----------------------------------------------------------------------- -class FFModel(object): - """ """ +class SGDOptimizer(object): + __slots__ = ["handle", "_handle"] - __slots__ = [ - "handle", - "_handle", - "_layers", - "_nb_layers", - "_ffconfig", - "_tracing_id", - "initializers", - "attr_tensors", - ] + def __init__( + self, ffmodel, lr=0.01, momentum=0.0, nesterov=False, weight_decay=0.0 + ): + self.handle = ffc().flexflow_sgd_optimizer_create( + ffmodel.handle, lr, momentum, nesterov, weight_decay + ) + self._handle = ffi.gc(self.handle, ffc().flexflow_sgd_optimizer_destroy) - def __init__(self, ffconfig): - """Constructor of FFModel. + def set_learning_rate(self, learning_rate): + ffc().flexflow_sgd_optimizer_set_lr(self.handle, learning_rate) - :param ffconfig: configurations of FlexFlow and the created model. - :type ffconfig: FFConfig - :returns: FFModel -- the model. - """ - self.handle = ffc().flexflow_model_create(ffconfig.handle, ffconfig.cpu_offload) - self._handle = ffi.gc(self.handle, ffc().flexflow_model_destroy) - self._layers = dict() - self._nb_layers = 0 - self._ffconfig = ffconfig - global ff_tracing_id - self._tracing_id = ff_tracing_id - ff_tracing_id += 1 - self.initializers = {} - self.attr_tensors = {} +# ----------------------------------------------------------------------- +# AdamOptimizer +# ----------------------------------------------------------------------- - def get_layers(self): - return self._layers - def add_layer(self, op_type, name): - layer_id = self._nb_layers - op_handle = ffc().flexflow_model_get_last_layer(self.handle) - self._layers[self._nb_layers] = convert_op_handle_to_op( - op_type, op_handle, idx=layer_id, name=name +class AdamOptimizer(object): + __slots__ = ["handle", "_handle"] + + def __init__( + self, + ffmodel, + alpha=0.001, + beta1=0.9, + beta2=0.999, + weight_decay=0.0, + epsilon=1e-8, + ): + self.handle = ffc().flexflow_adam_optimizer_create( + ffmodel.handle, alpha, beta1, beta2, weight_decay, epsilon ) - self._nb_layers += 1 + self._handle = ffi.gc(self.handle, ffc().flexflow_adam_optimizer_destroy) - def create_tensor(self, dims, data_type, create_grad=True): - """Instantiate a FlexFlow tensor. + def set_learning_rate(self, learning_rate): + ffc().flexflow_adam_optimizer_set_lr(self.handle, learning_rate) - :param x: a shape tuple/list (integers), including the batch size. - :type x: list of int - :param data_type: the datatype of the created tensor. Options are - DT_FLOAT, DT_DOUBLE, DT_INT32, DT_INT64, DT_BOOLEAN. - :type data_type: DataType +# ----------------------------------------------------------------------- +# Initializer +# ----------------------------------------------------------------------- +class Initializer(object): + __slots__ = ["handle", "p_handle"] - :param create_grad: weather the tensor creates a gradients vector. - If you don't specify anything, a gradients vector is used. 
- :type create_grad: bool + def __init__(self, handle, p_handle=0): + self.p_handle = ffi.new("flexflow_initializer_t *") + if handle == None: + self.p_handle.impl = ffi.NULL + else: + self.p_handle.impl = handle.impl + self.handle = self.p_handle[0] + assert ffi.typeof(self.handle) == ffi.typeof( + "flexflow_initializer_t" + ), "Initializer handle is wrong" - :returns: Tensor -- the output tensor. - """ - c_dims = ffi.new("int[]", dims) - c_data_type = enum_to_int(DataType, data_type) - num_dims = len(dims) - handle = ffc().flexflow_tensor_create( - self.handle, num_dims, c_dims, c_data_type, create_grad - ) - return Tensor(handle) - def map_tensor(self, tensor, parallel_op=None): - op_handle = self.__get_op_handle(parallel_op) - ffc().flexflow_tensor_map(self.handle, tensor.handle, op_handle) +# ----------------------------------------------------------------------- +# GlorotUniform +# ----------------------------------------------------------------------- - def create_constant(self, dims, value, data_type): - c_dims = ffi.new("int[]", dims) - c_data_type = enum_to_int(DataType, data_type) - num_dims = len(dims) - handle = ffc().flexflow_constant_create( - self.handle, num_dims, c_dims, value, c_data_type - ) - return Tensor(handle) - def exp(self, x, name=None): - """Exponential activation function. +class GlorotUniformInitializer(Initializer): + __slots__ = ["glorot_handle", "_glorot_handle"] - :param x: the input Tensor. - :type x: Tensor + def __init__(self, seed): + self.glorot_handle = ffc().flexflow_glorot_uniform_initializer_create(seed) + self._glorot_handle = ffi.gc( + self.glorot_handle, ffc().flexflow_glorot_uniform_initializer_destroy + ) + super(GlorotUniformInitializer, self).__init__(self.glorot_handle) - :param name: the name of the layer. Default is None. - :type name: string - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_exp(self.handle, x.handle, c_name) - self.add_layer(OpType.EXP, name) - return Tensor(handle, owner_op_type=OpType.EXP) +# ----------------------------------------------------------------------- +# ZeroInitializer +# ----------------------------------------------------------------------- - def sin(self, x, name=None): - """Elementwise sine function. - :param x: the input Tensor. - :type x: Tensor +class ZeroInitializer(Initializer): + __slots__ = ["zero_handle", "_zero_handle"] - :param name: the name of the layer. Default is None. - :type name: string + def __init__(self): + self.zero_handle = ffc().flexflow_zero_initializer_create() + self._zero_handle = ffi.gc( + self.zero_handle, ffc().flexflow_zero_initializer_destroy + ) + super(ZeroInitializer, self).__init__(self.zero_handle) - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_sin(self.handle, x.handle, c_name) - self.add_layer(OpType.SIN, name) - return Tensor(handle, owner_op_type=OpType.SIN) - def cos(self, x, name=None): - """Elementwise cosine function. +# ----------------------------------------------------------------------- +# UniformInitializer +# ----------------------------------------------------------------------- - :param x: the input Tensor. - :type x: Tensor - :param name: the name of the layer. Default is None. - :type name: string +class UniformInitializer(Initializer): + __slots__ = ["uniform_handle", "_uniform_handle"] - :returns: Tensor -- the output tensor. 
- """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_cos(self.handle, x.handle, c_name) - self.add_layer(OpType.COS, name) - return Tensor(handle, owner_op_type=OpType.COS) + def __init__(self, seed, minv, maxv): + self.uniform_handle = ffc().flexflow_uniform_initializer_create( + seed, minv, maxv + ) + self._uniform_handle = ffi.gc( + self.uniform_handle, ffc().flexflow_uniform_initializer_destroy + ) + super(UniformInitializer, self).__init__(self.uniform_handle) - def add(self, x, y, inplace_a=False, name=None): - """Layer that adds two input Tensors, :attr:`output = x + y`. - :param x: the first input Tensor. - :type x: Tensor +# ----------------------------------------------------------------------- +# NormInitializer +# ----------------------------------------------------------------------- - :param y: the second input Tensor. - :type y: Tensor - :param name: the name of the layer. Default is None. - :type name: string +class NormInitializer(Initializer): + __slots__ = ["norm_handle", "_norm_handle"] - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_add( - self.handle, x.handle, y.handle, inplace_a, c_name + def __init__(self, seed, mean, stddev): + self.norm_handle = ffc().flexflow_norm_initializer_create(seed, mean, stddev) + self._norm_handle = ffi.gc( + self.norm_handle, ffc().flexflow_norm_initializer_destroy ) - self.add_layer(OpType.ADD, name) - return Tensor(handle, owner_op_type=OpType.ADD) + super(NormInitializer, self).__init__(self.norm_handle) - def subtract(self, x, y, inplace_a=False, name=None): - """Layer that subtracts two input Tensors, :attr:`output = x * y`. - :param x: the first input Tensor. - :type x: Tensor +# ----------------------------------------------------------------------- +# PerfMetrics +# ----------------------------------------------------------------------- - :param y: the second input Tensor. - :type y: Tensor - :param name: the name of the layer. Default is None. - :type name: string +class PerfMetrics(object): + __slots__ = ["handle", "_handle"] - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_subtract( - self.handle, x.handle, y.handle, inplace_a, c_name - ) - self.add_layer(OpType.SUBTRACT, name) - return Tensor(handle, owner_op_type=OpType.SUBTRACT) + def __init__(self, handle): + self.handle = handle + self._handle = ffi.gc(self.handle, ffc().flexflow_per_metrics_destroy) - def multiply(self, x, y, inplace_a=False, name=None): - """Layer that multiplies (element-wise) two input Tensors, :attr:`output = x * y`. + def get_accuracy(self): + return ffc().flexflow_per_metrics_get_accuracy(self.handle) - :param x: the first input Tensor. - :type x: Tensor - :param y: the second input Tensor. - :type y: Tensor +# ----------------------------------------------------------------------- +# NetConfig +# ----------------------------------------------------------------------- - :param name: the name of the layer. Default is None. - :type name: string - :returns: Tensor -- the output tensor. 
- """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_multiply( - self.handle, x.handle, y.handle, inplace_a, c_name - ) - self.add_layer(OpType.MULTIPLY, name) - return Tensor(handle, owner_op_type=OpType.MULTIPLY) +class NetConfig(object): + def __init__(self): + self.handle = ffc().flexflow_net_config_create() + self._handle = ffi.gc(self.handle, ffc().flexflow_net_config_destroy) + cpath = ffc().flexflow_net_config_get_dataset_path(self.handle) + self.dataset_path = ffi.string(cpath) - def divide(self, x, y, inplace_a=False, name=None): - """Layer that divides (element-wise) two input Tensors, :attr:`output = x / y`. - :param x: the first input Tensor. - :type x: Tensor +# ----------------------------------------------------------------------- +# DLRMConfig +# ----------------------------------------------------------------------- - :param y: the second input Tensor. - :type y: Tensor - :param name: the name of the layer. Default is None. - :type name: string +class DLRMConfig(object): + def __init__(self): + self.handle = ffc().flexflow_dlrm_config_create() + self._handle = ffi.gc(self.handle, ffc().flexflow_dlrm_config_destroy) - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_divide( - self.handle, x.handle, y.handle, inplace_a, c_name - ) - self.add_layer(OpType.DIVIDE, name) - return Tensor(handle, owner_op_type=OpType.DIVIDE) + cstr = ffc().flexflow_dlrm_config_get_dataset_path(self.handle) + self.dataset_path = ffi.string(cstr) - def max(self, x, y, inplace_a=False, name=None): - """Layer that computes the max (element-wise) two input Tensors, :attr:`output = max(x,y)`. + cstr = ffc().flexflow_dlrm_config_get_arch_interaction_op(self.handle) + self.arch_interaction_op = ffi.string(cstr) - :param x: the first input Tensor. - :type x: Tensor + self.sparse_feature_size = ffc().flexflow_dlrm_config_get_sparse_feature_size( + self.handle + ) + self.sigmoid_bot = ffc().flexflow_dlrm_config_get_sigmoid_bot(self.handle) + self.sigmoid_top = ffc().flexflow_dlrm_config_get_sigmoid_top(self.handle) + self.embedding_bag_size = ffc().flexflow_dlrm_config_get_embedding_bag_size( + self.handle + ) + self.loss_threshold = ffc().flexflow_dlrm_config_get_loss_threshold(self.handle) - :param y: the second input Tensor. - :type y: Tensor + mlp_bot_c = ffc().flexflow_dlrm_config_get_mlp_bot(self.handle) + self.mlp_bot = [] + for i in range(0, mlp_bot_c[0]): + self.mlp_bot.append(mlp_bot_c[i + 1]) - :param name: the name of the layer. Default is None. - :type name: string + mlp_top_c = ffc().flexflow_dlrm_config_get_mlp_top(self.handle) + self.mlp_top = [] + for i in range(0, mlp_top_c[0]): + self.mlp_top.append(mlp_top_c[i + 1]) - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_max( - self.handle, x.handle, y.handle, inplace_a, c_name - ) - self.add_layer(OpType.MAX, name) - return Tensor(handle, owner_op_type=OpType.MAX) + embedding_size_c = ffc().flexflow_dlrm_config_get_embedding_size(self.handle) + self.embedding_size = [] + for i in range(0, embedding_size_c[0]): + self.embedding_size.append(embedding_size_c[i + 1]) - def min(self, x, y, inplace_a=False, name=None): - """Layer that computes the min (element-wise) two input Tensors, :attr:`output = min(x,y)`. - :param x: the first input Tensor. 
- :type x: Tensor +# ----------------------------------------------------------------------- +# Single DataLoader +# ----------------------------------------------------------------------- - :param y: the second input Tensor. - :type y: Tensor - :param name: the name of the layer. Default is None. - :type name: string +class SingleDataLoader(object): + __slots__ = ["handle", "_handle"] - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_min( - self.handle, x.handle, y.handle, inplace_a, c_name + def __init__(self, ffmodel, input, full_input, num_samples, data_type): + assert type(ffmodel) is FFModel, "SingleDataLoader ffmodel is wrong" + assert type(input) is Tensor, "SingleDataLoader input is wrong" + if type(full_input) is Tensor: + self.init_from_tensor(ffmodel, input, full_input, num_samples, data_type) + else: + self.init_from_ptr(ffmodel, input, full_input, num_samples, data_type) + self._handle = ffi.gc(self.handle, ffc().flexflow_single_dataloader_destroy) + + def init_from_tensor(self, ffmodel, input, full_input, num_samples, data_type): + assert type(full_input) is Tensor, "SingleDataLoader full_input is wrong" + c_data_type = enum_to_int(DataType, data_type) + self.handle = ffc().flexflow_single_dataloader_create( + ffmodel.handle, input.handle, full_input.handle, num_samples, c_data_type ) - self.add_layer(OpType.MIN, name) - return Tensor(handle, owner_op_type=OpType.MIN) - def reduce_sum(self, input, axes, keepdims=False, name=None): - """Layer that computes the sum of the input Tensor along given axes. + def init_from_ptr(self, ffmodel, input, full_input, num_samples, data_type): + # assert type(full_input) is Tensor, "SingleDataLoader full_input is wrong" + c_data_type = enum_to_int(DataType, data_type) + self.handle = ffc().flexflow_single_dataloader_create2( + ffmodel.handle, input.handle, full_input, num_samples, c_data_type + ) - :param input: the input Tensor. - :type input: Tensor + @property + def num_samples(self): + return ffc().flexflow_single_dataloader_get_num_samples(self.handle) - :param axes: the axes along which reduction is applied - :type axes: List[int] + @num_samples.setter + def num_samples(self, samples): + ffc().flexflow_single_dataloader_set_num_samples(self.handle, samples) - :param name: the name of the layer. Default is None. - :type name: string + def next_batch(self, ffmodel): + """Ask the dataloder to load the next batch to the :attr:`batch_tensor`. - :returns: Tensor -- the output tensor. + :returns: None -- no returns. """ - c_name = get_c_name(name) - c_axes = ffi.new("int[]", axes) - handle = ffc().flexflow_model_add_reduce_sum( - self.handle, input.handle, c_axes, len(axes), keepdims, c_name - ) - self.add_layer(OpType.REDUCE_SUM, name) - return Tensor(handle, owner_op_type=OpType.REDUCE_SUM) + ffc().flowflow_single_dataloader_next_batch(self.handle, ffmodel.handle) - def rsqrt(self, input, name=None): - """Layer that computes the element-wise reciprocal square-root. + def reset(self): + """Reset the current position of the dataloder to 0. - :param input: the input Tensor. - :type input: Tensor + :returns: None -- no returns. + """ + ffc().flexflow_single_dataloader_reset(self.handle) - :param name: the name of the layer. Default is None. - :type name: string - :returns: Tensor -- the output tensor. 
- """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_rsqrt(self.handle, input.handle, c_name) - self.add_layer(OpType.RSQRT, name) - return Tensor(handle, owner_op_type=OpType.RSQRT) +class RegionNdarray(object): + __slots__ = ["__array_interface__"] - def pow(self, input, exponent, name=None): - """Layer that computes the element-wise power. + def __init__(self, shape, data_type, base_ptr, strides, read_only): + # See: https://docs.scipy.org/doc/numpy/reference/arrays.interface.html + if data_type == DataType.DT_HALF: + field_type = " requests; - int finetuning_req_idx = 0; + int finetuning_req_idx = 0; for (int i = 0; i < num_requests; i++) { if (request_types[i] == RequestType::REQ_INFERENCE) { std::string const text_str(input_texts[i]); Request inference_req; inference_req.prompt = text_str; inference_req.max_sequence_length = max_seq_lengths[i]; - if (peft_model_ids[i] != nullptr) { - PEFTModelID *peft_model_id = FFCObjectWrapper::unwrap(peft_model_ids[i]); + PEFTModelID *peft_model_id = FFCObjectWrapper::unwrap(peft_model_ids[i]); + if (peft_model_id != nullptr) { inference_req.peft_model_id = *peft_model_id; } requests.push_back(inference_req); @@ -1646,14 +1646,14 @@ void flexflow_model_generate(flexflow_model_t handle_, Request fine_tuning_req; fine_tuning_req.req_type = RequestType::REQ_FINETUNING; fine_tuning_req.max_sequence_length = max_seq_lengths[i]; - if (peft_model_ids[i] != nullptr) { - PEFTModelID *peft_model_id = FFCObjectWrapper::unwrap(peft_model_ids[i]); + PEFTModelID *peft_model_id = FFCObjectWrapper::unwrap(peft_model_ids[i]); + if (peft_model_id != nullptr) { fine_tuning_req.peft_model_id = *peft_model_id; } std::string const dataset_fp(dataset_filepaths[finetuning_req_idx]); fine_tuning_req.dataset_filepath = dataset_fp; fine_tuning_req.max_training_steps = training_steps[finetuning_req_idx]; - requests.push_back(finetuning_req_idx); + requests.push_back(fine_tuning_req); DEBUG_PRINT("[Model] generate[%d] %p %s %i %i", i, handle, @@ -1668,11 +1668,11 @@ void flexflow_model_generate(flexflow_model_t handle_, for (int i = 0; i < num_requests; i++) { if (request_types[i] == RequestType::REQ_INFERENCE) { - // If the prompt exceeds max seq len, check that we return the prompt with no - // additional token. Otherwise, check that the output does not exceed the max - // sequence length. - assert(results[i].output_tokens.size() <= max_seq_length || - results[i].output_tokens.size() == results[i].input_tokens.size()); + // If the prompt exceeds max seq len, check that we return the prompt with + // no additional token. Otherwise, check that the output does not exceed + // the max sequence length. 
+ assert(results[i].output_tokens.size() <= max_seq_lengths[i] || + results[i].output_tokens.size() == results[i].input_tokens.size()); output_length_and_tokens[i][0] = results[i].output_tokens.size(); std::copy(results[i].output_tokens.begin(), results[i].output_tokens.end(), diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 8fb040fb6d..0afde30c64 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -30,6 +31,7 @@ namespace FlexFlow { using namespace Legion; using tokenizers::Tokenizer; +using json = nlohmann::json; LegionRuntime::Logger::Category log_req_mgr("RequestManager"); @@ -242,7 +244,7 @@ RequestManager::RequestGuid request.guid = next_available_guid++; request.max_sequence_length = request_.max_sequence_length; request.peft_model_id = request_.peft_model_id; - request.req_type = Request::REQ_FINETUNING; + request.req_type = RequestType::REQ_FINETUNING; request.completed_training_steps = 0; request.max_training_steps = request_.max_training_steps; request.dataset_filepath = request_.dataset_filepath; @@ -385,7 +387,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, size_t guid = old_bc.requestsInfo[old_bc.tokensInfo[i].request_index].request_guid; Request &request = all_requests[guid]; - if (request.req_type == Request::REQ_FINETUNING) { + if (request.req_type == RequestType::REQ_FINETUNING) { // No new tokens generated when in fine-tuning mode continue; } else if (old_bc.tokensInfo[i].abs_depth_in_request + 1 < @@ -415,7 +417,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, assert(old_bc.requestsInfo[i].num_tokens_in_batch > 0); Request &request = all_requests[old_bc.requestsInfo[i].request_guid]; - if (request.req_type == Request::REQ_FINETUNING) { + if (request.req_type == RequestType::REQ_FINETUNING) { // fine-tuning requests don't automatically carry over to the next // batch, we only do so if there is space left after adding new // inference requests @@ -575,7 +577,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, if (!pending_infr_request_queue.empty() && new_bc.num_tokens < get_max_tokens_per_batch()) { Request new_request = pending_infr_request_queue.front(); - assert(new_request.req_type == Request::REQ_INFERENCE); + assert(new_request.req_type == RequestType::REQ_INFERENCE); pending_infr_request_queue.pop(); // all_requests[new_request.guid] = new_request; @@ -617,9 +619,9 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, // Step 4: add PEFT bwd requests, if there is additional space while (pending_peft_request_queue.size() > 0) { Request &request = pending_peft_request_queue.front(); - assert(request.req_type = Request::REQ_FINETUNING); + assert(request.req_type = RequestType::REQ_FINETUNING); Request &all_req_handle = all_requests[request.guid]; - assert(all_req_handle.req_type = Request::REQ_FINETUNING); + assert(all_req_handle.req_type = RequestType::REQ_FINETUNING); if (all_req_handle.status == Request::COMPLETED) { pending_peft_request_queue.pop(); } else { @@ -628,11 +630,11 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, } if (pending_peft_request_queue.size() > 0) { Request &request = pending_peft_request_queue.front(); - assert(request.req_type = Request::REQ_FINETUNING); + assert(request.req_type = RequestType::REQ_FINETUNING); assert(request.dataset.size() > 0); // update status and 
training steps Request &all_req_handle = all_requests[request.guid]; - assert(all_req_handle.req_type = Request::REQ_FINETUNING); + assert(all_req_handle.req_type = RequestType::REQ_FINETUNING); request.completed_training_steps = all_req_handle.completed_training_steps; request.status = all_req_handle.status; assert(request.status != Request::COMPLETED); @@ -2424,7 +2426,7 @@ std::vector std::vector guids; for (int i = 0; i < requests.size(); i++) { RequestManager::RequestGuid guid; - if (requests.at(i).req_type == Request::REQ_INFERENCE) { + if (requests.at(i).req_type == RequestType::REQ_INFERENCE) { guid = rm->register_new_request(requests.at(i)); } else { guid = rm->register_new_peft_request(requests.at(i)); diff --git a/tests/peft_test.sh b/tests/peft_test.sh index 778b225a26..9b4a5204ac 100755 --- a/tests/peft_test.sh +++ b/tests/peft_test.sh @@ -25,4 +25,8 @@ export LEGION_BACKTRACE=1 python ../inference/utils/download_peft_model.py goliaro/llama-160m-lora-full --base_model_name JackFram/llama-160m # if first time, add: --refresh-cache +# CPP test ../build/inference/peft/peft -ll:gpu 1 -ll:cpu 4 -ll:fsize 8192 -ll:zsize 12000 -ll:util 4 -llm-model JackFram/llama-160m -prompt ../inference/prompt/peft.json -peft-model goliaro/llama-160m-lora-full --use-full-precision --inference-debugging --fusion -enable-peft + +# Python test +python ../inference/python/ff_peft.py From c3e62d004dd2ca7d6ef71b64ac2cba1aa4f23539 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 25 Mar 2024 19:23:11 +0000 Subject: [PATCH 29/32] fix --- python/flexflow/serve/serve.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py index 0960dcaf90..63582038ac 100644 --- a/python/flexflow/serve/serve.py +++ b/python/flexflow/serve/serve.py @@ -413,7 +413,7 @@ def compile( # Add PEFT layer if registered for _, peft_dict in self.pefts.items(): ff_peft_config = peft_dict["ff_peft_config"] - ff_peft_model_id = self.model.add_lora_layer(ff_peft_config) + ff_peft_model_id = self.model.ffmodel.add_lora_layer(ff_peft_config) peft_dict["ff_peft_model_id"] = ff_peft_model_id # Download the weights from huggingface (if needed) From ce9803abacb050fcb1cf0fb360543dbc04cd96a5 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Tue, 26 Mar 2024 19:16:42 +0000 Subject: [PATCH 30/32] fixes --- include/flexflow/flexflow_c.h | 3 +++ inference/python/ff_peft.py | 12 +++++---- python/flexflow/core/flexflow_cffi.py | 13 ++++++--- python/flexflow/serve/serve.py | 39 ++++++++++++++++++--------- 4 files changed, 46 insertions(+), 21 deletions(-) diff --git a/include/flexflow/flexflow_c.h b/include/flexflow/flexflow_c.h index 8150e05dd1..004523e875 100644 --- a/include/flexflow/flexflow_c.h +++ b/include/flexflow/flexflow_c.h @@ -595,6 +595,9 @@ flexflow_tensor_t flexflow_model_add_argmax(flexflow_model_t handle_, bool beam_search, char const *name); +flexflow_peft_model_id_t flexflow_model_add_lora_layer( + flexflow_model_t handle_, const flexflow_lora_linear_config_t peft_config_); + void flexflow_model_set_sgd_optimizer(flexflow_model_t handle, flexflow_sgd_optimizer_t optimizer); diff --git a/inference/python/ff_peft.py b/inference/python/ff_peft.py index 18ef8bbf33..b5242945b6 100644 --- a/inference/python/ff_peft.py +++ b/inference/python/ff_peft.py @@ -54,7 +54,7 @@ def get_configs(): "offload_reserve_space_size": 8 * 1024, # 8GB "use_4bit_quantization": False, "use_8bit_quantization": False, - "enable_peft": False, + "enable_peft": True, 
"peft_activation_reserve_space_size": 1024, # 1GB "peft_weight_reserve_space_size": 1024, # 1GB "profiling": False, @@ -121,17 +121,19 @@ def main(): if len(configs.prompt) > 0: prompts = [s for s in json.load(open(configs.prompt))] inference_requests = [ - Request(RequestType.REQ_INFERENCE, prompt=prompt, max_sequence_length=128) + ff.Request( + ff.RequestType.REQ_INFERENCE, prompt=prompt, max_sequence_length=128 + ) for prompt in prompts ] requests += inference_requests # Finetuning if len(configs.finetuning_dataset) > 0: for peft_model_id in configs.peft_model_ids: - finetuning_request = Request( - RequestType.REQ_FINETUNING, + finetuning_request = ff.Request( + ff.RequestType.REQ_FINETUNING, max_sequence_length=128, - peft_model_id=peft_model_id, + peft_model_id=llm.get_ff_peft_id(peft_model_id), dataset_filepath=configs.finetuning_dataset, ) requests.append(finetuning_request) diff --git a/python/flexflow/core/flexflow_cffi.py b/python/flexflow/core/flexflow_cffi.py index aa762fc1af..82c3eb059c 100644 --- a/python/flexflow/core/flexflow_cffi.py +++ b/python/flexflow/core/flexflow_cffi.py @@ -1781,10 +1781,10 @@ def __init__( self, req_type: RequestType, prompt: str = None, - max_sequence_length: int = None, + max_sequence_length: int = 128, peft_model_id: PEFTModelID = None, dataset_filepath: str = None, - max_training_steps: int = None, + max_training_steps: int = 1, ): self.req_type = req_type self.prompt = prompt @@ -4013,6 +4013,11 @@ def argmax(self, input, beam_search, name=None): self.add_layer(OpType.ARGMAX, name) return Tensor(handle, owner_op_type=OpType.ARGMAX) + def add_lora_layer(self, peft_config): + handle = ffc().flexflow_model_add_lora_layer(self.handle, peft_config.handle) + return handle + # self.add_layer(OpType.LORA, name) + def reset_metrics(self): """Reset performance metrics. @@ -4442,7 +4447,9 @@ def generate(self, requests_list: List[Request]): request.max_sequence_length for request in requests_list ] peft_model_ids = [request.peft_model_id for request in requests_list] - dataset_filepaths = [request.dataset_filepath for request in requests_list] + dataset_filepaths = [ + get_c_name(request.dataset_filepath) for request in requests_list + ] training_steps = [request.max_training_steps for request in requests_list] ffc().flexflow_model_generate( self.handle, diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py index 63582038ac..a9efee341f 100644 --- a/python/flexflow/serve/serve.py +++ b/python/flexflow/serve/serve.py @@ -133,6 +133,18 @@ def add_peft(self, peft_model_id: str): } self.pefts[peft_model_id] = peft_dict + def get_ff_peft_id(self, peft_model_id: str) -> PEFTModelID: + if peft_model_id not in self.pefts: + raise ValueError( + f"PEFT {peft_model_id} not registered with LLM {self.model_name}" + ) + peft_dict = self.pefts[peft_model_id] + if "ff_peft_model_id" not in peft_dict: + raise RuntimeError( + f"Attempting to run PEFT {peft_model_id} before compiling LLM {self.model_name}" + ) + return peft_dict["ff_peft_model_id"] + def download_hf_config(self): """Save the HuggingFace model configs to a json file. 
Useful mainly to run the C++ inference code.""" config_dir = os.path.join( @@ -224,10 +236,9 @@ def get_hf_llm(model_name): ) def download_llm_weights(): - weights_path = get_weights_path(self.model_name) refresh_cache_if_needed(self.model_name) ff_revision, ff_revision_file, latest_revision = self.__get_revision_hashes( - self.model_name, weights_path + self.model_name, self.weights_path ) if ff_revision != latest_revision: print( @@ -235,7 +246,7 @@ def download_llm_weights(): ) hf_model = get_hf_llm(self.model_name) # Convert the model to FlexFlow format - self.model_class.convert_hf_model(hf_model, weights_path) + self.model_class.convert_hf_model(hf_model, self.weights_path) # Save new revision hash to file with open(ff_revision_file, "w+") as f: f.write(latest_revision) @@ -257,7 +268,7 @@ def convert_peft_model(hf_peft_model, peft_type, weights_path): def download_peft_weights(): for peft_model_id, peft_dict in self.pefts.items(): peft_config = peft_dict["peft_config"] - peft_type = peft_config["peft_type"] + peft_type = peft_dict["peft_type"] weights_path = get_weights_path(peft_model_id) refresh_cache_if_needed(peft_model_id) @@ -285,6 +296,7 @@ def download_peft_weights(): gc.collect() torch.cuda.empty_cache() + self.weights_path = get_weights_path(self.model_name) download_llm_weights() download_peft_weights() @@ -295,24 +307,24 @@ def download_hf_tokenizer_if_needed(self): print("Loading tokenizer...") # Use local cache, or download new version - tokenizer_path = os.path.join( + self.tokenizer_path = os.path.join( os.path.expanduser(self.cache_path), "tokenizers", self.model_name.lower(), ) if self.refresh_cache: print( - f"Refreshing cached tokenizer for model {self.model_name} at path {tokenizer_path} ..." + f"Refreshing cached tokenizer for model {self.model_name} at path {self.tokenizer_path} ..." 
) - if os.path.exists(tokenizer_path): - shutil.rmtree(tokenizer_path) - if not os.path.exists(tokenizer_path): - print(f"Creating directory {tokenizer_path} (if it doesn't exist)...") - os.makedirs(tokenizer_path, exist_ok=True) + if os.path.exists(self.tokenizer_path): + shutil.rmtree(self.tokenizer_path) + if not os.path.exists(self.tokenizer_path): + print(f"Creating directory {self.tokenizer_path} (if it doesn't exist)...") + os.makedirs(self.tokenizer_path, exist_ok=True) # Get local revision SHA, check if it matches latest one on huggingface ff_revision, ff_revision_file, latest_revision = self.__get_revision_hashes( - self.model_name, tokenizer_path + self.model_name, self.tokenizer_path ) if ff_revision != latest_revision: @@ -327,7 +339,7 @@ def download_hf_tokenizer_if_needed(self): else: hf_tokenizer = AutoTokenizer.from_pretrained(self.model_name) # Save tokenizer - hf_tokenizer.save_pretrained(tokenizer_path) + hf_tokenizer.save_pretrained(self.tokenizer_path) print("Done updating HF tokenizer.") # Save new revision hash to file with open(ff_revision_file, "w+") as f: @@ -490,6 +502,7 @@ def generate( requests_or_prompts, max_length ) else: + print(requests_or_prompts) return self.model.ffmodel.generate(requests_or_prompts) else: assert False, "Please pass a non-empty string or list of strings" From b841789b3152961ff0bd6bfc7657720982664579 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Tue, 26 Mar 2024 22:10:59 +0000 Subject: [PATCH 31/32] fix --- python/flexflow/serve/serve.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py index a9efee341f..538abe3858 100644 --- a/python/flexflow/serve/serve.py +++ b/python/flexflow/serve/serve.py @@ -123,13 +123,9 @@ def add_peft(self, peft_model_id: str): raise RuntimeError( f"Attempting to add PEFT with base model name {peft_config.base_model_name_or_path} to LLM {self.model_name}" ) - ff_peft_config = LoraLinearConfig( - os.path.expanduser(self.cache_path), peft_model_id - ) peft_dict = { "peft_config": peft_config, "peft_type": peft_type, - "ff_peft_config": ff_peft_config, } self.pefts[peft_model_id] = peft_dict @@ -158,12 +154,14 @@ def download_hf_config(self): # Save PEFT configs if the LLM has any registered PEFTs for peft_model_id, peft_dict in self.pefts.items(): - peft_config = peft_dict["hf_config"] - peft_config_path = os.path.join( + peft_config = peft_dict["peft_config"] + peft_config_dir = os.path.join( os.path.expanduser(self.cache_path), "configs", - self.peft_model_id.lower(), + peft_model_id.lower(), ) + os.makedirs(peft_config_dir, exist_ok=True) + peft_config_path = os.path.join(peft_config_dir, "config.json") print(f"Saving {peft_model_id} configs to file {peft_config_path}...") with open(peft_config_path, "w") as json_file: @@ -423,8 +421,11 @@ def compile( ) # Add PEFT layer if registered - for _, peft_dict in self.pefts.items(): - ff_peft_config = peft_dict["ff_peft_config"] + for peft_model_id, peft_dict in self.pefts.items(): + # ff_peft_config = peft_dict["ff_peft_config"] + ff_peft_config = LoraLinearConfig( + os.path.expanduser(self.cache_path), peft_model_id + ) ff_peft_model_id = self.model.ffmodel.add_lora_layer(ff_peft_config) peft_dict["ff_peft_model_id"] = ff_peft_model_id From c31f6b131e66229dd45ed3d563585579a3093c81 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 27 Mar 2024 17:10:19 +0000 Subject: [PATCH 32/32] fixes --- include/flexflow/request_manager.h | 1 + 
inference/python/ff_peft.py | 10 +++--- python/flexflow/serve/models/falcon.py | 3 +- python/flexflow/serve/models/llama.py | 3 +- python/flexflow/serve/models/mpt.py | 3 +- python/flexflow/serve/models/opt.py | 3 +- python/flexflow/serve/models/starcoder.py | 3 +- src/runtime/request_manager.cc | 43 +++++++++++++++++++++++ 8 files changed, 59 insertions(+), 10 deletions(-) diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index 0ef5efcf27..bf6e475cbb 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -84,6 +84,7 @@ struct Request { std::vector, std::vector>> dataset; + friend std::ostream &operator<<(std::ostream &os, Request const &req); }; // store the result of beam search diff --git a/inference/python/ff_peft.py b/inference/python/ff_peft.py index b5242945b6..38a25fb614 100644 --- a/inference/python/ff_peft.py +++ b/inference/python/ff_peft.py @@ -41,15 +41,15 @@ def get_configs(): # Define sample configs ff_init_configs = { # required parameters - "num_gpus": 2, - "memory_per_gpu": 14000, - "zero_copy_memory_per_node": 40000, + "num_gpus": 1, + "memory_per_gpu": 8192, + "zero_copy_memory_per_node": 12000, # optional parameters "num_cpus": 4, "legion_utility_processors": 4, "data_parallelism_degree": 1, "tensor_parallelism_degree": 1, - "pipeline_parallelism_degree": 2, + "pipeline_parallelism_degree": 1, "offload": False, "offload_reserve_space_size": 8 * 1024, # 8GB "use_4bit_quantization": False, @@ -58,7 +58,7 @@ def get_configs(): "peft_activation_reserve_space_size": 1024, # 1GB "peft_weight_reserve_space_size": 1024, # 1GB "profiling": False, - "inference_debugging": False, + "inference_debugging": True, "fusion": True, } model_configs = { diff --git a/python/flexflow/serve/models/falcon.py b/python/flexflow/serve/models/falcon.py index e4d7786262..0176a1dda1 100644 --- a/python/flexflow/serve/models/falcon.py +++ b/python/flexflow/serve/models/falcon.py @@ -233,7 +233,8 @@ def build_model(self, max_tokens_per_batch): output = ffmodel.sampling(softmax, self.generation_config.topp) else: # output = ffmodel.arg_top_k(lm_head, 1, False) - output = ffmodel.argmax(lm_head, False) + softmax = ffmodel.softmax(lm_head, -1) + output = ffmodel.argmax(softmax, False) self.ffmodel = ffmodel diff --git a/python/flexflow/serve/models/llama.py b/python/flexflow/serve/models/llama.py index 6aef540342..947878f706 100644 --- a/python/flexflow/serve/models/llama.py +++ b/python/flexflow/serve/models/llama.py @@ -241,7 +241,8 @@ def build_model(self, max_tokens_per_batch): output = ffmodel.sampling(softmax, self.generation_config.topp) else: # output = ffmodel.arg_top_k(dense, 1, False) - output = ffmodel.argmax(dense, False) + softmax = ffmodel.softmax(dense, -1) + output = ffmodel.argmax(softmax, False) self.ffmodel = ffmodel diff --git a/python/flexflow/serve/models/mpt.py b/python/flexflow/serve/models/mpt.py index 76f7d69c73..1d1837c478 100644 --- a/python/flexflow/serve/models/mpt.py +++ b/python/flexflow/serve/models/mpt.py @@ -244,7 +244,8 @@ def build_model(self, max_tokens_per_batch): softmax = ffmodel.softmax(dense, -1) output = ffmodel.sampling(softmax, self.generation_config.topp) else: - output = ffmodel.argmax(lm_head, False) + softmax = ffmodel.softmax(lm_head, -1) + output = ffmodel.argmax(softmax, False) self.ffmodel = ffmodel diff --git a/python/flexflow/serve/models/opt.py b/python/flexflow/serve/models/opt.py index f725a08e65..cde25f2241 100644 --- a/python/flexflow/serve/models/opt.py +++ 
b/python/flexflow/serve/models/opt.py @@ -273,7 +273,8 @@ def build_model(self, max_tokens_per_batch): output = ffmodel.sampling(softmax, self.generation_config.topp) else: # output = ffmodel.arg_top_k(lm_head, 1, False) - output = ffmodel.argmax(lm_head, False) + softmax = ffmodel.softmax(lm_head, -1) + output = ffmodel.argmax(softmax, False) self.ffmodel = ffmodel diff --git a/python/flexflow/serve/models/starcoder.py b/python/flexflow/serve/models/starcoder.py index 8ed8fcfa18..80b4be10bb 100644 --- a/python/flexflow/serve/models/starcoder.py +++ b/python/flexflow/serve/models/starcoder.py @@ -212,7 +212,8 @@ def build_model(self, max_tokens_per_batch): softmax = ffmodel.softmax(dense, -1) output = ffmodel.sampling(softmax, self.generation_config.topp) else: - output = ffmodel.argmax(lm_head, False) + softmax = ffmodel.softmax(lm_head, -1) + output = ffmodel.argmax(softmax, False) self.ffmodel = ffmodel diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 0afde30c64..9dc0361316 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -47,6 +47,48 @@ std::string LoadBytesFromFile(std::string const &path) { return data; } +std::ostream &operator<<(std::ostream &os, Request const &req) { + os << "Request {\n"; + os << " guid: " << req.guid << "\n"; + os << " peft_model_id: " << req.peft_model_id << "\n"; + os << " max_sequence_length: " << req.max_sequence_length << "\n"; + os << " initial_len: " << req.initial_len << "\n"; + os << " ssm_cache_size: " << req.ssm_cache_size << "\n"; + os << " llm_cache_size: " << req.llm_cache_size << "\n"; + os << " status: " << static_cast(req.status) << "\n"; + os << " tokens: ["; + for (auto const &token : req.tokens) { + os << token << " "; + } + os << "]\n"; + os << " prompt: " << req.prompt << "\n"; + // os << " beam_trees: ["; + // for (const auto& tree : req.beam_trees) { + // // Assuming BeamTree has its own << operator defined + // os << tree << " "; + // } + // os << "]\n"; + os << " req_type: " << static_cast(req.req_type) << "\n"; + os << " completed_training_steps: " << req.completed_training_steps << "\n"; + os << " max_training_steps: " << req.max_training_steps << "\n"; + os << " dataset_filepath: " << req.dataset_filepath << "\n"; + os << " dataset: ["; + for (auto const &pair : req.dataset) { + os << "["; + for (auto const &token : pair.first) { + os << token << " "; + } + os << "], ["; + for (auto const &token : pair.second) { + os << token << " "; + } + os << "] "; + } + os << "]\n"; + os << "}\n"; + return os; +} + RequestManager::RequestManager() : request_manager_status(INITIALIZED), verbose(false), next_available_guid(1000000), num_processed_requests(0), @@ -242,6 +284,7 @@ RequestManager::RequestGuid Request request; request.status = Request::PENDING; request.guid = next_available_guid++; + request.initial_len = 0; request.max_sequence_length = request_.max_sequence_length; request.peft_model_id = request_.peft_model_id; request.req_type = RequestType::REQ_FINETUNING;
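
Note on the request API exercised by the ff_peft.py changes above: inference and finetuning now go through the same generate() call, each ff.Request carries its request type, and a finetuning request references the compiled adapter handle returned by LLM.get_ff_peft_id() rather than the HuggingFace adapter name. The sketch below is illustrative only: it assumes an llm object that was already created, had add_peft() called, and was compiled as in inference/python/ff_peft.py, and the file paths and adapter name are placeholders.

import json
import flexflow.serve as ff

def build_mixed_requests(llm, prompt_file, dataset_file, adapter_name):
    # The prompt file is a JSON list of strings (as loaded in ff_peft.py above).
    prompts = [s for s in json.load(open(prompt_file))]
    requests = [
        ff.Request(ff.RequestType.REQ_INFERENCE, prompt=p, max_sequence_length=128)
        for p in prompts
    ]
    # Finetuning requests pass the compiled PEFT handle; get_ff_peft_id()
    # raises if the adapter was never registered or the LLM is not compiled yet.
    requests.append(
        ff.Request(
            ff.RequestType.REQ_FINETUNING,
            max_sequence_length=128,
            peft_model_id=llm.get_ff_peft_id(adapter_name),
            dataset_filepath=dataset_file,
        )
    )
    return requests

# Example usage (paths and adapter name are placeholders):
# llm.generate(build_mixed_requests(llm, "prompt.json", "dataset.json",
#                                   "goliaro/llama-160m-lora-full"))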
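The download_llm_weights / download_hf_tokenizer_if_needed refactor in serve.py keeps reusing one pattern: record the HuggingFace revision hash the cached artifacts were built from, and re-download and re-convert only when it no longer matches the latest upstream revision. A generic sketch of that pattern follows; fetch_latest_revision and rebuild_cache are hypothetical stand-ins for the hub query and the FlexFlow conversion step, and the revision file name is made up, since __get_revision_hashes itself is not part of these patches.

import os

def refresh_if_stale(cache_dir, fetch_latest_revision, rebuild_cache):
    # Hypothetical helpers: fetch_latest_revision() -> str queries the hub,
    # rebuild_cache(cache_dir) re-downloads and converts the artifacts.
    os.makedirs(cache_dir, exist_ok=True)
    revision_file = os.path.join(cache_dir, "rev_sha.txt")  # name is illustrative
    cached = None
    if os.path.isfile(revision_file):
        with open(revision_file) as f:
            cached = f.read().strip()
    latest = fetch_latest_revision()
    if cached != latest:
        rebuild_cache(cache_dir)
        # Record the new revision, mirroring the "w+" write in serve.py.
        with open(revision_file, "w+") as f:
            f.write(latest)
    return latest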
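For reference, the cache locations touched by these serve.py changes: PEFT configs now land in their own per-adapter directory under configs/, and the tokenizer path is kept on the LLM object instead of a local variable. The helper below only recomputes those locations; the weights sub-directory name is an assumption, since get_weights_path() is not shown in this series.

import os

def cache_layout(cache_path, model_name, peft_model_id):
    root = os.path.expanduser(cache_path)
    return {
        # written by download_hf_config() for each registered adapter
        "peft_config": os.path.join(root, "configs", peft_model_id.lower(), "config.json"),
        # written by download_hf_tokenizer_if_needed()
        "tokenizer": os.path.join(root, "tokenizers", model_name.lower()),
        # assumed layout for get_weights_path(); not shown in these patches
        "weights": os.path.join(root, "weights", model_name.lower()),
    }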