From b205d4dd989e46df93cd3ce65aceabdeeceda805 Mon Sep 17 00:00:00 2001 From: april-yyt Date: Tue, 13 Feb 2024 12:36:22 +0000 Subject: [PATCH 01/55] upload hf model --- inference/utils/download_upload_hf.py | 49 +++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) create mode 100644 inference/utils/download_upload_hf.py diff --git a/inference/utils/download_upload_hf.py b/inference/utils/download_upload_hf.py new file mode 100644 index 0000000000..db0b131b8e --- /dev/null +++ b/inference/utils/download_upload_hf.py @@ -0,0 +1,49 @@ +#!/usr/bin/env python +import argparse +from huggingface_hub import HfApi, HfFolder +import flexflow.serve as ff + +def parse_args(): + parser = argparse.ArgumentParser(description="Download a model with FlexFlow, process it, and upload it to the Hugging Face Hub.") + parser.add_argument("model_name", type=str, help="Original Hugging Face model ID to download and process (e.g., 'facebook/opt-125m').") + parser.add_argument("--new-model-id", type=str, required=True, help="New Hugging Face Hub model ID for upload (e.g., 'your_username/new-model-name').") + parser.add_argument("--cache-folder", type=str, default="./model_cache", help="Folder to use to store and process the model(s) assets in FlexFlow format.") + parser.add_argument("--private", action="store_true", help="Whether to upload the processed model as a private model on Hugging Face Hub.") + parser.add_argument("--refresh-cache", action="store_true", help="Use this flag to force the refresh of the model(s) weights/tokenizer cache.") + parser.add_argument("--full-precision", action="store_true", help="Download the full precision version of the weights.") + return parser.parse_args() + +def download_and_process_model(model_name, cache_folder, refresh_cache, full_precision): + data_type = ff.DataType.DT_FLOAT if full_precision else ff.DataType.DT_HALF + print(f"Downloading and processing model: {model_name}") + llm = ff.LLM( + model_name=model_name, + data_type=data_type, + cache_path=cache_folder, + refresh_cache=refresh_cache, + ) + llm.download_hf_weights_if_needed() + llm.download_hf_tokenizer_if_needed() + llm.download_hf_config() + # any necessary conversion or processing by FlexFlow happens here + +def upload_processed_model_to_hub(new_model_id, cache_folder, private): + print(f"Uploading processed model to Hugging Face Hub: {new_model_id}") + api = HfApi() + if not HfFolder.get_token(): + print("Hugging Face token not found. 
Please login using `huggingface-cli login`.") + return + api.create_repo(repo_id=new_model_id, private=private, exist_ok=True) + api.upload_folder(folder_path=cache_folder, repo_id=new_model_id) + print("Upload completed successfully.") + +def main(): + args = parse_args() + download_and_process_model(args.model_name, args.cache_folder, args.refresh_cache, args.full_precision) + upload_processed_model_to_hub(args.new_model_id, args.cache_folder, args.private) + +if __name__ == "__main__": + main() + + +# python download_upload_hf.py facebook/opt-125m --new-model-id username/modelname --cache-folder ./model_cache --private \ No newline at end of file From 88f9311453a71591ed8b9a2398261ea0d03dadeb Mon Sep 17 00:00:00 2001 From: april-yyt Date: Tue, 13 Feb 2024 12:50:42 +0000 Subject: [PATCH 02/55] upload peft model --- inference/utils/download_upload_peft.py | 45 +++++++++++++++++++++++++ 1 file changed, 45 insertions(+) create mode 100644 inference/utils/download_upload_peft.py diff --git a/inference/utils/download_upload_peft.py b/inference/utils/download_upload_peft.py new file mode 100644 index 0000000000..85d79f7f2a --- /dev/null +++ b/inference/utils/download_upload_peft.py @@ -0,0 +1,45 @@ +#!/usr/bin/env python +import argparse +from huggingface_hub import HfApi, HfFolder +import flexflow.serve as ff + +def parse_args(): + parser = argparse.ArgumentParser(description="Download a PEFT model with FlexFlow, process it, and upload it to the Hugging Face Hub.") + parser.add_argument("peft_model_id", type=str, help="Original Hugging Face PEFT model ID to download and process (e.g., 'username/peft-model').") + parser.add_argument("--new-model-id", type=str, required=True, help="New Hugging Face Hub model ID for upload (e.g., 'your_username/new-peft-model-name').") + parser.add_argument("--cache-folder", type=str, default="./peft_model_cache", help="Folder to use to store and process the PEFT model(s) assets in FlexFlow format.") + parser.add_argument("--private", action="store_true", help="Whether to upload the processed PEFT model as a private model on Hugging Face Hub.") + parser.add_argument("--refresh-cache", action="store_true", help="Use this flag to force the refresh of the PEFT model(s) weights/cache.") + parser.add_argument("--full-precision", action="store_true", help="Download the full precision version of the weights for the PEFT model.") + return parser.parse_args() + +def download_and_process_peft_model(peft_model_id, cache_folder, refresh_cache, full_precision): + data_type = ff.DataType.DT_FLOAT if full_precision else ff.DataType.DT_HALF + print(f"Downloading and processing PEFT model: {peft_model_id}") + peft = ff.PEFT( + peft_model_id=peft_model_id, + data_type=data_type, + cache_path=cache_folder, + refresh_cache=refresh_cache, + ) + peft.download_hf_weights_if_needed() + peft.download_hf_config() + # any necessary conversion or processing by FlexFlow happens here + +def upload_processed_peft_model_to_hub(new_model_id, cache_folder, private): + print(f"Uploading processed PEFT model to Hugging Face Hub: {new_model_id}") + api = HfApi() + if not HfFolder.get_token(): + print("Hugging Face token not found. 
Please login using `huggingface-cli login`.") + return + api.create_repo(repo_id=new_model_id, private=private, exist_ok=True) + api.upload_folder(folder_path=cache_folder, repo_id=new_model_id) + print("Upload completed successfully.") + +def main(): + args = parse_args() + download_and_process_peft_model(args.peft_model_id, args.cache_folder, args.refresh_cache, args.full_precision) + upload_processed_peft_model_to_hub(args.new_model_id, args.cache_folder, args.private) + +if __name__ == "__main__": + main() From 5459afa97b02b353134745a99e7c307bee9a2c66 Mon Sep 17 00:00:00 2001 From: april-yyt Date: Fri, 16 Feb 2024 12:14:39 +0000 Subject: [PATCH 03/55] refactor uploading hf --- inference/utils/download_upload_hf.py | 20 ++++----- python/flexflow/serve/serve.py | 58 +++++++++++++++++++++++++-- 2 files changed, 65 insertions(+), 13 deletions(-) diff --git a/inference/utils/download_upload_hf.py b/inference/utils/download_upload_hf.py index db0b131b8e..83541e7894 100644 --- a/inference/utils/download_upload_hf.py +++ b/inference/utils/download_upload_hf.py @@ -3,6 +3,7 @@ from huggingface_hub import HfApi, HfFolder import flexflow.serve as ff + def parse_args(): parser = argparse.ArgumentParser(description="Download a model with FlexFlow, process it, and upload it to the Hugging Face Hub.") parser.add_argument("model_name", type=str, help="Original Hugging Face model ID to download and process (e.g., 'facebook/opt-125m').") @@ -13,6 +14,7 @@ def parse_args(): parser.add_argument("--full-precision", action="store_true", help="Download the full precision version of the weights.") return parser.parse_args() + def download_and_process_model(model_name, cache_folder, refresh_cache, full_precision): data_type = ff.DataType.DT_FLOAT if full_precision else ff.DataType.DT_HALF print(f"Downloading and processing model: {model_name}") @@ -25,22 +27,20 @@ def download_and_process_model(model_name, cache_folder, refresh_cache, full_pre llm.download_hf_weights_if_needed() llm.download_hf_tokenizer_if_needed() llm.download_hf_config() - # any necessary conversion or processing by FlexFlow happens here + return llm + -def upload_processed_model_to_hub(new_model_id, cache_folder, private): +def upload_processed_model_to_hub(llm, new_model_id, cache_folder, private): print(f"Uploading processed model to Hugging Face Hub: {new_model_id}") - api = HfApi() - if not HfFolder.get_token(): - print("Hugging Face token not found. 
Please login using `huggingface-cli login`.") - return - api.create_repo(repo_id=new_model_id, private=private, exist_ok=True) - api.upload_folder(folder_path=cache_folder, repo_id=new_model_id) + llm.upload_hf_model(new_model_id, private=private) print("Upload completed successfully.") + def main(): args = parse_args() - download_and_process_model(args.model_name, args.cache_folder, args.refresh_cache, args.full_precision) - upload_processed_model_to_hub(args.new_model_id, args.cache_folder, args.private) + llm = download_and_process_model(args.model_name, args.cache_folder, args.refresh_cache, args.full_precision) + upload_processed_model_to_hub(llm, args.new_model_id, args.cache_folder, args.private) + if __name__ == "__main__": main() diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py index 3349809670..8714edb832 100644 --- a/python/flexflow/serve/serve.py +++ b/python/flexflow/serve/serve.py @@ -29,8 +29,8 @@ from flexflow.core import * from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, LlamaTokenizer from peft import PeftModel, PeftConfig -from huggingface_hub import HfApi -import torch, shutil, hashlib, json, gc +from huggingface_hub import HfApi, HfFolder, Repository +import torch, shutil, hashlib, json, gc, os from typing import Union, List @@ -136,7 +136,8 @@ def __init__( self.cache_path = cache_path if len(cache_path) > 0 else "~/.cache/flexflow" self.refresh_cache = refresh_cache self.output_file = output_file - + + def download_hf_config(self): """Save the HuggingFace model configs to a json file. Useful mainly to run the C++ inference code.""" self.config_dir = os.path.join( @@ -309,6 +310,23 @@ def __load_hf_weights(self): ) self.fileloader.load_weights(self.model.ffmodel, self.data_type) + + def upload_hf_model(self, new_model_id: str, private: bool = False): + """ + Uploads the model weights to the Hugging Face Hub. + + :param repo_id: The repository ID, including the organization/user and model name (e.g., "organization/model_name"). + :param private: Whether to upload the model as a private model. + """ + print(f"Uploading processed model to Hugging Face Hub: {new_model_id}") + if not HfFolder.get_token(): + print("Hugging Face token not found. Please login using `huggingface-cli login`.") + return + api = HfApi() + api.create_repo(repo_id=new_model_id, private=private, exist_ok=True) + api.upload_folder(folder_path=self.cache_path, repo_id=new_model_id) + print("Upload completed successfully.") + def compile( self, @@ -657,3 +675,37 @@ def download_hf_weights_if_needed(self): torch.cuda.empty_cache() else: print(f"Loading '{self.peft_model_id}' model weights from the cache...") + + def upload_model_to_hf(self, model_directory: str, model_id: str, private: bool = False): + """ + Uploads the model from the specified directory to the Hugging Face Hub. + + Args: + - model_directory (str): The directory where the model and its configuration are stored. + - model_id (str): The desired model ID on the Hugging Face Hub (e.g., "username/model_name"). + - private (bool): If True, the model will be uploaded as a private model. + """ + try: + # Check for Hugging Face CLI authentication + if not HfFolder.get_token(): + raise ValueError("Hugging Face token not found. Please log in using `huggingface-cli login`.") + + # Ensure the specified directory contains model files + if not os.listdir(model_directory): + raise FileNotFoundError(f"No files found in {model_directory}. 
Please check the path and try again.") + + # Create or get the repository + repo_url = HfApi().create_repo(name=model_id, private=private, exist_ok=True, use_auth_token=True) + print(f"Repository URL: {repo_url}") + + # Initialize the repository, add files, commit, and push + repo = Repository(local_dir=model_directory, clone_from=repo_url, use_auth_token=True) + repo.git_add() + repo.git_commit("Upload model to Hugging Face Hub") + repo.git_push() + + print(f"Model '{model_id}' successfully uploaded to the Hugging Face Hub.") + except Exception as e: + print(f"Failed to upload the model: {e}") + + \ No newline at end of file From 4b760ac82f4d708dd1547e4b7371c2129a7783a1 Mon Sep 17 00:00:00 2001 From: april-yyt Date: Fri, 16 Feb 2024 12:31:23 +0000 Subject: [PATCH 04/55] refactor uploading peft --- inference/utils/download_upload_hf.py | 2 +- inference/utils/download_upload_peft.py | 20 +++++----- python/flexflow/serve/serve.py | 50 ++++++++++++------------- 3 files changed, 33 insertions(+), 39 deletions(-) diff --git a/inference/utils/download_upload_hf.py b/inference/utils/download_upload_hf.py index 83541e7894..c400355c0a 100644 --- a/inference/utils/download_upload_hf.py +++ b/inference/utils/download_upload_hf.py @@ -32,7 +32,7 @@ def download_and_process_model(model_name, cache_folder, refresh_cache, full_pre def upload_processed_model_to_hub(llm, new_model_id, cache_folder, private): print(f"Uploading processed model to Hugging Face Hub: {new_model_id}") - llm.upload_hf_model(new_model_id, private=private) + llm.upload_hf_model(new_model_id, cache_folder, private=private) print("Upload completed successfully.") diff --git a/inference/utils/download_upload_peft.py b/inference/utils/download_upload_peft.py index 85d79f7f2a..6e174eacf7 100644 --- a/inference/utils/download_upload_peft.py +++ b/inference/utils/download_upload_peft.py @@ -25,21 +25,19 @@ def download_and_process_peft_model(peft_model_id, cache_folder, refresh_cache, peft.download_hf_weights_if_needed() peft.download_hf_config() # any necessary conversion or processing by FlexFlow happens here - -def upload_processed_peft_model_to_hub(new_model_id, cache_folder, private): - print(f"Uploading processed PEFT model to Hugging Face Hub: {new_model_id}") - api = HfApi() - if not HfFolder.get_token(): - print("Hugging Face token not found. 
Please login using `huggingface-cli login`.") - return - api.create_repo(repo_id=new_model_id, private=private, exist_ok=True) - api.upload_folder(folder_path=cache_folder, repo_id=new_model_id) + return peft + + +def upload_peft_model_to_hub(peft, new_model_id, cache_folder, private): + print(f"Uploading peft model to HuggingFace Hub: {new_model_id}") + peft.upload_hf_model(new_model_id, cache_folder, private=private) print("Upload completed successfully.") + def main(): args = parse_args() - download_and_process_peft_model(args.peft_model_id, args.cache_folder, args.refresh_cache, args.full_precision) - upload_processed_peft_model_to_hub(args.new_model_id, args.cache_folder, args.private) + peft = download_and_process_peft_model(args.peft_model_id, args.cache_folder, args.refresh_cache, args.full_precision) + upload_peft_model_to_hub(peft, args.new_model_id, args.cache_folder, args.private) if __name__ == "__main__": main() diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py index 8714edb832..118943f04f 100644 --- a/python/flexflow/serve/serve.py +++ b/python/flexflow/serve/serve.py @@ -311,7 +311,7 @@ def __load_hf_weights(self): self.fileloader.load_weights(self.model.ffmodel, self.data_type) - def upload_hf_model(self, new_model_id: str, private: bool = False): + def upload_hf_model(self, new_model_id: str, model_path:str, private: bool = False): """ Uploads the model weights to the Hugging Face Hub. @@ -325,6 +325,7 @@ def upload_hf_model(self, new_model_id: str, private: bool = False): api = HfApi() api.create_repo(repo_id=new_model_id, private=private, exist_ok=True) api.upload_folder(folder_path=self.cache_path, repo_id=new_model_id) + # api.upload_folder(folder_path=model_path, repo_id=new_model_id) print("Upload completed successfully.") @@ -676,36 +677,31 @@ def download_hf_weights_if_needed(self): else: print(f"Loading '{self.peft_model_id}' model weights from the cache...") - def upload_model_to_hf(self, model_directory: str, model_id: str, private: bool = False): + def process_and_upload_hf_model(self, model_id: str, private: bool = False): """ - Uploads the model from the specified directory to the Hugging Face Hub. + Processes the PEFT model and uploads it to the Hugging Face Hub. Args: - - model_directory (str): The directory where the model and its configuration are stored. - model_id (str): The desired model ID on the Hugging Face Hub (e.g., "username/model_name"). - private (bool): If True, the model will be uploaded as a private model. """ - try: - # Check for Hugging Face CLI authentication - if not HfFolder.get_token(): - raise ValueError("Hugging Face token not found. Please log in using `huggingface-cli login`.") - - # Ensure the specified directory contains model files - if not os.listdir(model_directory): - raise FileNotFoundError(f"No files found in {model_directory}. 
Please check the path and try again.") - - # Create or get the repository - repo_url = HfApi().create_repo(name=model_id, private=private, exist_ok=True, use_auth_token=True) - print(f"Repository URL: {repo_url}") - - # Initialize the repository, add files, commit, and push - repo = Repository(local_dir=model_directory, clone_from=repo_url, use_auth_token=True) - repo.git_add() - repo.git_commit("Upload model to Hugging Face Hub") - repo.git_push() - - print(f"Model '{model_id}' successfully uploaded to the Hugging Face Hub.") - except Exception as e: - print(f"Failed to upload the model: {e}") + self.download_hf_weights_if_needed() + model_directory = self.weights_path + self.upload_model_to_hf(model_directory, model_id, private) + + def upload_hf_model(self, new_model_id: str, model_path:str, private: bool = False): + """ + Uploads the processed PEFT model to the Hugging Face Hub. - \ No newline at end of file + :param new_model_id: The new repository ID on Hugging Face Hub, including the organization/user and model name (e.g., "your_username/new-peft-model-name"). + :param private: Whether to upload the model as a private model on Hugging Face Hub. + """ + print(f"Uploading processed PEFT model to Hugging Face Hub: {new_model_id}") + if not HfFolder.get_token(): + print("Hugging Face token not found. Please login using `huggingface-cli login`.") + return + api = HfApi() + api.create_repo(repo_id=new_model_id, private=private, exist_ok=True) + api.upload_folder(folder_path=self.cache_path, repo_id=new_model_id) + # api.upload_folder(folder_path=model_path, repo_id=new_model_id) + print("Upload completed successfully.") From cdf24eb0b9fd12f84fb5be0290141d7233ff22ed Mon Sep 17 00:00:00 2001 From: april-yyt Date: Mon, 19 Feb 2024 13:55:57 +0000 Subject: [PATCH 05/55] modify upload logic and add reconvert functions for opt models --- python/flexflow/serve/models/opt.py | 58 +++++++++++++++++++++++++++++ python/flexflow/serve/serve.py | 34 ++++++++++++++--- 2 files changed, 87 insertions(+), 5 deletions(-) diff --git a/python/flexflow/serve/models/opt.py b/python/flexflow/serve/models/opt.py index 4b0b613cca..1fdc269b85 100644 --- a/python/flexflow/serve/models/opt.py +++ b/python/flexflow/serve/models/opt.py @@ -15,6 +15,8 @@ from flexflow.core import * from .base import FlexFlowModel import random, shutil +import re +import torch class OPTConfig: @@ -301,3 +303,59 @@ def convert_hf_model(model, dst_folder): os.path.join(dst_folder, "embed_tokens_weight"), os.path.join(dst_folder, "embed_tokens_weight_lm_head"), ) + + def convert_ff_weight_name(name): + # Reverse the previous conversion rules + converted_name = ( + name.replace("wq", "q_proj") + .replace("wk", "k_proj") + .replace("wv", "v_proj") + .replace("wo", "out_proj") + .replace("attention", "self_attn") + .replace("add_bias_residual_layer_norm_attn_bias", "attention_wo_bias") + .replace("_add_bias_residual_layer_norm", "_final_layer_norm") + .replace("_bias", ".bias") + .replace("_weight", ".weight") + .replace("_bias", ".bias") + ) + + converted_name = re.sub(r"layers_(\d+)_", r"layers.\1.", converted_name) + converted_name = re.sub(r"_(bias|weight)$", r".\1", converted_name) + converted_name = re.sub(r"self_attn_(?!layer_norm)", "self_attn.", converted_name) + + # Prepend "model.decoder." to the weight name + converted_name = "model.decoder." + converted_name + + return converted_name + + + def load_weights_into_hf_model(model, src_folder): + """ + Load weights from a specified folder and apply them to a Hugging Face model. 
+ + Parameters: + - model: The instance of the Hugging Face model to load the weights into. + - src_folder: The path to the folder containing the weight files. + """ + for file_name in os.listdir(src_folder): + weight_path = os.path.join(src_folder, file_name) + print("converting weight name: ", weight_path) + original_name = FlexFlowOPT.convert_ff_weight_name(file_name.replace('.bin', '')) + print("original name of the weights is: ", original_name) + + if not os.path.exists(weight_path): + raise FileNotFoundError(f"No weight file found for {file_name}") + + # weight_data = np.fromfile(weight_path, dtype=np.float32) + weight_data = np.fromfile(weight_path, dtype=np.float16).astype(np.float32) + if original_name not in model.state_dict(): + raise KeyError(f"Parameter {original_name} not found in model.") + param = model.state_dict()[original_name] + + if weight_data.size != param.numel(): + raise ValueError(f"Shape mismatch for {original_name}, model expects {param.numel()} elements, got {weight_data.size}") + + weight_tensor = torch.from_numpy(weight_data).reshape(param.shape) + with torch.no_grad(): + # Update the model's state dict directly since param.copy_ doesn't work on tensor slices or elements not in place + model.state_dict()[original_name].copy_(weight_tensor) \ No newline at end of file diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py index 118943f04f..c3a674e180 100644 --- a/python/flexflow/serve/serve.py +++ b/python/flexflow/serve/serve.py @@ -313,19 +313,43 @@ def __load_hf_weights(self): def upload_hf_model(self, new_model_id: str, model_path:str, private: bool = False): """ - Uploads the model weights to the Hugging Face Hub. + Uploads the model weights to the Hugging Face Hub, with reverse conversion of weights. - :param repo_id: The repository ID, including the organization/user and model name (e.g., "organization/model_name"). + :param new_model_id: The new model ID for the Hugging Face Hub. + :param model_path: The path where the FlexFlow weights are stored. :param private: Whether to upload the model as a private model. """ - print(f"Uploading processed model to Hugging Face Hub: {new_model_id}") + print(f"Preparing model for upload to Hugging Face Hub: {new_model_id}") + + # Initialize a new Hugging Face model instance + hf_model = AutoModelForCausalLM.from_config(self.hf_config) + weights_path = self.weights_path + + # Load FlexFlow weights into the Hugging Face model instance + try: + self.model_class.load_weights_into_hf_model(hf_model, weights_path) + except Exception as e: + print(f"Error loading weights into model: {e}") + return + + # Save the model with converted weights to a temporary directory + temp_dir = tempfile.mkdtemp() + hf_model.save_pretrained(temp_dir) + + # Ensure Hugging Face CLI is logged in if not HfFolder.get_token(): print("Hugging Face token not found. 
Please login using `huggingface-cli login`.") return + + # Upload the model api = HfApi() + print(f"Uploading processed model to Hugging Face Hub: {new_model_id}") api.create_repo(repo_id=new_model_id, private=private, exist_ok=True) - api.upload_folder(folder_path=self.cache_path, repo_id=new_model_id) - # api.upload_folder(folder_path=model_path, repo_id=new_model_id) + api.upload_folder(folder_path=temp_dir, repo_id=new_model_id) + + # Cleanup temporary directory + shutil.rmtree(temp_dir) + print("Upload completed successfully.") From 6101bbf2ab4779529b87c6b7ba87cc79906d93c9 Mon Sep 17 00:00:00 2001 From: april-yyt Date: Tue, 20 Feb 2024 00:21:00 +0000 Subject: [PATCH 06/55] fix opt weight name converting issues --- python/flexflow/serve/models/opt.py | 19 +++++++++++-------- python/flexflow/serve/serve.py | 8 ++++++++ 2 files changed, 19 insertions(+), 8 deletions(-) diff --git a/python/flexflow/serve/models/opt.py b/python/flexflow/serve/models/opt.py index 1fdc269b85..cce277c1ea 100644 --- a/python/flexflow/serve/models/opt.py +++ b/python/flexflow/serve/models/opt.py @@ -307,21 +307,20 @@ def convert_hf_model(model, dst_folder): def convert_ff_weight_name(name): # Reverse the previous conversion rules converted_name = ( - name.replace("wq", "q_proj") + name + .replace("add_bias_residual_layer_norm_attn_bias", "attention_wo_bias") + .replace("_add_bias_residual_layer_norm", "_final_layer_norm") + .replace("wq", "q_proj") .replace("wk", "k_proj") .replace("wv", "v_proj") .replace("wo", "out_proj") .replace("attention", "self_attn") - .replace("add_bias_residual_layer_norm_attn_bias", "attention_wo_bias") - .replace("_add_bias_residual_layer_norm", "_final_layer_norm") - .replace("_bias", ".bias") - .replace("_weight", ".weight") - .replace("_bias", ".bias") ) converted_name = re.sub(r"layers_(\d+)_", r"layers.\1.", converted_name) converted_name = re.sub(r"_(bias|weight)$", r".\1", converted_name) converted_name = re.sub(r"self_attn_(?!layer_norm)", "self_attn.", converted_name) + converted_name = converted_name.replace("embed_tokens_weight_lm_head", "embed_tokens.weight") # Prepend "model.decoder." to the weight name converted_name = "model.decoder." + converted_name @@ -340,8 +339,12 @@ def load_weights_into_hf_model(model, src_folder): for file_name in os.listdir(src_folder): weight_path = os.path.join(src_folder, file_name) print("converting weight name: ", weight_path) - original_name = FlexFlowOPT.convert_ff_weight_name(file_name.replace('.bin', '')) - print("original name of the weights is: ", original_name) + if weight_path.endswith("rev_sha.txt"): + print("skipping rev_sha.txt") + continue + else: + original_name = FlexFlowOPT.convert_ff_weight_name(file_name.replace('.bin', '')) + print("original name of the weights is: ", original_name) if not os.path.exists(weight_path): raise FileNotFoundError(f"No weight file found for {file_name}") diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py index c3a674e180..8971277415 100644 --- a/python/flexflow/serve/serve.py +++ b/python/flexflow/serve/serve.py @@ -32,6 +32,8 @@ from huggingface_hub import HfApi, HfFolder, Repository import torch, shutil, hashlib, json, gc, os from typing import Union, List +import tempfile + class GenerationConfig: @@ -320,6 +322,7 @@ def upload_hf_model(self, new_model_id: str, model_path:str, private: bool = Fal :param private: Whether to upload the model as a private model. 
""" print(f"Preparing model for upload to Hugging Face Hub: {new_model_id}") + print("tokenizer path is: ", self.tokenizer_path) # Initialize a new Hugging Face model instance hf_model = AutoModelForCausalLM.from_config(self.hf_config) @@ -336,6 +339,11 @@ def upload_hf_model(self, new_model_id: str, model_path:str, private: bool = Fal temp_dir = tempfile.mkdtemp() hf_model.save_pretrained(temp_dir) + # Copy the tokenizer files to the temporary directory + tokenizer_files = [f for f in os.listdir(self.tokenizer_path)] + for file_name in tokenizer_files: + shutil.copy(os.path.join(self.tokenizer_path, file_name), temp_dir) + # Ensure Hugging Face CLI is logged in if not HfFolder.get_token(): print("Hugging Face token not found. Please login using `huggingface-cli login`.") From 90f5d676f1dd3a227d79b612bded9ff067698ead Mon Sep 17 00:00:00 2001 From: april-yyt Date: Tue, 20 Feb 2024 13:12:24 +0000 Subject: [PATCH 07/55] implement revert functions for falcon models --- python/flexflow/serve/models/falcon.py | 75 ++++++++++++++++++++++++++ 1 file changed, 75 insertions(+) diff --git a/python/flexflow/serve/models/falcon.py b/python/flexflow/serve/models/falcon.py index eafce814e1..5f4e757182 100644 --- a/python/flexflow/serve/models/falcon.py +++ b/python/flexflow/serve/models/falcon.py @@ -276,3 +276,78 @@ def convert_hf_model(model, dst_folder): model.lm_head.weight.detach().cpu().numpy().tofile( os.path.join(dst_folder, "lm_head_weight") ) + + + def convert_ff_weight_name(name): + + converted_name = name + converted_name = converted_name.replace("mlp_dense_h_to_4h", "mlp.dense_h_to_4h") + converted_name = converted_name.replace("mlp_dense_4h_to_h", "mlp.dense_4h_to_h") + converted_name = converted_name.replace("attention_wo", "self_attention.dense") + + converted_name = re.sub(r"layers_(\d+)_", r"transformer.h.\1.", converted_name) + converted_name = re.sub(r"_(bias|weight)$", r".\1", converted_name) + + return converted_name + + + def load_weights_into_hf_model(model, src_folder): + """ + Load weights from a specified folder and apply them to a Hugging Face model. + + Parameters: + - model: The instance of the Hugging Face model to load the weights into. + - src_folder: The path to the folder containing the weight files. + - config: The configuration object for the model. 
+ """ + # Dictionary to hold the combined QKV weights + qkv_weights = {} + + for file_name in os.listdir(src_folder): + weight_path = os.path.join(src_folder, file_name) + print("converting weight file: ", weight_path) + original_name = FlexFlowFalcon.convert_ff_weight_name(file_name.replace('.bin', '')) + print("weight name after conversion: ", original_name) + + if not os.path.exists(weight_path): + raise FileNotFoundError(f"No weight file found for {file_name}") + + weight_data = np.fromfile(weight_path, dtype=np.float16).astype(np.float32) + + # Check if this is a Q, K, or V weight and combine them + if "attention_w" in original_name: + # Extract the type (Q, K, or V) and the layer number from the file name + qkv_type = re.search("(wq|wk|wv)", file_name).group(0) + layer_num = re.search("transformer.h.(\d+)", file_name).group(1) + + # Initialize the combined QKV weight if it doesn't exist + if layer_num not in qkv_weights: + qkv_weights[layer_num] = np.zeros((3 * model.config.hidden_size, model.config.hidden_size)) + + # Determine the position to place this weight in the combined QKV weight + type_index = {"wq": 0, "wk": 1, "wv": 2}[qkv_type] + qkv_weights[layer_num][type_index * model.config.hidden_size : (type_index + 1) * model.config.hidden_size] = weight_data + + elif original_name not in model.state_dict(): + raise KeyError(f"Parameter {original_name} not found in model.") + else: + param = model.state_dict()[original_name] + if weight_data.size != param.numel(): + raise ValueError(f"Shape mismatch for {original_name}, model expects {param.numel()} elements, got {weight_data.size}") + + weight_tensor = torch.from_numpy(weight_data).reshape(param.shape) + with torch.no_grad(): + param.copy_(weight_tensor) + + # assign the combined QKV weights to the model + for layer_num, combined_weight_data in qkv_weights.items(): + original_name = f"transformer.h.{layer_num}.self_attention.query_key_value.weight" + + if original_name not in model.state_dict(): + raise KeyError(f"Parameter {original_name} not found in model.") + + param = model.state_dict()[original_name] + combined_weight_tensor = torch.from_numpy(combined_weight_data).view(param.shape) + + with torch.no_grad(): + param.copy_(combined_weight_tensor) From 479190725313d0b4c28ca6738509e6c37ee0c70d Mon Sep 17 00:00:00 2001 From: april-yyt Date: Wed, 21 Feb 2024 12:04:55 +0000 Subject: [PATCH 08/55] upload method for peft class and falcon fixes --- python/flexflow/serve/models/falcon.py | 96 ++++++++++++++++---------- python/flexflow/serve/serve.py | 50 +++++++++----- 2 files changed, 92 insertions(+), 54 deletions(-) diff --git a/python/flexflow/serve/models/falcon.py b/python/flexflow/serve/models/falcon.py index 5f4e757182..14f3eef82b 100644 --- a/python/flexflow/serve/models/falcon.py +++ b/python/flexflow/serve/models/falcon.py @@ -15,6 +15,7 @@ from flexflow.core import * from .base import FlexFlowModel import random, torch +import re class FalconConfig: @@ -300,54 +301,79 @@ def load_weights_into_hf_model(model, src_folder): - src_folder: The path to the folder containing the weight files. - config: The configuration object for the model. 
""" - # Dictionary to hold the combined QKV weights + + + print("Model hidden size:", model.config.hidden_size) + print("Model num_attention_heads:", model.config.num_attention_heads) + + hidden_size = model.config.hidden_size + num_attention_heads = model.config.num_attention_heads + hidden_size_per_head = hidden_size // num_attention_heads + intermediate_size = hidden_size * 4 + qkv_weights = {} for file_name in os.listdir(src_folder): weight_path = os.path.join(src_folder, file_name) print("converting weight file: ", weight_path) original_name = FlexFlowFalcon.convert_ff_weight_name(file_name.replace('.bin', '')) - print("weight name after conversion: ", original_name) + print("weight name after conversion from flexflow: ", original_name) if not os.path.exists(weight_path): raise FileNotFoundError(f"No weight file found for {file_name}") weight_data = np.fromfile(weight_path, dtype=np.float16).astype(np.float32) - # Check if this is a Q, K, or V weight and combine them if "attention_w" in original_name: - # Extract the type (Q, K, or V) and the layer number from the file name - qkv_type = re.search("(wq|wk|wv)", file_name).group(0) - layer_num = re.search("transformer.h.(\d+)", file_name).group(1) - - # Initialize the combined QKV weight if it doesn't exist - if layer_num not in qkv_weights: - qkv_weights[layer_num] = np.zeros((3 * model.config.hidden_size, model.config.hidden_size)) - - # Determine the position to place this weight in the combined QKV weight - type_index = {"wq": 0, "wk": 1, "wv": 2}[qkv_type] - qkv_weights[layer_num][type_index * model.config.hidden_size : (type_index + 1) * model.config.hidden_size] = weight_data + qkv_match = re.search("(wq|wk|wv)", file_name) + qkv_type = qkv_match.group(0) if qkv_match else None + layer_num_match = re.search(r"transformer.h.(\d+)", original_name) + layer_num = int(layer_num_match.group(1)) if layer_num_match else None - elif original_name not in model.state_dict(): - raise KeyError(f"Parameter {original_name} not found in model.") + if layer_num is not None: + if layer_num not in qkv_weights: + # For each layer, initialize space for Q, K, V weights for all heads + # Each head has hidden_size_per_head elements, and there are num_attention_heads heads + # For Q, K, V together, it's 3 * hidden_size_per_head * num_attention_heads + qkv_shape = (3 * hidden_size_per_head * num_attention_heads, hidden_size) + qkv_weights[layer_num] = np.zeros(qkv_shape) + + # Calculate index for Q, K, or V weight segment within the combined QKV weight + type_index = {"wq": 0, "wk": 1, "wv": 2}.get(qkv_type, 0) + offset = type_index * hidden_size_per_head * num_attention_heads + # Reshape the weight data to fit into the combined QKV weight matrix + reshaped_data = weight_data.reshape(-1, hidden_size) + qkv_weights[layer_num][offset:offset+reshaped_data.shape[0], :] = reshaped_data + + elif "mlp.dense_h_to_4h" in original_name or "mlp.dense_4h_to_h" in original_name: + # Handle MLP weights + if "mlp.dense_h_to_4h" in original_name: + total_elements = weight_data.size + output_size = total_elements // hidden_size + expected_shape = (output_size, hidden_size) + elif "mlp.dense_4h_to_h" in original_name: + input_size = weight_data.size // hidden_size + expected_shape = (hidden_size, input_size) + + if weight_data.size == np.prod(expected_shape): + reshaped_weight_data = weight_data.reshape(expected_shape) + if original_name in model.state_dict(): + param = model.state_dict()[original_name] + param.data.copy_(torch.from_numpy(reshaped_weight_data)) + else: + 
raise ValueError(f"Cannot reshape weight {file_name} of size {weight_data.size} into expected shape {expected_shape}.") else: - param = model.state_dict()[original_name] - if weight_data.size != param.numel(): - raise ValueError(f"Shape mismatch for {original_name}, model expects {param.numel()} elements, got {weight_data.size}") - - weight_tensor = torch.from_numpy(weight_data).reshape(param.shape) - with torch.no_grad(): - param.copy_(weight_tensor) + # Handle other weights + if original_name in model.state_dict(): + param = model.state_dict()[original_name] + print("trying to reshape: ", original_name) + reshaped_data = weight_data.reshape(param.shape) + param.data.copy_(torch.from_numpy(reshaped_data)) - # assign the combined QKV weights to the model - for layer_num, combined_weight_data in qkv_weights.items(): - original_name = f"transformer.h.{layer_num}.self_attention.query_key_value.weight" - - if original_name not in model.state_dict(): - raise KeyError(f"Parameter {original_name} not found in model.") - - param = model.state_dict()[original_name] - combined_weight_tensor = torch.from_numpy(combined_weight_data).view(param.shape) - - with torch.no_grad(): - param.copy_(combined_weight_tensor) + # Assign the combined QKV weights to the model, if applicable + for layer_num, weight in qkv_weights.items(): + qkv_name = f"transformer.h.{layer_num}.self_attention.query_key_value.weight" + if qkv_name in model.state_dict(): + param = model.state_dict()[qkv_name] + # Ensure the combined weight is correctly reshaped to fit the model's expectations + param.data.copy_(torch.from_numpy(weight.reshape(param.shape))) \ No newline at end of file diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py index 8971277415..924d4d0c5d 100644 --- a/python/flexflow/serve/serve.py +++ b/python/flexflow/serve/serve.py @@ -315,7 +315,7 @@ def __load_hf_weights(self): def upload_hf_model(self, new_model_id: str, model_path:str, private: bool = False): """ - Uploads the model weights to the Hugging Face Hub, with reverse conversion of weights. + Uploads the model to the Hugging Face Hub, with reverse conversion of weights. :param new_model_id: The new model ID for the Hugging Face Hub. :param model_path: The path where the FlexFlow weights are stored. @@ -709,31 +709,43 @@ def download_hf_weights_if_needed(self): else: print(f"Loading '{self.peft_model_id}' model weights from the cache...") - def process_and_upload_hf_model(self, model_id: str, private: bool = False): - """ - Processes the PEFT model and uploads it to the Hugging Face Hub. - - Args: - - model_id (str): The desired model ID on the Hugging Face Hub (e.g., "username/model_name"). - - private (bool): If True, the model will be uploaded as a private model. - """ - self.download_hf_weights_if_needed() - model_directory = self.weights_path - self.upload_model_to_hf(model_directory, model_id, private) - def upload_hf_model(self, new_model_id: str, model_path:str, private: bool = False): """ - Uploads the processed PEFT model to the Hugging Face Hub. + Uploads the PEFT model to the Hugging Face Hub, with reverse conversion of weights. - :param new_model_id: The new repository ID on Hugging Face Hub, including the organization/user and model name (e.g., "your_username/new-peft-model-name"). - :param private: Whether to upload the model as a private model on Hugging Face Hub. + :param new_model_id: The new model ID for the Hugging Face Hub. + :param model_path: The path where the FlexFlow weights are stored. 
+ :param private: Whether to upload the model as a private model. """ - print(f"Uploading processed PEFT model to Hugging Face Hub: {new_model_id}") + print(f"Preparing model for upload to Hugging Face Hub: {new_model_id}") + + # Initialize a new Hugging Face model instance + hf_model = AutoModelForCausalLM.from_config(self.hf_config) + weights_path = self.weights_path + + # Load FlexFlow weights into the Hugging Face model instance + try: + self.model_class.load_weights_into_hf_model(hf_model, weights_path) + except Exception as e: + print(f"Error loading weights into model: {e}") + return + + # Save the model with converted weights to a temporary directory + temp_dir = tempfile.mkdtemp() + hf_model.save_pretrained(temp_dir) + + # Ensure Hugging Face CLI is logged in if not HfFolder.get_token(): print("Hugging Face token not found. Please login using `huggingface-cli login`.") return + + # Upload the model api = HfApi() + print(f"Uploading processed model to Hugging Face Hub: {new_model_id}") api.create_repo(repo_id=new_model_id, private=private, exist_ok=True) - api.upload_folder(folder_path=self.cache_path, repo_id=new_model_id) - # api.upload_folder(folder_path=model_path, repo_id=new_model_id) + api.upload_folder(folder_path=temp_dir, repo_id=new_model_id) + + # Cleanup temporary directory + shutil.rmtree(temp_dir) + print("Upload completed successfully.") From 48ef4553d50df55d0c06d79eea802a61bfb1aed8 Mon Sep 17 00:00:00 2001 From: april-yyt Date: Fri, 23 Feb 2024 00:22:50 +0000 Subject: [PATCH 09/55] reconvert functions for llama models --- python/flexflow/serve/models/llama.py | 67 +++++++++++++++++++++++++++ python/flexflow/serve/serve.py | 5 ++ 2 files changed, 72 insertions(+) diff --git a/python/flexflow/serve/models/llama.py b/python/flexflow/serve/models/llama.py index ba5f1df7a2..0c25e36df5 100644 --- a/python/flexflow/serve/models/llama.py +++ b/python/flexflow/serve/models/llama.py @@ -15,6 +15,10 @@ from flexflow.core import * from .base import FlexFlowModel import random +import re +import os +import numpy as np +import torch class LLAMAConfig: @@ -266,3 +270,66 @@ def convert_hf_model(model, dst_folder): for name, params in model.named_parameters(): name = FlexFlowLLAMA.convert_hf_weight_name(name) params.detach().cpu().numpy().tofile(f"{dst_folder}/{name}") + + + def convert_ff_weight_name(name): + converted_name = ( + name + .replace("w1", "gate_proj") + .replace("w2", "down_proj") + .replace("w3", "up_proj") + .replace("wq", "q_proj") + .replace("wk", "k_proj") + .replace("wv", "v_proj") + .replace("wo", "o_proj") + .replace("feed_forward_", "mlp.") + .replace("self_attn", "attention") + .replace("attention_norm", "input_layernorm") + .replace("tok_embeddings", "embed_tokens") + .replace("output", "lm_head") + + ) + + converted_name = re.sub(r"layers_(\d+)_", r"layers.\1.", converted_name) + converted_name = re.sub(r"_(bias|weight)$", r".\1", converted_name) + converted_name = re.sub(r"attention_(?!norm)", "self_attn.", converted_name) + + converted_name = converted_name.replace("ffn_norm", "post_attention_layernorm") + + if "lm_head" not in converted_name: + converted_name = "model." + converted_name + + return converted_name + + + def load_weights_into_hf_model(model, src_folder): + """ + Load weights from a specified folder and apply them to a Hugging Face model. + + Parameters: + - model: The instance of the Hugging Face model to load weights into. + - src_folder: The path to the folder containing the weight files. 
+ """ + for file_name in os.listdir(src_folder): + weight_path = os.path.join(src_folder, file_name) + if weight_path.endswith("rev_sha.txt"): + print("skipping rev_sha.txt") + continue + else: + original_name = FlexFlowLLAMA.convert_ff_weight_name(file_name.replace('.bin', '')) + print(f"Converting weight name: {file_name} to {original_name}") + + if not os.path.exists(weight_path): + raise FileNotFoundError(f"No weight file found for {file_name}") + + weight_data = np.fromfile(weight_path, dtype=np.float16).astype(np.float32) + if original_name not in model.state_dict(): + raise KeyError(f"Parameter {original_name} not found in model.") + param = model.state_dict()[original_name] + + if weight_data.size != param.numel(): + raise ValueError(f"Shape mismatch for {original_name}, model expects {param.numel()} elements, got {weight_data.size}") + + weight_tensor = torch.from_numpy(weight_data).reshape(param.shape) + with torch.no_grad(): + model.state_dict()[original_name].copy_(weight_tensor) diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py index 924d4d0c5d..c8b4761891 100644 --- a/python/flexflow/serve/serve.py +++ b/python/flexflow/serve/serve.py @@ -344,6 +344,11 @@ def upload_hf_model(self, new_model_id: str, model_path:str, private: bool = Fal for file_name in tokenizer_files: shutil.copy(os.path.join(self.tokenizer_path, file_name), temp_dir) + # Delete rev_sha.txt from the temporary directory if it exists + rev_sha_path = os.path.join(temp_dir, 'rev_sha.txt') + if os.path.exists(rev_sha_path): + os.remove(rev_sha_path) + # Ensure Hugging Face CLI is logged in if not HfFolder.get_token(): print("Hugging Face token not found. Please login using `huggingface-cli login`.") From 72e1556a7d38e8aa5b55fefc87fac1ad2bc260a0 Mon Sep 17 00:00:00 2001 From: april-yyt Date: Sat, 24 Feb 2024 10:48:42 +0000 Subject: [PATCH 10/55] finish weight convert for falcon models --- python/flexflow/serve/models/falcon.py | 109 +++++++++++++------------ 1 file changed, 58 insertions(+), 51 deletions(-) diff --git a/python/flexflow/serve/models/falcon.py b/python/flexflow/serve/models/falcon.py index 14f3eef82b..4b61d4989b 100644 --- a/python/flexflow/serve/models/falcon.py +++ b/python/flexflow/serve/models/falcon.py @@ -285,7 +285,8 @@ def convert_ff_weight_name(name): converted_name = converted_name.replace("mlp_dense_h_to_4h", "mlp.dense_h_to_4h") converted_name = converted_name.replace("mlp_dense_4h_to_h", "mlp.dense_4h_to_h") converted_name = converted_name.replace("attention_wo", "self_attention.dense") - + if name.startswith("ln") or name.startswith("word_embeddings"): + converted_name = "transformer." + converted_name converted_name = re.sub(r"layers_(\d+)_", r"transformer.h.\1.", converted_name) converted_name = re.sub(r"_(bias|weight)$", r".\1", converted_name) @@ -302,78 +303,84 @@ def load_weights_into_hf_model(model, src_folder): - config: The configuration object for the model. 
""" + hidden_size = model.config.hidden_size + n_head = ( + model.config.n_head + if "n_head" in model.config.__dict__ + else model.config.num_attention_heads + ) - print("Model hidden size:", model.config.hidden_size) - print("Model num_attention_heads:", model.config.num_attention_heads) + print("Model hidden size:", hidden_size) + print("Model num_attention_heads:", n_head) - hidden_size = model.config.hidden_size - num_attention_heads = model.config.num_attention_heads - hidden_size_per_head = hidden_size // num_attention_heads + num_attention_heads = n_head + hidden_size_per_head = hidden_size // n_head intermediate_size = hidden_size * 4 qkv_weights = {} for file_name in os.listdir(src_folder): weight_path = os.path.join(src_folder, file_name) - print("converting weight file: ", weight_path) + print("\nProcessing weight file:", weight_path) original_name = FlexFlowFalcon.convert_ff_weight_name(file_name.replace('.bin', '')) - print("weight name after conversion from flexflow: ", original_name) + print("Converted weight name:", original_name) if not os.path.exists(weight_path): raise FileNotFoundError(f"No weight file found for {file_name}") weight_data = np.fromfile(weight_path, dtype=np.float16).astype(np.float32) + print(f"Data type after conversion: {weight_data.dtype}, Size: {weight_data.size}") - if "attention_w" in original_name: + # for q,k,v weights, store in dict + if ("attention_wq" in original_name) or ("attention_wk" in original_name) or ("attention_wv" in original_name): qkv_match = re.search("(wq|wk|wv)", file_name) qkv_type = qkv_match.group(0) if qkv_match else None layer_num_match = re.search(r"transformer.h.(\d+)", original_name) layer_num = int(layer_num_match.group(1)) if layer_num_match else None + print(f"QKV type: {qkv_type}, Layer number: {layer_num}") if layer_num is not None: if layer_num not in qkv_weights: - # For each layer, initialize space for Q, K, V weights for all heads - # Each head has hidden_size_per_head elements, and there are num_attention_heads heads - # For Q, K, V together, it's 3 * hidden_size_per_head * num_attention_heads - qkv_shape = (3 * hidden_size_per_head * num_attention_heads, hidden_size) + # qkv_shape = (hidden_size_per_head * num_attention_heads, hidden_size) + per_type_space = hidden_size_per_head * n_head + + qkv_name = f"transformer.h.{layer_num}.self_attention.query_key_value.weight" + if qkv_name in model.state_dict(): + qkv_param_size = model.state_dict()[qkv_name].shape[0] + qkv_shape = (qkv_param_size, hidden_size) qkv_weights[layer_num] = np.zeros(qkv_shape) - - # Calculate index for Q, K, or V weight segment within the combined QKV weight + print(f"Initialized QKV shape for layer {layer_num}: {qkv_shape}") + type_index = {"wq": 0, "wk": 1, "wv": 2}.get(qkv_type, 0) - offset = type_index * hidden_size_per_head * num_attention_heads - # Reshape the weight data to fit into the combined QKV weight matrix - reshaped_data = weight_data.reshape(-1, hidden_size) - qkv_weights[layer_num][offset:offset+reshaped_data.shape[0], :] = reshaped_data + offset = type_index * per_type_space + print("offset for this weight is: ", offset) + ## dim 0 sizes: + dim_wq = hidden_size + dim_wk = hidden_size // n_head + dim_wv = hidden_size // n_head + print(dim_wq, dim_wk, dim_wv) + + try: + expected_shape = (weight_data.size // hidden_size, hidden_size) + reshaped_data = weight_data.reshape(expected_shape) + print(f"Reshaped QKV weights for {qkv_type} in layer {layer_num} with shape {expected_shape}.") + except ValueError as e: + print(f"Error 
reshaping {qkv_type} weights for layer {layer_num}: {e}") + print(f"Attempting to reshape data of size {weight_data.size} into shape (-1, {hidden_size})") + + + try: + if qkv_type == "wq": + qkv_weights[layer_num][0:dim_wq, :] = reshaped_data + elif qkv_type == "wk": + qkv_weights[layer_num][dim_wq:dim_wk+dim_wq, :] = reshaped_data + else: + qkv_weights[layer_num][dim_wq+dim_wk:, :] = reshaped_data + except ValueError as e: + print(f"Error assigning {qkv_type} weights for layer {layer_num}: {e}") + continue - elif "mlp.dense_h_to_4h" in original_name or "mlp.dense_4h_to_h" in original_name: - # Handle MLP weights - if "mlp.dense_h_to_4h" in original_name: - total_elements = weight_data.size - output_size = total_elements // hidden_size - expected_shape = (output_size, hidden_size) - elif "mlp.dense_4h_to_h" in original_name: - input_size = weight_data.size // hidden_size - expected_shape = (hidden_size, input_size) - - if weight_data.size == np.prod(expected_shape): - reshaped_weight_data = weight_data.reshape(expected_shape) - if original_name in model.state_dict(): - param = model.state_dict()[original_name] - param.data.copy_(torch.from_numpy(reshaped_weight_data)) - else: - raise ValueError(f"Cannot reshape weight {file_name} of size {weight_data.size} into expected shape {expected_shape}.") - else: - # Handle other weights - if original_name in model.state_dict(): - param = model.state_dict()[original_name] - print("trying to reshape: ", original_name) - reshaped_data = weight_data.reshape(param.shape) - param.data.copy_(torch.from_numpy(reshaped_data)) - - # Assign the combined QKV weights to the model, if applicable - for layer_num, weight in qkv_weights.items(): - qkv_name = f"transformer.h.{layer_num}.self_attention.query_key_value.weight" - if qkv_name in model.state_dict(): - param = model.state_dict()[qkv_name] - # Ensure the combined weight is correctly reshaped to fit the model's expectations - param.data.copy_(torch.from_numpy(weight.reshape(param.shape))) \ No newline at end of file + # for weights that are not q,k,v, get the param names + param = model.state_dict().get(original_name, None) + if param is None: + print(f"Warning: {original_name} not found i \ No newline at end of file From fffe25a3c67d5c8006ccd668bc0a8a51017da9a8 Mon Sep 17 00:00:00 2001 From: april-yyt Date: Mon, 26 Feb 2024 13:16:10 +0000 Subject: [PATCH 11/55] simplify upload script --- inference/utils/download_upload_hf.py | 29 +++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/inference/utils/download_upload_hf.py b/inference/utils/download_upload_hf.py index c400355c0a..f4e6db7775 100644 --- a/inference/utils/download_upload_hf.py +++ b/inference/utils/download_upload_hf.py @@ -8,7 +8,7 @@ def parse_args(): parser = argparse.ArgumentParser(description="Download a model with FlexFlow, process it, and upload it to the Hugging Face Hub.") parser.add_argument("model_name", type=str, help="Original Hugging Face model ID to download and process (e.g., 'facebook/opt-125m').") parser.add_argument("--new-model-id", type=str, required=True, help="New Hugging Face Hub model ID for upload (e.g., 'your_username/new-model-name').") - parser.add_argument("--cache-folder", type=str, default="./model_cache", help="Folder to use to store and process the model(s) assets in FlexFlow format.") + parser.add_argument("--cache-folder", type=str, default="~/.cache/flexflow", help="Folder to use to store and process the model(s) assets in FlexFlow format.") parser.add_argument("--private", 
action="store_true", help="Whether to upload the processed model as a private model on Hugging Face Hub.") parser.add_argument("--refresh-cache", action="store_true", help="Use this flag to force the refresh of the model(s) weights/tokenizer cache.") parser.add_argument("--full-precision", action="store_true", help="Download the full precision version of the weights.") @@ -37,13 +37,30 @@ def upload_processed_model_to_hub(llm, new_model_id, cache_folder, private): def main(): - args = parse_args() - llm = download_and_process_model(args.model_name, args.cache_folder, args.refresh_cache, args.full_precision) - upload_processed_model_to_hub(llm, args.new_model_id, args.cache_folder, args.private) + model_name = "mosaicml/mpt-7b" + # new_model_id = "your_username/new-model-name" + new_model_id = "aprilyyt/upload-mpt" + cache_folder = "~/.cache/flexflow" + private = True + refresh_cache = False + full_precision = True + data_type = ff.DataType.DT_FLOAT if full_precision else ff.DataType.DT_HALF + print(f"Downloading and processing model: {model_name}") + llm = ff.LLM( + model_name=model_name, + data_type=data_type, + cache_path=cache_folder, + refresh_cache=refresh_cache, + ) + llm.download_hf_weights_if_needed() + llm.download_hf_tokenizer_if_needed() + llm.download_hf_config() + + print(f"Uploading processed model to Hugging Face Hub: {new_model_id}") + llm.upload_hf_model(new_model_id, cache_folder, private=private) + print("Upload completed successfully.") if __name__ == "__main__": main() - -# python download_upload_hf.py facebook/opt-125m --new-model-id username/modelname --cache-folder ./model_cache --private \ No newline at end of file From 69c53c4ce1baf97627f1589b0d6f78727afea4ff Mon Sep 17 00:00:00 2001 From: april-yyt Date: Mon, 26 Feb 2024 13:17:10 +0000 Subject: [PATCH 12/55] fix falcon typo --- python/flexflow/serve/models/falcon.py | 31 +++++++++++++++++--------- 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/python/flexflow/serve/models/falcon.py b/python/flexflow/serve/models/falcon.py index 4b61d4989b..87b3b64bcd 100644 --- a/python/flexflow/serve/models/falcon.py +++ b/python/flexflow/serve/models/falcon.py @@ -322,8 +322,12 @@ def load_weights_into_hf_model(model, src_folder): for file_name in os.listdir(src_folder): weight_path = os.path.join(src_folder, file_name) print("\nProcessing weight file:", weight_path) - original_name = FlexFlowFalcon.convert_ff_weight_name(file_name.replace('.bin', '')) - print("Converted weight name:", original_name) + if weight_path.endswith("rev_sha.txt"): + print("skipping rev_sha.txt") + continue + else: + original_name = FlexFlowFalcon.convert_ff_weight_name(file_name.replace('.bin', '')) + print("Converted weight name:", original_name) if not os.path.exists(weight_path): raise FileNotFoundError(f"No weight file found for {file_name}") @@ -341,8 +345,6 @@ def load_weights_into_hf_model(model, src_folder): if layer_num is not None: if layer_num not in qkv_weights: - # qkv_shape = (hidden_size_per_head * num_attention_heads, hidden_size) - per_type_space = hidden_size_per_head * n_head qkv_name = f"transformer.h.{layer_num}.self_attention.query_key_value.weight" if qkv_name in model.state_dict(): @@ -352,13 +354,10 @@ def load_weights_into_hf_model(model, src_folder): print(f"Initialized QKV shape for layer {layer_num}: {qkv_shape}") type_index = {"wq": 0, "wk": 1, "wv": 2}.get(qkv_type, 0) - offset = type_index * per_type_space - print("offset for this weight is: ", offset) ## dim 0 sizes: dim_wq = hidden_size dim_wk = 
hidden_size // n_head dim_wv = hidden_size // n_head - print(dim_wq, dim_wk, dim_wv) try: expected_shape = (weight_data.size // hidden_size, hidden_size) @@ -368,7 +367,6 @@ def load_weights_into_hf_model(model, src_folder): print(f"Error reshaping {qkv_type} weights for layer {layer_num}: {e}") print(f"Attempting to reshape data of size {weight_data.size} into shape (-1, {hidden_size})") - try: if qkv_type == "wq": qkv_weights[layer_num][0:dim_wq, :] = reshaped_data @@ -380,7 +378,20 @@ def load_weights_into_hf_model(model, src_folder): print(f"Error assigning {qkv_type} weights for layer {layer_num}: {e}") continue - # for weights that are not q,k,v, get the param names + # for weights that are not q,k,v, get the param names and assign weights accordingly param = model.state_dict().get(original_name, None) if param is None: - print(f"Warning: {original_name} not found i \ No newline at end of file + print(f"Warning: {original_name} not found in directory") + reshaped_data = weight_data.reshape(param.shape) + param.data.copy_(torch.from_numpy(reshaped_data)) + + # Assign the combined QKV weights to the model + for layer_num, weight in qkv_weights.items(): + qkv_name = f"transformer.h.{layer_num}.self_attention.query_key_value.weight" + if qkv_name in model.state_dict(): + param = model.state_dict()[qkv_name] + # Ensure the combined weight is correctly reshaped to fit the model's expectations + param.data.copy_(torch.from_numpy(weight.reshape(param.shape))) + + + \ No newline at end of file From e51004f23017bd9f81af622d0281728e16617f0b Mon Sep 17 00:00:00 2001 From: april-yyt Date: Mon, 26 Feb 2024 13:17:47 +0000 Subject: [PATCH 13/55] mpt models, minor errs to be fixed --- python/flexflow/serve/models/mpt.py | 94 ++++++++++++++++++++++++++++- 1 file changed, 93 insertions(+), 1 deletion(-) diff --git a/python/flexflow/serve/models/mpt.py b/python/flexflow/serve/models/mpt.py index 91d87669ca..3678341ca3 100644 --- a/python/flexflow/serve/models/mpt.py +++ b/python/flexflow/serve/models/mpt.py @@ -14,7 +14,8 @@ from flexflow.core import * from .base import FlexFlowModel -import random, torch, shutil +import random, torch, shutil, os, re +import numpy as np class MPTConfig: @@ -280,3 +281,94 @@ def convert_hf_model(model, dst_folder): os.path.join(dst_folder, "transformer_wte_weight"), os.path.join(dst_folder, "lm_head_weight"), ) + + + def convert_ff_weight_name(name): + # Reverses the conversion logic for MPT model weights + converted_name = name + if "norm_f" in converted_name or "wte" in converted_name: + converted_name = converted_name.replece("_", ".") + + converted_name = converted_name.replace("attention_wo", "attn.out_proj") + converted_name = converted_name.replace("ffn_", "ffn.") + converted_name = re.sub(r"layers_(\d+)_", r"transformer.blocks.\1.", converted_name) + converted_name = re.sub(r"_(bias|weight)$", r".\1", converted_name) + + return converted_name + + def load_weights_into_hf_model(model, src_folder): + """ + Load weights from a specified folder and apply them to a Hugging Face MPT model. + + Parameters: + - model: The instance of the Hugging Face model to load the weights into. + - src_folder: The path to the folder containing the weight files. 
+ """ + + d_model = model.config.d_model + print("dimension of the model is: ", d_model) + + qkv_weights = {} + + for file_name in os.listdir(src_folder): + weight_path = os.path.join(src_folder, file_name) + if weight_path.endswith("rev_sha.txt"): + print("skipping rev_sha.txt") + continue + elif "lm_head" in weight_path: + # todo: double check how to handle lm_head in uploading mpt models + print("skipping lm_head.weight") + continue + else: + original_name = FlexFlowMPT.convert_ff_weight_name(file_name.replace('.bin', '')) + print("\nconverting weights name of: ", file_name, "to ", original_name) + + if not os.path.exists(weight_path): + raise FileNotFoundError(f"No weight file found for {file_name}") + + weight_data = np.fromfile(weight_path, dtype=np.float32) + + # Special handling for combined QKV weights + if ("wq" in file_name) or ("wk" in file_name) or ("wv" in file_name): + layer_num_match = re.search(r"layers\.(\d+)", original_name) + layer_num = int(layer_num_match.group(1)) if layer_num_match else None + qkv_type = original_name.split("_")[-2] + + if layer_num is not None: + qkv_key = f"layers.{layer_num}.attn_Wqkv" + # initialize qkv layer in dict + if qkv_key not in qkv_weights: + qkv_weights[qkv_key] = {'wq': None, 'wk': None, 'wv': None} + print(f"Initialized QKV layer {layer_num}") + # assign weights into dict + qkv_weights[qkv_key][qkv_type] = weight_data + + continue + + # for weights that are not q,k,v, get the param names and assign weights accordingly + param = model.state_dict().get(original_name, None) + if weight_data.size != param.numel(): + raise ValueError(f"Shape mismatch for {original_name}, model expects {param.numel()} elements, got {weight_data.size}") + + weight_tensor = torch.from_numpy(weight_data).reshape(param.shape) + with torch.no_grad(): + model.state_dict()[original_name].copy_(weight_tensor) + + + for qkv_key, weights_dict in qkv_weights.items(): + wq, wk, wv = weights_dict['wq'], weights_dict['wk'], weights_dict['wv'] + if None in (wq, wk, wv): + raise ValueError(f"Missing weights for {qkv_key}") + + combined_qkv = np.concatenate([wq, wk, wv], axis=0) + qkv_name = qkv_key.replace("layers.", "transformer.blocks.")+".weight" + + param_shape = model.state_dict()[qkv_name].shape + combined_qkv_reshaped = combined_qkv.reshape(param_shape) + + model.state_dict()[qkv_name].copy_(torch.from_numpy(combined_qkv_reshaped)) + + print(f"Assigned combined QKV weights to {qkv_key}.") + + + \ No newline at end of file From f32000a148f8ff722a752858b854ae0a4fc35989 Mon Sep 17 00:00:00 2001 From: april-yyt Date: Mon, 26 Feb 2024 13:22:22 +0000 Subject: [PATCH 14/55] starcoder models, minor errs to be fixed --- python/flexflow/serve/models/starcoder.py | 108 ++++++++++++++++++++++ 1 file changed, 108 insertions(+) diff --git a/python/flexflow/serve/models/starcoder.py b/python/flexflow/serve/models/starcoder.py index 0f577299ed..c651e06e1c 100644 --- a/python/flexflow/serve/models/starcoder.py +++ b/python/flexflow/serve/models/starcoder.py @@ -260,3 +260,111 @@ def convert_hf_model(model, dst_folder): model.lm_head.weight.detach().cpu().numpy().tofile( os.path.join(dst_folder, "lm_head_weight") ) + + + def convert_ff_weight_name(name): + """ + Convert weight names from FlexFlow format back to Hugging Face format. 
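The MPT loader above only collects wq, wk and wv per layer while scanning files; the fusion into attn.Wqkv happens at the end with a single concatenation. A toy-sized sketch of that final step, assuming a hypothetical d_model of 4:

import numpy as np
import torch

d_model = 4
wq = np.full((d_model, d_model), 0.1, dtype=np.float32)
wk = np.full((d_model, d_model), 0.2, dtype=np.float32)
wv = np.full((d_model, d_model), 0.3, dtype=np.float32)

# MPT keeps one fused weight of shape (3 * d_model, d_model); rows are Q, then K, then V.
combined = np.concatenate([wq, wk, wv], axis=0)
fused = torch.from_numpy(combined)  # what gets copied into transformer.blocks.<n>.attn.Wqkv.weight
assert fused.shape == (3 * d_model, d_model)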
+ """ + # Example conversion logic, adjust as needed + if "attention_wq" in name or "attention_wk" in name or "attention_wv" in name: + converted_name = converted_name.replace("attention_wq", "attn.c_attn").replace("attention_wk", "attn.c_attn").replace("attention_wv", "attn.c_attn") + elif "attention_wo" in name: + converted_name = converted_name.replace("attention_wo", "attn.c_proj") + + converted_name = re.sub(r"layers_(\d+)_", r"transformer.h.\1.", converted_name) + + return converted_name + + + def load_weights_into_hf_model(model, src_folder): + """ + Load weights from a specified folder and apply them to a Hugging Face model. + + Parameters: + - model: The instance of the Hugging Face model to load the weights into. + - src_folder: The path to the folder containing the weight files. + """ + + for file_name in os.listdir(src_folder): + weight_path = os.path.join(src_folder, file_name) + if weight_path.endswith("rev_sha.txt"): + print("skipping rev_sha.txt") + continue + else: + original_name = FlexFlowLLAMA.convert_ff_weight_name(file_name.replace('.bin', '')) + print(f"Converting weight name: {file_name} to {original_name}") + + if not os.path.exists(weight_path): + raise FileNotFoundError(f"No weight file found for {file_name}") + + weight_data = np.fromfile(weight_path, dtype=np.float32) + + # Find the parameter in the model + param = model.state_dict().get(original_name) + if param is None: + print(f"Warning: {original_name} not found in model parameters.") + continue + + # Special handling for q, k, v weights + if ("attention_wq" in original_name) or ("attention_wk" in original_name) or ("attention_wv" in original_name): + qkv_match = re.search("(wq|wk|wv)", file_name) + qkv_type = qkv_match.group(0) if qkv_match else None + layer_num_match = re.search(r"transformer.h.(\d+)", original_name) + layer_num = int(layer_num_match.group(1)) if layer_num_match else None + print(f"QKV type: {qkv_type}, Layer number: {layer_num}") + + if layer_num is not None: + if layer_num not in qkv_weights: + + qkv_name = f"transformer.h.{layer_num}.self_attention.query_key_value.weight" + if qkv_name in model.state_dict(): + qkv_param_size = model.state_dict()[qkv_name].shape[0] + qkv_shape = (qkv_param_size, hidden_size) + qkv_weights[layer_num] = np.zeros(qkv_shape) + print(f"Initialized QKV shape for layer {layer_num}: {qkv_shape}") + + type_index = {"wq": 0, "wk": 1, "wv": 2}.get(qkv_type, 0) + ## dim 0 sizes: + dim_wq = hidden_size + dim_wk = hidden_size // n_head + dim_wv = hidden_size // n_head + + try: + expected_shape = (weight_data.size // hidden_size, hidden_size) + reshaped_data = weight_data.reshape(expected_shape) + print(f"Reshaped QKV weights for {qkv_type} in layer {layer_num} with shape {expected_shape}.") + except ValueError as e: + print(f"Error reshaping {qkv_type} weights for layer {layer_num}: {e}") + print(f"Attempting to reshape data of size {weight_data.size} into shape (-1, {hidden_size})") + + try: + if qkv_type == "wq": + qkv_weights[layer_num][0:dim_wq, :] = reshaped_data + elif qkv_type == "wk": + qkv_weights[layer_num][dim_wq:dim_wk+dim_wq, :] = reshaped_data + else: + qkv_weights[layer_num][dim_wq+dim_wk:, :] = reshaped_data + except ValueError as e: + print(f"Error assigning {qkv_type} weights for layer {layer_num}: {e}") + continue + + + # Handle other parameters + param = model.state_dict().get(original_name) + if param is None: + print(f"Warning: {original_name} not found in model parameters.") + continue + reshaped_weight_data = weight_data.reshape(param.shape) + 
param.data.copy_(torch.from_numpy(reshaped_weight_data)) + + + # Assign the combined QKV weights to the model + for layer_num, weight in qkv_weights.items(): + qkv_name = f"transformer.h.{layer_num}.self_attention.query_key_value.weight" + if qkv_name in model.state_dict(): + param = model.state_dict()[qkv_name] + # Ensure the combined weight is correctly reshaped to fit the model's expectations + param.data.copy_(torch.from_numpy(weight.reshape(param.shape))) + + From b6dd20b8af480363b8096e3ee1f6c9c3cbc438a1 Mon Sep 17 00:00:00 2001 From: april-yyt Date: Tue, 27 Feb 2024 01:32:21 +0000 Subject: [PATCH 15/55] fixed issues with mpt and starcoder models --- python/flexflow/serve/models/mpt.py | 4 +- python/flexflow/serve/models/starcoder.py | 126 +++++++++++----------- 2 files changed, 61 insertions(+), 69 deletions(-) diff --git a/python/flexflow/serve/models/mpt.py b/python/flexflow/serve/models/mpt.py index 3678341ca3..409abff755 100644 --- a/python/flexflow/serve/models/mpt.py +++ b/python/flexflow/serve/models/mpt.py @@ -287,7 +287,7 @@ def convert_ff_weight_name(name): # Reverses the conversion logic for MPT model weights converted_name = name if "norm_f" in converted_name or "wte" in converted_name: - converted_name = converted_name.replece("_", ".") + converted_name = converted_name.replace("_", ".").replace("norm.f", "norm_f") converted_name = converted_name.replace("attention_wo", "attn.out_proj") converted_name = converted_name.replace("ffn_", "ffn.") @@ -369,6 +369,4 @@ def load_weights_into_hf_model(model, src_folder): model.state_dict()[qkv_name].copy_(torch.from_numpy(combined_qkv_reshaped)) print(f"Assigned combined QKV weights to {qkv_key}.") - - \ No newline at end of file diff --git a/python/flexflow/serve/models/starcoder.py b/python/flexflow/serve/models/starcoder.py index c651e06e1c..1831a4d1e1 100644 --- a/python/flexflow/serve/models/starcoder.py +++ b/python/flexflow/serve/models/starcoder.py @@ -14,7 +14,8 @@ from flexflow.core import * from .base import FlexFlowModel -import random, torch +import random, torch, re +import numpy as np class STARCODERConfig: @@ -266,13 +267,16 @@ def convert_ff_weight_name(name): """ Convert weight names from FlexFlow format back to Hugging Face format. """ + converted_name = name # Example conversion logic, adjust as needed - if "attention_wq" in name or "attention_wk" in name or "attention_wv" in name: - converted_name = converted_name.replace("attention_wq", "attn.c_attn").replace("attention_wk", "attn.c_attn").replace("attention_wv", "attn.c_attn") - elif "attention_wo" in name: + if "attention_wo" in name: converted_name = converted_name.replace("attention_wo", "attn.c_proj") + + converted_name = converted_name.replace("mlp_", "mlp.").replace("_ln_f", ".ln_f").replace("_wpe", ".wpe").replace("_wte", ".wte") converted_name = re.sub(r"layers_(\d+)_", r"transformer.h.\1.", converted_name) + converted_name = re.sub(r"_(bias|weight)$", r".\1", converted_name) + return converted_name @@ -285,86 +289,76 @@ def load_weights_into_hf_model(model, src_folder): - model: The instance of the Hugging Face model to load the weights into. - src_folder: The path to the folder containing the weight files. 
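All of these loaders share one raw-file convention: FlexFlow dumps each tensor with ndarray.tofile(), which stores no shape or dtype header, so np.fromfile() must be given the same dtype that was written and the shape is recovered from the matching Hugging Face parameter. A minimal round-trip sketch (the temporary path is arbitrary):

import os, tempfile
import numpy as np
import torch

w = torch.randn(8, 4)                                    # stand-in for an HF parameter
path = os.path.join(tempfile.mkdtemp(), "example_weight.bin")
w.detach().cpu().numpy().astype(np.float32).tofile(path)

flat = np.fromfile(path, dtype=np.float32)               # dtype must match what was written
restored = torch.from_numpy(flat).reshape(w.shape)       # shape comes from the HF side
assert torch.allclose(w, restored)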
""" + + hidden_size = model.config.hidden_size + n_head = ( + model.config.n_head + if "n_head" in model.config.__dict__ + else model.config.num_attention_heads + ) + + print("Model hidden size:", hidden_size) + print("Model num_attention_heads:", n_head) + + num_attention_heads = n_head + hidden_size_per_head = hidden_size // n_head + + qkv_weights = {} for file_name in os.listdir(src_folder): weight_path = os.path.join(src_folder, file_name) + print("\nProcessing weight file:", weight_path) if weight_path.endswith("rev_sha.txt"): print("skipping rev_sha.txt") continue else: - original_name = FlexFlowLLAMA.convert_ff_weight_name(file_name.replace('.bin', '')) - print(f"Converting weight name: {file_name} to {original_name}") + original_name = FlexFlowSTARCODER.convert_ff_weight_name(file_name.replace('.bin', '')) + print(f"Converted weight name: {file_name} to {original_name}") if not os.path.exists(weight_path): raise FileNotFoundError(f"No weight file found for {file_name}") weight_data = np.fromfile(weight_path, dtype=np.float32) - - # Find the parameter in the model - param = model.state_dict().get(original_name) - if param is None: - print(f"Warning: {original_name} not found in model parameters.") - continue - - # Special handling for q, k, v weights + print(f"Data type after conversion: {weight_data.dtype}, Size: {weight_data.size}") + + # Special handling for combined QKV weights if ("attention_wq" in original_name) or ("attention_wk" in original_name) or ("attention_wv" in original_name): - qkv_match = re.search("(wq|wk|wv)", file_name) - qkv_type = qkv_match.group(0) if qkv_match else None - layer_num_match = re.search(r"transformer.h.(\d+)", original_name) + weight_bias = ".weight" if ".weight" in original_name else ".bias" + layer_num_match = re.search(r"layers\_(\d+)", file_name) layer_num = int(layer_num_match.group(1)) if layer_num_match else None - print(f"QKV type: {qkv_type}, Layer number: {layer_num}") + qkv_type = file_name.split("_")[-2] + qkv_name = f"transformer.h.{layer_num}.attn.c_attn" + weight_bias if layer_num is not None: - if layer_num not in qkv_weights: - - qkv_name = f"transformer.h.{layer_num}.self_attention.query_key_value.weight" - if qkv_name in model.state_dict(): - qkv_param_size = model.state_dict()[qkv_name].shape[0] - qkv_shape = (qkv_param_size, hidden_size) - qkv_weights[layer_num] = np.zeros(qkv_shape) - print(f"Initialized QKV shape for layer {layer_num}: {qkv_shape}") - - type_index = {"wq": 0, "wk": 1, "wv": 2}.get(qkv_type, 0) - ## dim 0 sizes: - dim_wq = hidden_size - dim_wk = hidden_size // n_head - dim_wv = hidden_size // n_head - - try: - expected_shape = (weight_data.size // hidden_size, hidden_size) - reshaped_data = weight_data.reshape(expected_shape) - print(f"Reshaped QKV weights for {qkv_type} in layer {layer_num} with shape {expected_shape}.") - except ValueError as e: - print(f"Error reshaping {qkv_type} weights for layer {layer_num}: {e}") - print(f"Attempting to reshape data of size {weight_data.size} into shape (-1, {hidden_size})") - - try: - if qkv_type == "wq": - qkv_weights[layer_num][0:dim_wq, :] = reshaped_data - elif qkv_type == "wk": - qkv_weights[layer_num][dim_wq:dim_wk+dim_wq, :] = reshaped_data - else: - qkv_weights[layer_num][dim_wq+dim_wk:, :] = reshaped_data - except ValueError as e: - print(f"Error assigning {qkv_type} weights for layer {layer_num}: {e}") + # initialize qkv layer in dict + if qkv_name not in qkv_weights: + qkv_weights[qkv_name] = {'wq': None, 'wk': None, 'wv': None} + print(f"Initialized QKV 
layer {layer_num}") + # assign weights into dict + qkv_weights[qkv_name][qkv_type] = weight_data + print(f"attached qkv weight {qkv_name}") + continue - # Handle other parameters - param = model.state_dict().get(original_name) - if param is None: - print(f"Warning: {original_name} not found in model parameters.") - continue - reshaped_weight_data = weight_data.reshape(param.shape) - param.data.copy_(torch.from_numpy(reshaped_weight_data)) + # for weights that are not q,k,v, get the param names and assign weights accordingly + param = model.state_dict().get(original_name, None) + if weight_data.size != param.numel(): + raise ValueError(f"Shape mismatch for {original_name}, model expects {param.numel()} elements, got {weight_data.size}") - - # Assign the combined QKV weights to the model - for layer_num, weight in qkv_weights.items(): - qkv_name = f"transformer.h.{layer_num}.self_attention.query_key_value.weight" - if qkv_name in model.state_dict(): - param = model.state_dict()[qkv_name] - # Ensure the combined weight is correctly reshaped to fit the model's expectations - param.data.copy_(torch.from_numpy(weight.reshape(param.shape))) + weight_tensor = torch.from_numpy(weight_data).reshape(param.shape) + with torch.no_grad(): + model.state_dict()[original_name].copy_(weight_tensor) + print(f"Assigned weight {original_name} successfully!") + - + for qkv_name, weights_dict in qkv_weights.items(): + combined_qkv = np.concatenate([qkv_weights[qkv_name]['wq'], qkv_weights[qkv_name]['wk'], qkv_weights[qkv_name]['wv']], axis=0) + param_shape = model.state_dict()[qkv_name].shape + combined_qkv_reshaped = combined_qkv.reshape(param_shape) + print(f"reshaped qkv weights shape is: {combined_qkv_reshaped.shape}") + + model.state_dict()[qkv_name].copy_(torch.from_numpy(combined_qkv_reshaped)) + print(f"Assigned combined QKV weights to {qkv_name}.") + + \ No newline at end of file From 2b5c79b300c290153ce5e6f3cee67f357be49191 Mon Sep 17 00:00:00 2001 From: april-yyt Date: Tue, 27 Feb 2024 01:34:15 +0000 Subject: [PATCH 16/55] modify hf uploading script --- inference/utils/download_upload_hf.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/inference/utils/download_upload_hf.py b/inference/utils/download_upload_hf.py index f4e6db7775..c3d8df841c 100644 --- a/inference/utils/download_upload_hf.py +++ b/inference/utils/download_upload_hf.py @@ -37,9 +37,13 @@ def upload_processed_model_to_hub(llm, new_model_id, cache_folder, private): def main(): - model_name = "mosaicml/mpt-7b" + model_name = "bigcode/starcoderbase-1b" + # model_name = "mosaicml/mpt-7b" + # new_model_id = "your_username/new-model-name" - new_model_id = "aprilyyt/upload-mpt" + new_model_id = "aprilyyt/upload-starcoder" + # new_model_id = "aprilyyt/upload-mpt" + cache_folder = "~/.cache/flexflow" private = True refresh_cache = False From 403516a717f416aeb8485c727fcf1b5a7a1c0069 Mon Sep 17 00:00:00 2001 From: april-yyt Date: Tue, 27 Feb 2024 01:35:22 +0000 Subject: [PATCH 17/55] modify hf uploading script --- inference/utils/download_upload_peft.py | 35 ++++++++++++++++++++++--- 1 file changed, 31 insertions(+), 4 deletions(-) diff --git a/inference/utils/download_upload_peft.py b/inference/utils/download_upload_peft.py index 6e174eacf7..6870b0e04c 100644 --- a/inference/utils/download_upload_peft.py +++ b/inference/utils/download_upload_peft.py @@ -34,10 +34,37 @@ def upload_peft_model_to_hub(peft, new_model_id, cache_folder, private): print("Upload completed successfully.") +# def main(): +# args = parse_args() +# 
peft = download_and_process_peft_model(args.peft_model_id, args.cache_folder, args.refresh_cache, args.full_precision) +# upload_peft_model_to_hub(peft, args.new_model_id, args.cache_folder, args.private) + +# if __name__ == "__main__": +# main() + + def main(): - args = parse_args() - peft = download_and_process_peft_model(args.peft_model_id, args.cache_folder, args.refresh_cache, args.full_precision) - upload_peft_model_to_hub(peft, args.new_model_id, args.cache_folder, args.private) + model_name = "meta-llama/Llama-2-7b" + new_model_id = "your_username/new-model-name" + cache_folder = "~/.cache/flexflow" + private = True + refresh_cache = False + full_precision = True + + data_type = ff.DataType.DT_FLOAT if full_precision else ff.DataType.DT_HALF + print(f"Downloading and processing peft model: {peft_model_id}") + peft = ff.PEFT( + peft_model_id, + data_type=data_type, + cache_path=args.cache_folder, + refresh_cache=args.refresh_cache, + ) + peft.download_hf_weights_if_needed() + peft.download_hf_config() + + print(f"Uploading processed model to Hugging Face Hub: {peft_model_id}") + peft.upload_hf_model(peft_model_id, cache_folder, private=private) + print("Upload completed successfully.") if __name__ == "__main__": - main() + main() \ No newline at end of file From af1d5354804daa3711ec817ce3c5c442b84f5f39 Mon Sep 17 00:00:00 2001 From: april-yyt Date: Tue, 27 Feb 2024 01:37:55 +0000 Subject: [PATCH 18/55] modify hf uploading main --- inference/utils/download_upload_hf.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/inference/utils/download_upload_hf.py b/inference/utils/download_upload_hf.py index c3d8df841c..40551c66b8 100644 --- a/inference/utils/download_upload_hf.py +++ b/inference/utils/download_upload_hf.py @@ -37,13 +37,8 @@ def upload_processed_model_to_hub(llm, new_model_id, cache_folder, private): def main(): - model_name = "bigcode/starcoderbase-1b" - # model_name = "mosaicml/mpt-7b" - - # new_model_id = "your_username/new-model-name" - new_model_id = "aprilyyt/upload-starcoder" - # new_model_id = "aprilyyt/upload-mpt" - + model_name = "meta-llama/Llama-2-7b" + new_model_id = "your_username/new-model-name" cache_folder = "~/.cache/flexflow" private = True refresh_cache = False From bea5afb932a2acd5b2334de39789307ff51bab74 Mon Sep 17 00:00:00 2001 From: april-yyt Date: Fri, 1 Mar 2024 16:02:40 +0000 Subject: [PATCH 19/55] add assertion for base model --- python/flexflow/serve/models/base.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/python/flexflow/serve/models/base.py b/python/flexflow/serve/models/base.py index 17bb894250..fe39fd30bf 100644 --- a/python/flexflow/serve/models/base.py +++ b/python/flexflow/serve/models/base.py @@ -37,3 +37,9 @@ def convert_hf_weight_name(name): def convert_hf_model(model, dst_folder): assert False, "Not implemented yet" + + def convert_ff_weight_name(name): + assert False, "Not implemented yet" + + def load_weights_into_hf_model(model, src_folder): + assert False, "Not implemented yet" From 09013200056b02a72a5cb9ec80f2c3349f47e488 Mon Sep 17 00:00:00 2001 From: april-yyt Date: Tue, 13 Feb 2024 12:36:22 +0000 Subject: [PATCH 20/55] upload hf model --- inference/utils/download_upload_hf.py | 49 +++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) create mode 100644 inference/utils/download_upload_hf.py diff --git a/inference/utils/download_upload_hf.py b/inference/utils/download_upload_hf.py new file mode 100644 index 0000000000..db0b131b8e --- /dev/null +++ 
b/inference/utils/download_upload_hf.py @@ -0,0 +1,49 @@ +#!/usr/bin/env python +import argparse +from huggingface_hub import HfApi, HfFolder +import flexflow.serve as ff + +def parse_args(): + parser = argparse.ArgumentParser(description="Download a model with FlexFlow, process it, and upload it to the Hugging Face Hub.") + parser.add_argument("model_name", type=str, help="Original Hugging Face model ID to download and process (e.g., 'facebook/opt-125m').") + parser.add_argument("--new-model-id", type=str, required=True, help="New Hugging Face Hub model ID for upload (e.g., 'your_username/new-model-name').") + parser.add_argument("--cache-folder", type=str, default="./model_cache", help="Folder to use to store and process the model(s) assets in FlexFlow format.") + parser.add_argument("--private", action="store_true", help="Whether to upload the processed model as a private model on Hugging Face Hub.") + parser.add_argument("--refresh-cache", action="store_true", help="Use this flag to force the refresh of the model(s) weights/tokenizer cache.") + parser.add_argument("--full-precision", action="store_true", help="Download the full precision version of the weights.") + return parser.parse_args() + +def download_and_process_model(model_name, cache_folder, refresh_cache, full_precision): + data_type = ff.DataType.DT_FLOAT if full_precision else ff.DataType.DT_HALF + print(f"Downloading and processing model: {model_name}") + llm = ff.LLM( + model_name=model_name, + data_type=data_type, + cache_path=cache_folder, + refresh_cache=refresh_cache, + ) + llm.download_hf_weights_if_needed() + llm.download_hf_tokenizer_if_needed() + llm.download_hf_config() + # any necessary conversion or processing by FlexFlow happens here + +def upload_processed_model_to_hub(new_model_id, cache_folder, private): + print(f"Uploading processed model to Hugging Face Hub: {new_model_id}") + api = HfApi() + if not HfFolder.get_token(): + print("Hugging Face token not found. 
Please login using `huggingface-cli login`.") + return + api.create_repo(repo_id=new_model_id, private=private, exist_ok=True) + api.upload_folder(folder_path=cache_folder, repo_id=new_model_id) + print("Upload completed successfully.") + +def main(): + args = parse_args() + download_and_process_model(args.model_name, args.cache_folder, args.refresh_cache, args.full_precision) + upload_processed_model_to_hub(args.new_model_id, args.cache_folder, args.private) + +if __name__ == "__main__": + main() + + +# python download_upload_hf.py facebook/opt-125m --new-model-id username/modelname --cache-folder ./model_cache --private \ No newline at end of file From 03498381e99ad028943983177082d711b40c5e00 Mon Sep 17 00:00:00 2001 From: april-yyt Date: Tue, 13 Feb 2024 12:50:42 +0000 Subject: [PATCH 21/55] upload peft model --- inference/utils/download_upload_peft.py | 45 +++++++++++++++++++++++++ 1 file changed, 45 insertions(+) create mode 100644 inference/utils/download_upload_peft.py diff --git a/inference/utils/download_upload_peft.py b/inference/utils/download_upload_peft.py new file mode 100644 index 0000000000..85d79f7f2a --- /dev/null +++ b/inference/utils/download_upload_peft.py @@ -0,0 +1,45 @@ +#!/usr/bin/env python +import argparse +from huggingface_hub import HfApi, HfFolder +import flexflow.serve as ff + +def parse_args(): + parser = argparse.ArgumentParser(description="Download a PEFT model with FlexFlow, process it, and upload it to the Hugging Face Hub.") + parser.add_argument("peft_model_id", type=str, help="Original Hugging Face PEFT model ID to download and process (e.g., 'username/peft-model').") + parser.add_argument("--new-model-id", type=str, required=True, help="New Hugging Face Hub model ID for upload (e.g., 'your_username/new-peft-model-name').") + parser.add_argument("--cache-folder", type=str, default="./peft_model_cache", help="Folder to use to store and process the PEFT model(s) assets in FlexFlow format.") + parser.add_argument("--private", action="store_true", help="Whether to upload the processed PEFT model as a private model on Hugging Face Hub.") + parser.add_argument("--refresh-cache", action="store_true", help="Use this flag to force the refresh of the PEFT model(s) weights/cache.") + parser.add_argument("--full-precision", action="store_true", help="Download the full precision version of the weights for the PEFT model.") + return parser.parse_args() + +def download_and_process_peft_model(peft_model_id, cache_folder, refresh_cache, full_precision): + data_type = ff.DataType.DT_FLOAT if full_precision else ff.DataType.DT_HALF + print(f"Downloading and processing PEFT model: {peft_model_id}") + peft = ff.PEFT( + peft_model_id=peft_model_id, + data_type=data_type, + cache_path=cache_folder, + refresh_cache=refresh_cache, + ) + peft.download_hf_weights_if_needed() + peft.download_hf_config() + # any necessary conversion or processing by FlexFlow happens here + +def upload_processed_peft_model_to_hub(new_model_id, cache_folder, private): + print(f"Uploading processed PEFT model to Hugging Face Hub: {new_model_id}") + api = HfApi() + if not HfFolder.get_token(): + print("Hugging Face token not found. 
Please login using `huggingface-cli login`.") + return + api.create_repo(repo_id=new_model_id, private=private, exist_ok=True) + api.upload_folder(folder_path=cache_folder, repo_id=new_model_id) + print("Upload completed successfully.") + +def main(): + args = parse_args() + download_and_process_peft_model(args.peft_model_id, args.cache_folder, args.refresh_cache, args.full_precision) + upload_processed_peft_model_to_hub(args.new_model_id, args.cache_folder, args.private) + +if __name__ == "__main__": + main() From d7a23bbbb2218d209821f75bc5a4f8c3f3d4d084 Mon Sep 17 00:00:00 2001 From: april-yyt Date: Fri, 16 Feb 2024 12:14:39 +0000 Subject: [PATCH 22/55] refactor uploading hf --- inference/utils/download_upload_hf.py | 20 +-- python/flexflow/serve/serve.py | 223 +++++++++++++++++++++++++- 2 files changed, 231 insertions(+), 12 deletions(-) diff --git a/inference/utils/download_upload_hf.py b/inference/utils/download_upload_hf.py index db0b131b8e..83541e7894 100644 --- a/inference/utils/download_upload_hf.py +++ b/inference/utils/download_upload_hf.py @@ -3,6 +3,7 @@ from huggingface_hub import HfApi, HfFolder import flexflow.serve as ff + def parse_args(): parser = argparse.ArgumentParser(description="Download a model with FlexFlow, process it, and upload it to the Hugging Face Hub.") parser.add_argument("model_name", type=str, help="Original Hugging Face model ID to download and process (e.g., 'facebook/opt-125m').") @@ -13,6 +14,7 @@ def parse_args(): parser.add_argument("--full-precision", action="store_true", help="Download the full precision version of the weights.") return parser.parse_args() + def download_and_process_model(model_name, cache_folder, refresh_cache, full_precision): data_type = ff.DataType.DT_FLOAT if full_precision else ff.DataType.DT_HALF print(f"Downloading and processing model: {model_name}") @@ -25,22 +27,20 @@ def download_and_process_model(model_name, cache_folder, refresh_cache, full_pre llm.download_hf_weights_if_needed() llm.download_hf_tokenizer_if_needed() llm.download_hf_config() - # any necessary conversion or processing by FlexFlow happens here + return llm + -def upload_processed_model_to_hub(new_model_id, cache_folder, private): +def upload_processed_model_to_hub(llm, new_model_id, cache_folder, private): print(f"Uploading processed model to Hugging Face Hub: {new_model_id}") - api = HfApi() - if not HfFolder.get_token(): - print("Hugging Face token not found. 
Please login using `huggingface-cli login`.") - return - api.create_repo(repo_id=new_model_id, private=private, exist_ok=True) - api.upload_folder(folder_path=cache_folder, repo_id=new_model_id) + llm.upload_hf_model(new_model_id, private=private) print("Upload completed successfully.") + def main(): args = parse_args() - download_and_process_model(args.model_name, args.cache_folder, args.refresh_cache, args.full_precision) - upload_processed_model_to_hub(args.new_model_id, args.cache_folder, args.private) + llm = download_and_process_model(args.model_name, args.cache_folder, args.refresh_cache, args.full_precision) + upload_processed_model_to_hub(llm, args.new_model_id, args.cache_folder, args.private) + if __name__ == "__main__": main() diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py index 538abe3858..787ab15039 100644 --- a/python/flexflow/serve/serve.py +++ b/python/flexflow/serve/serve.py @@ -29,8 +29,8 @@ from flexflow.core import * from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, LlamaTokenizer from peft import PeftModel, PeftConfig -from huggingface_hub import HfApi -import torch, shutil, hashlib, json, gc +from huggingface_hub import HfApi, HfFolder, Repository +import torch, shutil, hashlib, json, gc, os from typing import Union, List @@ -141,6 +141,7 @@ def get_ff_peft_id(self, peft_model_id: str) -> PEFTModelID: ) return peft_dict["ff_peft_model_id"] + def download_hf_config(self): """Save the HuggingFace model configs to a json file. Useful mainly to run the C++ inference code.""" config_dir = os.path.join( @@ -342,6 +343,43 @@ def download_hf_tokenizer_if_needed(self): # Save new revision hash to file with open(ff_revision_file, "w+") as f: f.write(latest_revision) + else: + print(f"Loading '{self.model_name}' tokenizer from the cache...") + + def __load_hf_weights(self): + print("Loading hf weights...") + + self.download_hf_weights_if_needed() + + # Create file data loader, load weights into tensors + model_configs = self.config_class(self.hf_config) + + self.fileloader = FileDataLoader( + self.weights_path, + model_configs.num_attention_heads, + model_configs.num_key_value_heads, + model_configs.hidden_size, + model_configs.hidden_size // model_configs.num_attention_heads, + self.ffconfig.tensor_parallelism_degree, + ) + + self.fileloader.load_weights(self.model.ffmodel, self.data_type) + + def upload_hf_model(self, new_model_id: str, private: bool = False): + """ + Uploads the model weights to the Hugging Face Hub. + + :param repo_id: The repository ID, including the organization/user and model name (e.g., "organization/model_name"). + :param private: Whether to upload the model as a private model. + """ + print(f"Uploading processed model to Hugging Face Hub: {new_model_id}") + if not HfFolder.get_token(): + print("Hugging Face token not found. 
Please login using `huggingface-cli login`.") + return + api = HfApi() + api.create_repo(repo_id=new_model_id, private=private, exist_ok=True) + api.upload_folder(folder_path=self.cache_path, repo_id=new_model_id) + print("Upload completed successfully.") def compile( self, @@ -591,3 +629,184 @@ def compile( model_specific_pipeline_parallelism_degree, ssms, ) + +class PEFT: + """This class creates a PEFT (parameter-efficient transformer) object to be used in concert with a LLM or SSM""" + + def __init__( + self, + peft_model_id: str, + data_type: DataType = DataType.DT_HALF, + cache_path: str = "", + refresh_cache: bool = False, + ): + self.hf_config = PeftConfig.from_pretrained(peft_model_id) + self.peft_model_id = peft_model_id + self.peft_type = self.hf_config.peft_type + if self.peft_type != "LORA": + raise RuntimeError( + f"PEFT type {self.peft_type} not yet supported in FlexFlow" + ) + self.data_type = data_type + assert self.data_type == DataType.DT_HALF or self.data_type == DataType.DT_FLOAT + self.cache_path = cache_path if len(cache_path) > 0 else "~/.cache/flexflow" + self.refresh_cache = refresh_cache + # Base model related + if "base_model_name_or_path" not in self.hf_config.to_dict(): + raise ValueError( + f"PEFT model {peft_model_id} does not have an associated based model" + ) + self.base_model = LLM( + self.hf_config.base_model_name_or_path, data_type, cache_path, refresh_cache + ) + + def download_hf_config(self): + """Save the HuggingFace model configs to a json file. Useful mainly to run the C++ inference code.""" + self.config_dir = os.path.join( + os.path.expanduser(self.cache_path), "configs", self.peft_model_id.lower() + ) + self.config_path = os.path.join(self.config_dir, "config.json") + os.makedirs(self.config_dir, exist_ok=True) + print(f"Creating directory {self.config_dir} (if it doesn't exist)...") + print(f"Saving {self.peft_model_id} configs to file {self.config_path}...") + with open(self.config_path, "w") as json_file: + class SetEncoder(json.JSONEncoder): + def default(self, obj): + if isinstance(obj, set): + return list(obj) + return super().default(obj) + json.dump(self.hf_config.to_dict(), json_file, indent=2, cls=SetEncoder) + + def __get_revision_hashes(self, peft_model_id: str): + ff_revision = None + ff_revision_file = os.path.join(self.weights_path, "rev_sha.txt") + if os.path.exists(ff_revision_file): + ff_revision = "".join(open(ff_revision_file).read().split()) + + if os.path.exists(peft_model_id) and os.path.isdir(peft_model_id): + # Local model + files = os.listdir(peft_model_id) + state = files + [ + os.path.getmtime(os.path.join(peft_model_id, f)) for f in files + ] + latest_revision = hashlib.md5(str(state).encode("utf-8")).hexdigest() + else: + # Remote HuggingFace model + hf_api = HfApi() + latest_revision = hf_api.model_info(self.peft_model_id).sha + return ff_revision, ff_revision_file, latest_revision + + def convert_peft_model(self, hf_peft_model, weights_path): + for name, params in hf_peft_model.named_parameters(): + if self.peft_type.lower() in name: + name = name.replace("base_model.model.model.", "").replace( + ".default", "" + ) + name = self.base_model.model_class.convert_hf_weight_name(name) + params.detach().cpu().numpy().tofile(f"{weights_path}/{name}") + + def download_hf_weights_if_needed(self): + """Check in the folder specified by the cache_path whether the PEFT's model weights are available and up to date. + If not, or if the refresh_cache parameter is set to True, download new weights. 
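The freshness check that drives this method (and that produces the rev_sha.txt file every loader skips) boils down to: hash a local model folder by its file names and modification times, otherwise ask the Hub for the latest commit sha, and compare against the cached value. A standalone sketch with placeholder ids and paths:

import hashlib, os
from huggingface_hub import HfApi

def latest_revision(model_id: str) -> str:
    if os.path.isdir(model_id):
        # Local model: derive a pseudo-revision from file names and mtimes.
        files = os.listdir(model_id)
        state = files + [os.path.getmtime(os.path.join(model_id, f)) for f in files]
        return hashlib.md5(str(state).encode("utf-8")).hexdigest()
    # Remote model: use the Hub's commit sha.
    return HfApi().model_info(model_id).sha

def needs_refresh(model_id: str, weights_path: str) -> bool:
    rev_file = os.path.join(weights_path, "rev_sha.txt")
    cached = open(rev_file).read().strip() if os.path.exists(rev_file) else None
    return cached != latest_revision(model_id)

# needs_refresh("username/peft-model", "./peft_model_cache/weights/username/peft-model/full-precision")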
+ """ + # Use local cache, or download new version + self.weights_path = os.path.join( + os.path.expanduser(self.cache_path), + "weights", + self.peft_model_id.lower(), + "full-precision" + if self.data_type == DataType.DT_FLOAT + else "half-precision", + ) + if self.refresh_cache: + print( + f"Refreshing weights in cache for model {self.peft_model_id} at path {self.weights_path} ..." + ) + if os.path.exists(self.weights_path): + shutil.rmtree(self.weights_path) + os.makedirs(self.weights_path, exist_ok=True) + print(f"Creating directory {self.weights_path} (if it doesn't exist)...") + + ff_revision, ff_revision_file, latest_revision = self.__get_revision_hashes( + self.peft_model_id + ) + + # Download if needed + if ff_revision != latest_revision: + if not os.path.exists(self.peft_model_id) or os.path.isdir( + self.peft_model_id + ): + # Local model + print( + f"'{self.peft_model_id}' model weights not found in cache or outdated. Downloading from huggingface.co ..." + ) + else: + # Remote model + print( + f"'{self.peft_model_id}' local model weights were updated! Converting new weights now..." + ) + # Download base model from HuggingFace, or load it from the local folder + self.base_model.download_hf_weights_if_needed() + self.base_model.download_hf_tokenizer_if_needed() + self.base_model.download_hf_config() + hf_base_model = AutoModelForCausalLM.from_pretrained( + self.hf_config.base_model_name_or_path, + return_dict=True, + trust_remote_code=True, + torch_dtype=torch.float32 + if self.data_type == DataType.DT_FLOAT + else torch.float16, + # device_map="auto", + ) + hf_peft_model = PeftModel.from_pretrained(hf_base_model, self.peft_model_id) + # Print log message to notify user download of model has finished + if not os.path.exists(self.peft_model_id) or os.path.isdir( + self.peft_model_id + ): + print("Done downloading HF weights. Converting them now...") + # Convert the model to FlexFlow format + self.convert_peft_model(hf_peft_model, self.weights_path) + # Save new revision hash to file + with open(ff_revision_file, "w+") as f: + f.write(latest_revision) + print("Done converting the weights...") + # Deallocate hf model + del hf_peft_model + del hf_base_model + gc.collect() + torch.cuda.empty_cache() + else: + print(f"Loading '{self.peft_model_id}' model weights from the cache...") + + def upload_model_to_hf(self, model_directory: str, model_id: str, private: bool = False): + """ + Uploads the model from the specified directory to the Hugging Face Hub. + + Args: + - model_directory (str): The directory where the model and its configuration are stored. + - model_id (str): The desired model ID on the Hugging Face Hub (e.g., "username/model_name"). + - private (bool): If True, the model will be uploaded as a private model. + """ + try: + # Check for Hugging Face CLI authentication + if not HfFolder.get_token(): + raise ValueError("Hugging Face token not found. Please log in using `huggingface-cli login`.") + + # Ensure the specified directory contains model files + if not os.listdir(model_directory): + raise FileNotFoundError(f"No files found in {model_directory}. 
Please check the path and try again.") + + # Create or get the repository + repo_url = HfApi().create_repo(name=model_id, private=private, exist_ok=True, use_auth_token=True) + print(f"Repository URL: {repo_url}") + + # Initialize the repository, add files, commit, and push + repo = Repository(local_dir=model_directory, clone_from=repo_url, use_auth_token=True) + repo.git_add() + repo.git_commit("Upload model to Hugging Face Hub") + repo.git_push() + + print(f"Model '{model_id}' successfully uploaded to the Hugging Face Hub.") + except Exception as e: + print(f"Failed to upload the model: {e}") + \ No newline at end of file From 08cf15e2d88179d3bb8ec7f02c740f3057b9e7c2 Mon Sep 17 00:00:00 2001 From: april-yyt Date: Fri, 16 Feb 2024 12:31:23 +0000 Subject: [PATCH 23/55] refactor uploading peft --- inference/utils/download_upload_hf.py | 2 +- inference/utils/download_upload_peft.py | 20 +++++++-------- python/flexflow/serve/serve.py | 33 +++++++++++++++++++++---- 3 files changed, 38 insertions(+), 17 deletions(-) diff --git a/inference/utils/download_upload_hf.py b/inference/utils/download_upload_hf.py index 83541e7894..c400355c0a 100644 --- a/inference/utils/download_upload_hf.py +++ b/inference/utils/download_upload_hf.py @@ -32,7 +32,7 @@ def download_and_process_model(model_name, cache_folder, refresh_cache, full_pre def upload_processed_model_to_hub(llm, new_model_id, cache_folder, private): print(f"Uploading processed model to Hugging Face Hub: {new_model_id}") - llm.upload_hf_model(new_model_id, private=private) + llm.upload_hf_model(new_model_id, cache_folder, private=private) print("Upload completed successfully.") diff --git a/inference/utils/download_upload_peft.py b/inference/utils/download_upload_peft.py index 85d79f7f2a..6e174eacf7 100644 --- a/inference/utils/download_upload_peft.py +++ b/inference/utils/download_upload_peft.py @@ -25,21 +25,19 @@ def download_and_process_peft_model(peft_model_id, cache_folder, refresh_cache, peft.download_hf_weights_if_needed() peft.download_hf_config() # any necessary conversion or processing by FlexFlow happens here - -def upload_processed_peft_model_to_hub(new_model_id, cache_folder, private): - print(f"Uploading processed PEFT model to Hugging Face Hub: {new_model_id}") - api = HfApi() - if not HfFolder.get_token(): - print("Hugging Face token not found. 
Please login using `huggingface-cli login`.") - return - api.create_repo(repo_id=new_model_id, private=private, exist_ok=True) - api.upload_folder(folder_path=cache_folder, repo_id=new_model_id) + return peft + + +def upload_peft_model_to_hub(peft, new_model_id, cache_folder, private): + print(f"Uploading peft model to HuggingFace Hub: {new_model_id}") + peft.upload_hf_model(new_model_id, cache_folder, private=private) print("Upload completed successfully.") + def main(): args = parse_args() - download_and_process_peft_model(args.peft_model_id, args.cache_folder, args.refresh_cache, args.full_precision) - upload_processed_peft_model_to_hub(args.new_model_id, args.cache_folder, args.private) + peft = download_and_process_peft_model(args.peft_model_id, args.cache_folder, args.refresh_cache, args.full_precision) + upload_peft_model_to_hub(peft, args.new_model_id, args.cache_folder, args.private) if __name__ == "__main__": main() diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py index 787ab15039..a399b0c4eb 100644 --- a/python/flexflow/serve/serve.py +++ b/python/flexflow/serve/serve.py @@ -365,7 +365,7 @@ def __load_hf_weights(self): self.fileloader.load_weights(self.model.ffmodel, self.data_type) - def upload_hf_model(self, new_model_id: str, private: bool = False): + def upload_hf_model(self, new_model_id: str, model_path:str, private: bool = False): """ Uploads the model weights to the Hugging Face Hub. @@ -379,6 +379,7 @@ def upload_hf_model(self, new_model_id: str, private: bool = False): api = HfApi() api.create_repo(repo_id=new_model_id, private=private, exist_ok=True) api.upload_folder(folder_path=self.cache_path, repo_id=new_model_id) + # api.upload_folder(folder_path=model_path, repo_id=new_model_id) print("Upload completed successfully.") def compile( @@ -778,15 +779,15 @@ def download_hf_weights_if_needed(self): else: print(f"Loading '{self.peft_model_id}' model weights from the cache...") - def upload_model_to_hf(self, model_directory: str, model_id: str, private: bool = False): + def process_and_upload_hf_model(self, model_id: str, private: bool = False): """ - Uploads the model from the specified directory to the Hugging Face Hub. + Processes the PEFT model and uploads it to the Hugging Face Hub. Args: - - model_directory (str): The directory where the model and its configuration are stored. - model_id (str): The desired model ID on the Hugging Face Hub (e.g., "username/model_name"). - private (bool): If True, the model will be uploaded as a private model. """ +<<<<<<< HEAD try: # Check for Hugging Face CLI authentication if not HfFolder.get_token(): @@ -809,4 +810,26 @@ def upload_model_to_hf(self, model_directory: str, model_id: str, private: bool print(f"Model '{model_id}' successfully uploaded to the Hugging Face Hub.") except Exception as e: print(f"Failed to upload the model: {e}") - \ No newline at end of file + +======= + self.download_hf_weights_if_needed() + model_directory = self.weights_path + self.upload_model_to_hf(model_directory, model_id, private) + + def upload_hf_model(self, new_model_id: str, model_path:str, private: bool = False): + """ + Uploads the processed PEFT model to the Hugging Face Hub. + + :param new_model_id: The new repository ID on Hugging Face Hub, including the organization/user and model name (e.g., "your_username/new-peft-model-name"). + :param private: Whether to upload the model as a private model on Hugging Face Hub. 
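Stripped of the class plumbing, the upload path used throughout these patches is a short sequence of Hub calls: check the token, create the repository if needed, then push a folder. A minimal sketch; the repo id and folder are placeholders:

from huggingface_hub import HfApi, HfFolder

def push_folder(repo_id: str, folder: str, private: bool = False) -> None:
    if not HfFolder.get_token():
        raise RuntimeError("Not logged in; run `huggingface-cli login` first.")
    api = HfApi()
    api.create_repo(repo_id=repo_id, private=private, exist_ok=True)
    api.upload_folder(folder_path=folder, repo_id=repo_id)

# push_folder("your_username/new-peft-model-name", "./peft_model_cache", private=True)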
+ """ + print(f"Uploading processed PEFT model to Hugging Face Hub: {new_model_id}") + if not HfFolder.get_token(): + print("Hugging Face token not found. Please login using `huggingface-cli login`.") + return + api = HfApi() + api.create_repo(repo_id=new_model_id, private=private, exist_ok=True) + api.upload_folder(folder_path=self.cache_path, repo_id=new_model_id) + # api.upload_folder(folder_path=model_path, repo_id=new_model_id) + print("Upload completed successfully.") +>>>>>>> 4b760ac8 (refactor uploading peft) From 6b476a6d442208180947c7eab1f7d3fbfc37a7a7 Mon Sep 17 00:00:00 2001 From: april-yyt Date: Mon, 19 Feb 2024 13:55:57 +0000 Subject: [PATCH 24/55] modify upload logic and add reconvert functions for opt models --- python/flexflow/serve/models/opt.py | 58 +++++++++++++++++++++++++++++ python/flexflow/serve/serve.py | 34 ++++++++++++++--- 2 files changed, 87 insertions(+), 5 deletions(-) diff --git a/python/flexflow/serve/models/opt.py b/python/flexflow/serve/models/opt.py index cde25f2241..16cc5ec27e 100644 --- a/python/flexflow/serve/models/opt.py +++ b/python/flexflow/serve/models/opt.py @@ -15,6 +15,8 @@ from flexflow.core import * from .base import FlexFlowModel import random, shutil +import re +import torch class OPTConfig: @@ -299,3 +301,59 @@ def convert_hf_model(model, dst_folder): os.path.join(dst_folder, "embed_tokens.weight"), os.path.join(dst_folder, "lm_head.weight"), ) + + def convert_ff_weight_name(name): + # Reverse the previous conversion rules + converted_name = ( + name.replace("wq", "q_proj") + .replace("wk", "k_proj") + .replace("wv", "v_proj") + .replace("wo", "out_proj") + .replace("attention", "self_attn") + .replace("add_bias_residual_layer_norm_attn_bias", "attention_wo_bias") + .replace("_add_bias_residual_layer_norm", "_final_layer_norm") + .replace("_bias", ".bias") + .replace("_weight", ".weight") + .replace("_bias", ".bias") + ) + + converted_name = re.sub(r"layers_(\d+)_", r"layers.\1.", converted_name) + converted_name = re.sub(r"_(bias|weight)$", r".\1", converted_name) + converted_name = re.sub(r"self_attn_(?!layer_norm)", "self_attn.", converted_name) + + # Prepend "model.decoder." to the weight name + converted_name = "model.decoder." + converted_name + + return converted_name + + + def load_weights_into_hf_model(model, src_folder): + """ + Load weights from a specified folder and apply them to a Hugging Face model. + + Parameters: + - model: The instance of the Hugging Face model to load the weights into. + - src_folder: The path to the folder containing the weight files. 
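One detail worth noting about the OPT loader below: it reads the cached files as float16 and casts to float32, which matches a half-precision cache but not a full-precision one. A small sketch of deriving the on-disk dtype from the cache layout instead; the paths are illustrative, and the assumption is that weight folders end in "full-precision" or "half-precision" as in serve.py:

import numpy as np

def cache_dtype(weights_path: str):
    return np.float32 if weights_path.rstrip("/").endswith("full-precision") else np.float16

print(np.dtype(cache_dtype("model_cache/weights/facebook/opt-125m/full-precision")))  # float32
print(np.dtype(cache_dtype("model_cache/weights/facebook/opt-125m/half-precision")))  # float16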
+ """ + for file_name in os.listdir(src_folder): + weight_path = os.path.join(src_folder, file_name) + print("converting weight name: ", weight_path) + original_name = FlexFlowOPT.convert_ff_weight_name(file_name.replace('.bin', '')) + print("original name of the weights is: ", original_name) + + if not os.path.exists(weight_path): + raise FileNotFoundError(f"No weight file found for {file_name}") + + # weight_data = np.fromfile(weight_path, dtype=np.float32) + weight_data = np.fromfile(weight_path, dtype=np.float16).astype(np.float32) + if original_name not in model.state_dict(): + raise KeyError(f"Parameter {original_name} not found in model.") + param = model.state_dict()[original_name] + + if weight_data.size != param.numel(): + raise ValueError(f"Shape mismatch for {original_name}, model expects {param.numel()} elements, got {weight_data.size}") + + weight_tensor = torch.from_numpy(weight_data).reshape(param.shape) + with torch.no_grad(): + # Update the model's state dict directly since param.copy_ doesn't work on tensor slices or elements not in place + model.state_dict()[original_name].copy_(weight_tensor) \ No newline at end of file diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py index a399b0c4eb..edeb09a291 100644 --- a/python/flexflow/serve/serve.py +++ b/python/flexflow/serve/serve.py @@ -367,19 +367,43 @@ def __load_hf_weights(self): def upload_hf_model(self, new_model_id: str, model_path:str, private: bool = False): """ - Uploads the model weights to the Hugging Face Hub. + Uploads the model weights to the Hugging Face Hub, with reverse conversion of weights. - :param repo_id: The repository ID, including the organization/user and model name (e.g., "organization/model_name"). + :param new_model_id: The new model ID for the Hugging Face Hub. + :param model_path: The path where the FlexFlow weights are stored. :param private: Whether to upload the model as a private model. """ - print(f"Uploading processed model to Hugging Face Hub: {new_model_id}") + print(f"Preparing model for upload to Hugging Face Hub: {new_model_id}") + + # Initialize a new Hugging Face model instance + hf_model = AutoModelForCausalLM.from_config(self.hf_config) + weights_path = self.weights_path + + # Load FlexFlow weights into the Hugging Face model instance + try: + self.model_class.load_weights_into_hf_model(hf_model, weights_path) + except Exception as e: + print(f"Error loading weights into model: {e}") + return + + # Save the model with converted weights to a temporary directory + temp_dir = tempfile.mkdtemp() + hf_model.save_pretrained(temp_dir) + + # Ensure Hugging Face CLI is logged in if not HfFolder.get_token(): print("Hugging Face token not found. 
Please login using `huggingface-cli login`.") return + + # Upload the model api = HfApi() + print(f"Uploading processed model to Hugging Face Hub: {new_model_id}") api.create_repo(repo_id=new_model_id, private=private, exist_ok=True) - api.upload_folder(folder_path=self.cache_path, repo_id=new_model_id) - # api.upload_folder(folder_path=model_path, repo_id=new_model_id) + api.upload_folder(folder_path=temp_dir, repo_id=new_model_id) + + # Cleanup temporary directory + shutil.rmtree(temp_dir) + print("Upload completed successfully.") def compile( From 0a1029a850ff73b2b1755c2ed351fd7caeb20c58 Mon Sep 17 00:00:00 2001 From: april-yyt Date: Tue, 20 Feb 2024 00:21:00 +0000 Subject: [PATCH 25/55] fix opt weight name converting issues --- python/flexflow/serve/models/opt.py | 19 +++++++++++-------- python/flexflow/serve/serve.py | 8 ++++++++ 2 files changed, 19 insertions(+), 8 deletions(-) diff --git a/python/flexflow/serve/models/opt.py b/python/flexflow/serve/models/opt.py index 16cc5ec27e..982ac156d1 100644 --- a/python/flexflow/serve/models/opt.py +++ b/python/flexflow/serve/models/opt.py @@ -305,21 +305,20 @@ def convert_hf_model(model, dst_folder): def convert_ff_weight_name(name): # Reverse the previous conversion rules converted_name = ( - name.replace("wq", "q_proj") + name + .replace("add_bias_residual_layer_norm_attn_bias", "attention_wo_bias") + .replace("_add_bias_residual_layer_norm", "_final_layer_norm") + .replace("wq", "q_proj") .replace("wk", "k_proj") .replace("wv", "v_proj") .replace("wo", "out_proj") .replace("attention", "self_attn") - .replace("add_bias_residual_layer_norm_attn_bias", "attention_wo_bias") - .replace("_add_bias_residual_layer_norm", "_final_layer_norm") - .replace("_bias", ".bias") - .replace("_weight", ".weight") - .replace("_bias", ".bias") ) converted_name = re.sub(r"layers_(\d+)_", r"layers.\1.", converted_name) converted_name = re.sub(r"_(bias|weight)$", r".\1", converted_name) converted_name = re.sub(r"self_attn_(?!layer_norm)", "self_attn.", converted_name) + converted_name = converted_name.replace("embed_tokens_weight_lm_head", "embed_tokens.weight") # Prepend "model.decoder." to the weight name converted_name = "model.decoder." + converted_name @@ -338,8 +337,12 @@ def load_weights_into_hf_model(model, src_folder): for file_name in os.listdir(src_folder): weight_path = os.path.join(src_folder, file_name) print("converting weight name: ", weight_path) - original_name = FlexFlowOPT.convert_ff_weight_name(file_name.replace('.bin', '')) - print("original name of the weights is: ", original_name) + if weight_path.endswith("rev_sha.txt"): + print("skipping rev_sha.txt") + continue + else: + original_name = FlexFlowOPT.convert_ff_weight_name(file_name.replace('.bin', '')) + print("original name of the weights is: ", original_name) if not os.path.exists(weight_path): raise FileNotFoundError(f"No weight file found for {file_name}") diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py index edeb09a291..1d258307a7 100644 --- a/python/flexflow/serve/serve.py +++ b/python/flexflow/serve/serve.py @@ -32,6 +32,8 @@ from huggingface_hub import HfApi, HfFolder, Repository import torch, shutil, hashlib, json, gc, os from typing import Union, List +import tempfile + class _SupportedModels: @@ -374,6 +376,7 @@ def upload_hf_model(self, new_model_id: str, model_path:str, private: bool = Fal :param private: Whether to upload the model as a private model. 
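With the replacement order fixed above, the OPT reconversion can be sanity-checked on concrete names. A self-contained sketch mirroring those rules; the input file names are hypothetical examples:

import re

def ff_to_hf_opt_name(name: str) -> str:
    name = (name
        .replace("add_bias_residual_layer_norm_attn_bias", "attention_wo_bias")
        .replace("_add_bias_residual_layer_norm", "_final_layer_norm")
        .replace("wq", "q_proj").replace("wk", "k_proj").replace("wv", "v_proj")
        .replace("wo", "out_proj").replace("attention", "self_attn"))
    name = re.sub(r"layers_(\d+)_", r"layers.\1.", name)
    name = re.sub(r"_(bias|weight)$", r".\1", name)
    name = re.sub(r"self_attn_(?!layer_norm)", "self_attn.", name)
    name = name.replace("embed_tokens_weight_lm_head", "embed_tokens.weight")
    return "model.decoder." + name

print(ff_to_hf_opt_name("layers_0_attention_wq_weight"))  # model.decoder.layers.0.self_attn.q_proj.weight
print(ff_to_hf_opt_name("embed_tokens_weight_lm_head"))   # model.decoder.embed_tokens.weight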
""" print(f"Preparing model for upload to Hugging Face Hub: {new_model_id}") + print("tokenizer path is: ", self.tokenizer_path) # Initialize a new Hugging Face model instance hf_model = AutoModelForCausalLM.from_config(self.hf_config) @@ -390,6 +393,11 @@ def upload_hf_model(self, new_model_id: str, model_path:str, private: bool = Fal temp_dir = tempfile.mkdtemp() hf_model.save_pretrained(temp_dir) + # Copy the tokenizer files to the temporary directory + tokenizer_files = [f for f in os.listdir(self.tokenizer_path)] + for file_name in tokenizer_files: + shutil.copy(os.path.join(self.tokenizer_path, file_name), temp_dir) + # Ensure Hugging Face CLI is logged in if not HfFolder.get_token(): print("Hugging Face token not found. Please login using `huggingface-cli login`.") From 5df977d57b78c496d6e6ee219b61421262ad316f Mon Sep 17 00:00:00 2001 From: april-yyt Date: Tue, 20 Feb 2024 13:12:24 +0000 Subject: [PATCH 26/55] implement revert functions for falcon models --- python/flexflow/serve/models/falcon.py | 75 ++++++++++++++++++++++++++ 1 file changed, 75 insertions(+) diff --git a/python/flexflow/serve/models/falcon.py b/python/flexflow/serve/models/falcon.py index 0176a1dda1..1d367d43de 100644 --- a/python/flexflow/serve/models/falcon.py +++ b/python/flexflow/serve/models/falcon.py @@ -277,3 +277,78 @@ def convert_hf_model(model, dst_folder): model.lm_head.weight.detach().cpu().numpy().tofile( os.path.join(dst_folder, "lm_head.weight") ) + + + def convert_ff_weight_name(name): + + converted_name = name + converted_name = converted_name.replace("mlp_dense_h_to_4h", "mlp.dense_h_to_4h") + converted_name = converted_name.replace("mlp_dense_4h_to_h", "mlp.dense_4h_to_h") + converted_name = converted_name.replace("attention_wo", "self_attention.dense") + + converted_name = re.sub(r"layers_(\d+)_", r"transformer.h.\1.", converted_name) + converted_name = re.sub(r"_(bias|weight)$", r".\1", converted_name) + + return converted_name + + + def load_weights_into_hf_model(model, src_folder): + """ + Load weights from a specified folder and apply them to a Hugging Face model. + + Parameters: + - model: The instance of the Hugging Face model to load the weights into. + - src_folder: The path to the folder containing the weight files. + - config: The configuration object for the model. 
+ """ + # Dictionary to hold the combined QKV weights + qkv_weights = {} + + for file_name in os.listdir(src_folder): + weight_path = os.path.join(src_folder, file_name) + print("converting weight file: ", weight_path) + original_name = FlexFlowFalcon.convert_ff_weight_name(file_name.replace('.bin', '')) + print("weight name after conversion: ", original_name) + + if not os.path.exists(weight_path): + raise FileNotFoundError(f"No weight file found for {file_name}") + + weight_data = np.fromfile(weight_path, dtype=np.float16).astype(np.float32) + + # Check if this is a Q, K, or V weight and combine them + if "attention_w" in original_name: + # Extract the type (Q, K, or V) and the layer number from the file name + qkv_type = re.search("(wq|wk|wv)", file_name).group(0) + layer_num = re.search("transformer.h.(\d+)", file_name).group(1) + + # Initialize the combined QKV weight if it doesn't exist + if layer_num not in qkv_weights: + qkv_weights[layer_num] = np.zeros((3 * model.config.hidden_size, model.config.hidden_size)) + + # Determine the position to place this weight in the combined QKV weight + type_index = {"wq": 0, "wk": 1, "wv": 2}[qkv_type] + qkv_weights[layer_num][type_index * model.config.hidden_size : (type_index + 1) * model.config.hidden_size] = weight_data + + elif original_name not in model.state_dict(): + raise KeyError(f"Parameter {original_name} not found in model.") + else: + param = model.state_dict()[original_name] + if weight_data.size != param.numel(): + raise ValueError(f"Shape mismatch for {original_name}, model expects {param.numel()} elements, got {weight_data.size}") + + weight_tensor = torch.from_numpy(weight_data).reshape(param.shape) + with torch.no_grad(): + param.copy_(weight_tensor) + + # assign the combined QKV weights to the model + for layer_num, combined_weight_data in qkv_weights.items(): + original_name = f"transformer.h.{layer_num}.self_attention.query_key_value.weight" + + if original_name not in model.state_dict(): + raise KeyError(f"Parameter {original_name} not found in model.") + + param = model.state_dict()[original_name] + combined_weight_tensor = torch.from_numpy(combined_weight_data).view(param.shape) + + with torch.no_grad(): + param.copy_(combined_weight_tensor) From dbbf1cd28dd88a542bc6fcf6bbcdd72411a22f18 Mon Sep 17 00:00:00 2001 From: april-yyt Date: Wed, 21 Feb 2024 12:04:55 +0000 Subject: [PATCH 27/55] upload method for peft class and falcon fixes --- python/flexflow/serve/models/falcon.py | 96 ++++++++++++++++---------- python/flexflow/serve/serve.py | 38 ++++++++-- 2 files changed, 92 insertions(+), 42 deletions(-) diff --git a/python/flexflow/serve/models/falcon.py b/python/flexflow/serve/models/falcon.py index 1d367d43de..2e82179648 100644 --- a/python/flexflow/serve/models/falcon.py +++ b/python/flexflow/serve/models/falcon.py @@ -15,6 +15,7 @@ from flexflow.core import * from .base import FlexFlowModel import random, torch +import re class FalconConfig: @@ -301,54 +302,79 @@ def load_weights_into_hf_model(model, src_folder): - src_folder: The path to the folder containing the weight files. - config: The configuration object for the model. 
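These per-architecture loaders all feed the upload flow added to serve.py earlier in this series: rebuild a Hugging Face model from the cached config, fill it with the re-converted weights, save it together with the tokenizer files to a temporary folder, and push that folder. A condensed sketch; the model id, the paths, and the loader call are placeholders rather than the final API:

import os, shutil, tempfile
from transformers import AutoConfig, AutoModelForCausalLM
from huggingface_hub import HfApi

config = AutoConfig.from_pretrained("facebook/opt-125m")        # the cached HF config in practice
hf_model = AutoModelForCausalLM.from_config(config)
# model_class.load_weights_into_hf_model(hf_model, weights_path)  # per-architecture loader

tmp = tempfile.mkdtemp()
hf_model.save_pretrained(tmp)
# for f in os.listdir(tokenizer_path):
#     shutil.copy(os.path.join(tokenizer_path, f), tmp)
# HfApi().create_repo(repo_id="your_username/new-model-name", private=True, exist_ok=True)
# HfApi().upload_folder(folder_path=tmp, repo_id="your_username/new-model-name")
shutil.rmtree(tmp)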
""" - # Dictionary to hold the combined QKV weights + + + print("Model hidden size:", model.config.hidden_size) + print("Model num_attention_heads:", model.config.num_attention_heads) + + hidden_size = model.config.hidden_size + num_attention_heads = model.config.num_attention_heads + hidden_size_per_head = hidden_size // num_attention_heads + intermediate_size = hidden_size * 4 + qkv_weights = {} for file_name in os.listdir(src_folder): weight_path = os.path.join(src_folder, file_name) print("converting weight file: ", weight_path) original_name = FlexFlowFalcon.convert_ff_weight_name(file_name.replace('.bin', '')) - print("weight name after conversion: ", original_name) + print("weight name after conversion from flexflow: ", original_name) if not os.path.exists(weight_path): raise FileNotFoundError(f"No weight file found for {file_name}") weight_data = np.fromfile(weight_path, dtype=np.float16).astype(np.float32) - # Check if this is a Q, K, or V weight and combine them if "attention_w" in original_name: - # Extract the type (Q, K, or V) and the layer number from the file name - qkv_type = re.search("(wq|wk|wv)", file_name).group(0) - layer_num = re.search("transformer.h.(\d+)", file_name).group(1) - - # Initialize the combined QKV weight if it doesn't exist - if layer_num not in qkv_weights: - qkv_weights[layer_num] = np.zeros((3 * model.config.hidden_size, model.config.hidden_size)) - - # Determine the position to place this weight in the combined QKV weight - type_index = {"wq": 0, "wk": 1, "wv": 2}[qkv_type] - qkv_weights[layer_num][type_index * model.config.hidden_size : (type_index + 1) * model.config.hidden_size] = weight_data + qkv_match = re.search("(wq|wk|wv)", file_name) + qkv_type = qkv_match.group(0) if qkv_match else None + layer_num_match = re.search(r"transformer.h.(\d+)", original_name) + layer_num = int(layer_num_match.group(1)) if layer_num_match else None - elif original_name not in model.state_dict(): - raise KeyError(f"Parameter {original_name} not found in model.") + if layer_num is not None: + if layer_num not in qkv_weights: + # For each layer, initialize space for Q, K, V weights for all heads + # Each head has hidden_size_per_head elements, and there are num_attention_heads heads + # For Q, K, V together, it's 3 * hidden_size_per_head * num_attention_heads + qkv_shape = (3 * hidden_size_per_head * num_attention_heads, hidden_size) + qkv_weights[layer_num] = np.zeros(qkv_shape) + + # Calculate index for Q, K, or V weight segment within the combined QKV weight + type_index = {"wq": 0, "wk": 1, "wv": 2}.get(qkv_type, 0) + offset = type_index * hidden_size_per_head * num_attention_heads + # Reshape the weight data to fit into the combined QKV weight matrix + reshaped_data = weight_data.reshape(-1, hidden_size) + qkv_weights[layer_num][offset:offset+reshaped_data.shape[0], :] = reshaped_data + + elif "mlp.dense_h_to_4h" in original_name or "mlp.dense_4h_to_h" in original_name: + # Handle MLP weights + if "mlp.dense_h_to_4h" in original_name: + total_elements = weight_data.size + output_size = total_elements // hidden_size + expected_shape = (output_size, hidden_size) + elif "mlp.dense_4h_to_h" in original_name: + input_size = weight_data.size // hidden_size + expected_shape = (hidden_size, input_size) + + if weight_data.size == np.prod(expected_shape): + reshaped_weight_data = weight_data.reshape(expected_shape) + if original_name in model.state_dict(): + param = model.state_dict()[original_name] + param.data.copy_(torch.from_numpy(reshaped_weight_data)) + else: + 
raise ValueError(f"Cannot reshape weight {file_name} of size {weight_data.size} into expected shape {expected_shape}.") else: - param = model.state_dict()[original_name] - if weight_data.size != param.numel(): - raise ValueError(f"Shape mismatch for {original_name}, model expects {param.numel()} elements, got {weight_data.size}") - - weight_tensor = torch.from_numpy(weight_data).reshape(param.shape) - with torch.no_grad(): - param.copy_(weight_tensor) + # Handle other weights + if original_name in model.state_dict(): + param = model.state_dict()[original_name] + print("trying to reshape: ", original_name) + reshaped_data = weight_data.reshape(param.shape) + param.data.copy_(torch.from_numpy(reshaped_data)) - # assign the combined QKV weights to the model - for layer_num, combined_weight_data in qkv_weights.items(): - original_name = f"transformer.h.{layer_num}.self_attention.query_key_value.weight" - - if original_name not in model.state_dict(): - raise KeyError(f"Parameter {original_name} not found in model.") - - param = model.state_dict()[original_name] - combined_weight_tensor = torch.from_numpy(combined_weight_data).view(param.shape) - - with torch.no_grad(): - param.copy_(combined_weight_tensor) + # Assign the combined QKV weights to the model, if applicable + for layer_num, weight in qkv_weights.items(): + qkv_name = f"transformer.h.{layer_num}.self_attention.query_key_value.weight" + if qkv_name in model.state_dict(): + param = model.state_dict()[qkv_name] + # Ensure the combined weight is correctly reshaped to fit the model's expectations + param.data.copy_(torch.from_numpy(weight.reshape(param.shape))) \ No newline at end of file diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py index 1d258307a7..e86c456e2a 100644 --- a/python/flexflow/serve/serve.py +++ b/python/flexflow/serve/serve.py @@ -369,7 +369,7 @@ def __load_hf_weights(self): def upload_hf_model(self, new_model_id: str, model_path:str, private: bool = False): """ - Uploads the model weights to the Hugging Face Hub, with reverse conversion of weights. + Uploads the model to the Hugging Face Hub, with reverse conversion of weights. :param new_model_id: The new model ID for the Hugging Face Hub. :param model_path: The path where the FlexFlow weights are stored. @@ -850,18 +850,42 @@ def process_and_upload_hf_model(self, model_id: str, private: bool = False): def upload_hf_model(self, new_model_id: str, model_path:str, private: bool = False): """ - Uploads the processed PEFT model to the Hugging Face Hub. + Uploads the PEFT model to the Hugging Face Hub, with reverse conversion of weights. - :param new_model_id: The new repository ID on Hugging Face Hub, including the organization/user and model name (e.g., "your_username/new-peft-model-name"). - :param private: Whether to upload the model as a private model on Hugging Face Hub. + :param new_model_id: The new model ID for the Hugging Face Hub. + :param model_path: The path where the FlexFlow weights are stored. + :param private: Whether to upload the model as a private model. 
""" - print(f"Uploading processed PEFT model to Hugging Face Hub: {new_model_id}") + print(f"Preparing model for upload to Hugging Face Hub: {new_model_id}") + + # Initialize a new Hugging Face model instance + hf_model = AutoModelForCausalLM.from_config(self.hf_config) + weights_path = self.weights_path + + # Load FlexFlow weights into the Hugging Face model instance + try: + self.model_class.load_weights_into_hf_model(hf_model, weights_path) + except Exception as e: + print(f"Error loading weights into model: {e}") + return + + # Save the model with converted weights to a temporary directory + temp_dir = tempfile.mkdtemp() + hf_model.save_pretrained(temp_dir) + + # Ensure Hugging Face CLI is logged in if not HfFolder.get_token(): print("Hugging Face token not found. Please login using `huggingface-cli login`.") return + + # Upload the model api = HfApi() + print(f"Uploading processed model to Hugging Face Hub: {new_model_id}") api.create_repo(repo_id=new_model_id, private=private, exist_ok=True) - api.upload_folder(folder_path=self.cache_path, repo_id=new_model_id) - # api.upload_folder(folder_path=model_path, repo_id=new_model_id) + api.upload_folder(folder_path=temp_dir, repo_id=new_model_id) + + # Cleanup temporary directory + shutil.rmtree(temp_dir) + print("Upload completed successfully.") >>>>>>> 4b760ac8 (refactor uploading peft) From 1157e1e159e128e1af299c3aad5fc275849cf0aa Mon Sep 17 00:00:00 2001 From: april-yyt Date: Fri, 23 Feb 2024 00:22:50 +0000 Subject: [PATCH 28/55] reconvert functions for llama models --- python/flexflow/serve/models/llama.py | 67 +++++++++++++++++++++++++++ python/flexflow/serve/serve.py | 5 ++ 2 files changed, 72 insertions(+) diff --git a/python/flexflow/serve/models/llama.py b/python/flexflow/serve/models/llama.py index 947878f706..9e32fffdfa 100644 --- a/python/flexflow/serve/models/llama.py +++ b/python/flexflow/serve/models/llama.py @@ -15,6 +15,10 @@ from flexflow.core import * from .base import FlexFlowModel import random +import re +import os +import numpy as np +import torch class LLAMAConfig: @@ -254,3 +258,66 @@ def convert_hf_model(model, dst_folder): for name, params in model.named_parameters(): name = FlexFlowLLAMA.convert_hf_weight_name(name) params.detach().cpu().numpy().tofile(f"{dst_folder}/{name}") + + + def convert_ff_weight_name(name): + converted_name = ( + name + .replace("w1", "gate_proj") + .replace("w2", "down_proj") + .replace("w3", "up_proj") + .replace("wq", "q_proj") + .replace("wk", "k_proj") + .replace("wv", "v_proj") + .replace("wo", "o_proj") + .replace("feed_forward_", "mlp.") + .replace("self_attn", "attention") + .replace("attention_norm", "input_layernorm") + .replace("tok_embeddings", "embed_tokens") + .replace("output", "lm_head") + + ) + + converted_name = re.sub(r"layers_(\d+)_", r"layers.\1.", converted_name) + converted_name = re.sub(r"_(bias|weight)$", r".\1", converted_name) + converted_name = re.sub(r"attention_(?!norm)", "self_attn.", converted_name) + + converted_name = converted_name.replace("ffn_norm", "post_attention_layernorm") + + if "lm_head" not in converted_name: + converted_name = "model." + converted_name + + return converted_name + + + def load_weights_into_hf_model(model, src_folder): + """ + Load weights from a specified folder and apply them to a Hugging Face model. + + Parameters: + - model: The instance of the Hugging Face model to load weights into. + - src_folder: The path to the folder containing the weight files. 
+ """ + for file_name in os.listdir(src_folder): + weight_path = os.path.join(src_folder, file_name) + if weight_path.endswith("rev_sha.txt"): + print("skipping rev_sha.txt") + continue + else: + original_name = FlexFlowLLAMA.convert_ff_weight_name(file_name.replace('.bin', '')) + print(f"Converting weight name: {file_name} to {original_name}") + + if not os.path.exists(weight_path): + raise FileNotFoundError(f"No weight file found for {file_name}") + + weight_data = np.fromfile(weight_path, dtype=np.float16).astype(np.float32) + if original_name not in model.state_dict(): + raise KeyError(f"Parameter {original_name} not found in model.") + param = model.state_dict()[original_name] + + if weight_data.size != param.numel(): + raise ValueError(f"Shape mismatch for {original_name}, model expects {param.numel()} elements, got {weight_data.size}") + + weight_tensor = torch.from_numpy(weight_data).reshape(param.shape) + with torch.no_grad(): + model.state_dict()[original_name].copy_(weight_tensor) diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py index e86c456e2a..0b8a78e94e 100644 --- a/python/flexflow/serve/serve.py +++ b/python/flexflow/serve/serve.py @@ -398,6 +398,11 @@ def upload_hf_model(self, new_model_id: str, model_path:str, private: bool = Fal for file_name in tokenizer_files: shutil.copy(os.path.join(self.tokenizer_path, file_name), temp_dir) + # Delete rev_sha.txt from the temporary directory if it exists + rev_sha_path = os.path.join(temp_dir, 'rev_sha.txt') + if os.path.exists(rev_sha_path): + os.remove(rev_sha_path) + # Ensure Hugging Face CLI is logged in if not HfFolder.get_token(): print("Hugging Face token not found. Please login using `huggingface-cli login`.") From e4ba21227b2a5a282f71e88d72e93d0e3243496c Mon Sep 17 00:00:00 2001 From: april-yyt Date: Sat, 24 Feb 2024 10:48:42 +0000 Subject: [PATCH 29/55] finish weight convert for falcon models --- python/flexflow/serve/models/falcon.py | 109 +++++++++++++------------ 1 file changed, 58 insertions(+), 51 deletions(-) diff --git a/python/flexflow/serve/models/falcon.py b/python/flexflow/serve/models/falcon.py index 2e82179648..7d6862308a 100644 --- a/python/flexflow/serve/models/falcon.py +++ b/python/flexflow/serve/models/falcon.py @@ -286,7 +286,8 @@ def convert_ff_weight_name(name): converted_name = converted_name.replace("mlp_dense_h_to_4h", "mlp.dense_h_to_4h") converted_name = converted_name.replace("mlp_dense_4h_to_h", "mlp.dense_4h_to_h") converted_name = converted_name.replace("attention_wo", "self_attention.dense") - + if name.startswith("ln") or name.startswith("word_embeddings"): + converted_name = "transformer." + converted_name converted_name = re.sub(r"layers_(\d+)_", r"transformer.h.\1.", converted_name) converted_name = re.sub(r"_(bias|weight)$", r".\1", converted_name) @@ -303,78 +304,84 @@ def load_weights_into_hf_model(model, src_folder): - config: The configuration object for the model. 
""" + hidden_size = model.config.hidden_size + n_head = ( + model.config.n_head + if "n_head" in model.config.__dict__ + else model.config.num_attention_heads + ) - print("Model hidden size:", model.config.hidden_size) - print("Model num_attention_heads:", model.config.num_attention_heads) + print("Model hidden size:", hidden_size) + print("Model num_attention_heads:", n_head) - hidden_size = model.config.hidden_size - num_attention_heads = model.config.num_attention_heads - hidden_size_per_head = hidden_size // num_attention_heads + num_attention_heads = n_head + hidden_size_per_head = hidden_size // n_head intermediate_size = hidden_size * 4 qkv_weights = {} for file_name in os.listdir(src_folder): weight_path = os.path.join(src_folder, file_name) - print("converting weight file: ", weight_path) + print("\nProcessing weight file:", weight_path) original_name = FlexFlowFalcon.convert_ff_weight_name(file_name.replace('.bin', '')) - print("weight name after conversion from flexflow: ", original_name) + print("Converted weight name:", original_name) if not os.path.exists(weight_path): raise FileNotFoundError(f"No weight file found for {file_name}") weight_data = np.fromfile(weight_path, dtype=np.float16).astype(np.float32) + print(f"Data type after conversion: {weight_data.dtype}, Size: {weight_data.size}") - if "attention_w" in original_name: + # for q,k,v weights, store in dict + if ("attention_wq" in original_name) or ("attention_wk" in original_name) or ("attention_wv" in original_name): qkv_match = re.search("(wq|wk|wv)", file_name) qkv_type = qkv_match.group(0) if qkv_match else None layer_num_match = re.search(r"transformer.h.(\d+)", original_name) layer_num = int(layer_num_match.group(1)) if layer_num_match else None + print(f"QKV type: {qkv_type}, Layer number: {layer_num}") if layer_num is not None: if layer_num not in qkv_weights: - # For each layer, initialize space for Q, K, V weights for all heads - # Each head has hidden_size_per_head elements, and there are num_attention_heads heads - # For Q, K, V together, it's 3 * hidden_size_per_head * num_attention_heads - qkv_shape = (3 * hidden_size_per_head * num_attention_heads, hidden_size) + # qkv_shape = (hidden_size_per_head * num_attention_heads, hidden_size) + per_type_space = hidden_size_per_head * n_head + + qkv_name = f"transformer.h.{layer_num}.self_attention.query_key_value.weight" + if qkv_name in model.state_dict(): + qkv_param_size = model.state_dict()[qkv_name].shape[0] + qkv_shape = (qkv_param_size, hidden_size) qkv_weights[layer_num] = np.zeros(qkv_shape) - - # Calculate index for Q, K, or V weight segment within the combined QKV weight + print(f"Initialized QKV shape for layer {layer_num}: {qkv_shape}") + type_index = {"wq": 0, "wk": 1, "wv": 2}.get(qkv_type, 0) - offset = type_index * hidden_size_per_head * num_attention_heads - # Reshape the weight data to fit into the combined QKV weight matrix - reshaped_data = weight_data.reshape(-1, hidden_size) - qkv_weights[layer_num][offset:offset+reshaped_data.shape[0], :] = reshaped_data + offset = type_index * per_type_space + print("offset for this weight is: ", offset) + ## dim 0 sizes: + dim_wq = hidden_size + dim_wk = hidden_size // n_head + dim_wv = hidden_size // n_head + print(dim_wq, dim_wk, dim_wv) + + try: + expected_shape = (weight_data.size // hidden_size, hidden_size) + reshaped_data = weight_data.reshape(expected_shape) + print(f"Reshaped QKV weights for {qkv_type} in layer {layer_num} with shape {expected_shape}.") + except ValueError as e: + print(f"Error 
reshaping {qkv_type} weights for layer {layer_num}: {e}") + print(f"Attempting to reshape data of size {weight_data.size} into shape (-1, {hidden_size})") + + + try: + if qkv_type == "wq": + qkv_weights[layer_num][0:dim_wq, :] = reshaped_data + elif qkv_type == "wk": + qkv_weights[layer_num][dim_wq:dim_wk+dim_wq, :] = reshaped_data + else: + qkv_weights[layer_num][dim_wq+dim_wk:, :] = reshaped_data + except ValueError as e: + print(f"Error assigning {qkv_type} weights for layer {layer_num}: {e}") + continue - elif "mlp.dense_h_to_4h" in original_name or "mlp.dense_4h_to_h" in original_name: - # Handle MLP weights - if "mlp.dense_h_to_4h" in original_name: - total_elements = weight_data.size - output_size = total_elements // hidden_size - expected_shape = (output_size, hidden_size) - elif "mlp.dense_4h_to_h" in original_name: - input_size = weight_data.size // hidden_size - expected_shape = (hidden_size, input_size) - - if weight_data.size == np.prod(expected_shape): - reshaped_weight_data = weight_data.reshape(expected_shape) - if original_name in model.state_dict(): - param = model.state_dict()[original_name] - param.data.copy_(torch.from_numpy(reshaped_weight_data)) - else: - raise ValueError(f"Cannot reshape weight {file_name} of size {weight_data.size} into expected shape {expected_shape}.") - else: - # Handle other weights - if original_name in model.state_dict(): - param = model.state_dict()[original_name] - print("trying to reshape: ", original_name) - reshaped_data = weight_data.reshape(param.shape) - param.data.copy_(torch.from_numpy(reshaped_data)) - - # Assign the combined QKV weights to the model, if applicable - for layer_num, weight in qkv_weights.items(): - qkv_name = f"transformer.h.{layer_num}.self_attention.query_key_value.weight" - if qkv_name in model.state_dict(): - param = model.state_dict()[qkv_name] - # Ensure the combined weight is correctly reshaped to fit the model's expectations - param.data.copy_(torch.from_numpy(weight.reshape(param.shape))) \ No newline at end of file + # for weights that are not q,k,v, get the param names + param = model.state_dict().get(original_name, None) + if param is None: + print(f"Warning: {original_name} not found i \ No newline at end of file From edaaecab788b6c55be95f5d5f86167cac1399f01 Mon Sep 17 00:00:00 2001 From: april-yyt Date: Mon, 26 Feb 2024 13:16:10 +0000 Subject: [PATCH 30/55] simplify upload script --- inference/utils/download_upload_hf.py | 29 +++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/inference/utils/download_upload_hf.py b/inference/utils/download_upload_hf.py index c400355c0a..f4e6db7775 100644 --- a/inference/utils/download_upload_hf.py +++ b/inference/utils/download_upload_hf.py @@ -8,7 +8,7 @@ def parse_args(): parser = argparse.ArgumentParser(description="Download a model with FlexFlow, process it, and upload it to the Hugging Face Hub.") parser.add_argument("model_name", type=str, help="Original Hugging Face model ID to download and process (e.g., 'facebook/opt-125m').") parser.add_argument("--new-model-id", type=str, required=True, help="New Hugging Face Hub model ID for upload (e.g., 'your_username/new-model-name').") - parser.add_argument("--cache-folder", type=str, default="./model_cache", help="Folder to use to store and process the model(s) assets in FlexFlow format.") + parser.add_argument("--cache-folder", type=str, default="~/.cache/flexflow", help="Folder to use to store and process the model(s) assets in FlexFlow format.") parser.add_argument("--private", 
action="store_true", help="Whether to upload the processed model as a private model on Hugging Face Hub.") parser.add_argument("--refresh-cache", action="store_true", help="Use this flag to force the refresh of the model(s) weights/tokenizer cache.") parser.add_argument("--full-precision", action="store_true", help="Download the full precision version of the weights.") @@ -37,13 +37,30 @@ def upload_processed_model_to_hub(llm, new_model_id, cache_folder, private): def main(): - args = parse_args() - llm = download_and_process_model(args.model_name, args.cache_folder, args.refresh_cache, args.full_precision) - upload_processed_model_to_hub(llm, args.new_model_id, args.cache_folder, args.private) + model_name = "mosaicml/mpt-7b" + # new_model_id = "your_username/new-model-name" + new_model_id = "aprilyyt/upload-mpt" + cache_folder = "~/.cache/flexflow" + private = True + refresh_cache = False + full_precision = True + data_type = ff.DataType.DT_FLOAT if full_precision else ff.DataType.DT_HALF + print(f"Downloading and processing model: {model_name}") + llm = ff.LLM( + model_name=model_name, + data_type=data_type, + cache_path=cache_folder, + refresh_cache=refresh_cache, + ) + llm.download_hf_weights_if_needed() + llm.download_hf_tokenizer_if_needed() + llm.download_hf_config() + + print(f"Uploading processed model to Hugging Face Hub: {new_model_id}") + llm.upload_hf_model(new_model_id, cache_folder, private=private) + print("Upload completed successfully.") if __name__ == "__main__": main() - -# python download_upload_hf.py facebook/opt-125m --new-model-id username/modelname --cache-folder ./model_cache --private \ No newline at end of file From 4aea5e825594e6e30b6b47767da29cc7c3974f50 Mon Sep 17 00:00:00 2001 From: april-yyt Date: Mon, 26 Feb 2024 13:17:10 +0000 Subject: [PATCH 31/55] fix falcon typo --- python/flexflow/serve/models/falcon.py | 31 +++++++++++++++++--------- 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/python/flexflow/serve/models/falcon.py b/python/flexflow/serve/models/falcon.py index 7d6862308a..90d7fcfb52 100644 --- a/python/flexflow/serve/models/falcon.py +++ b/python/flexflow/serve/models/falcon.py @@ -323,8 +323,12 @@ def load_weights_into_hf_model(model, src_folder): for file_name in os.listdir(src_folder): weight_path = os.path.join(src_folder, file_name) print("\nProcessing weight file:", weight_path) - original_name = FlexFlowFalcon.convert_ff_weight_name(file_name.replace('.bin', '')) - print("Converted weight name:", original_name) + if weight_path.endswith("rev_sha.txt"): + print("skipping rev_sha.txt") + continue + else: + original_name = FlexFlowFalcon.convert_ff_weight_name(file_name.replace('.bin', '')) + print("Converted weight name:", original_name) if not os.path.exists(weight_path): raise FileNotFoundError(f"No weight file found for {file_name}") @@ -342,8 +346,6 @@ def load_weights_into_hf_model(model, src_folder): if layer_num is not None: if layer_num not in qkv_weights: - # qkv_shape = (hidden_size_per_head * num_attention_heads, hidden_size) - per_type_space = hidden_size_per_head * n_head qkv_name = f"transformer.h.{layer_num}.self_attention.query_key_value.weight" if qkv_name in model.state_dict(): @@ -353,13 +355,10 @@ def load_weights_into_hf_model(model, src_folder): print(f"Initialized QKV shape for layer {layer_num}: {qkv_shape}") type_index = {"wq": 0, "wk": 1, "wv": 2}.get(qkv_type, 0) - offset = type_index * per_type_space - print("offset for this weight is: ", offset) ## dim 0 sizes: dim_wq = hidden_size dim_wk = 
hidden_size // n_head dim_wv = hidden_size // n_head - print(dim_wq, dim_wk, dim_wv) try: expected_shape = (weight_data.size // hidden_size, hidden_size) @@ -369,7 +368,6 @@ def load_weights_into_hf_model(model, src_folder): print(f"Error reshaping {qkv_type} weights for layer {layer_num}: {e}") print(f"Attempting to reshape data of size {weight_data.size} into shape (-1, {hidden_size})") - try: if qkv_type == "wq": qkv_weights[layer_num][0:dim_wq, :] = reshaped_data @@ -381,7 +379,20 @@ def load_weights_into_hf_model(model, src_folder): print(f"Error assigning {qkv_type} weights for layer {layer_num}: {e}") continue - # for weights that are not q,k,v, get the param names + # for weights that are not q,k,v, get the param names and assign weights accordingly param = model.state_dict().get(original_name, None) if param is None: - print(f"Warning: {original_name} not found i \ No newline at end of file + print(f"Warning: {original_name} not found in directory") + reshaped_data = weight_data.reshape(param.shape) + param.data.copy_(torch.from_numpy(reshaped_data)) + + # Assign the combined QKV weights to the model + for layer_num, weight in qkv_weights.items(): + qkv_name = f"transformer.h.{layer_num}.self_attention.query_key_value.weight" + if qkv_name in model.state_dict(): + param = model.state_dict()[qkv_name] + # Ensure the combined weight is correctly reshaped to fit the model's expectations + param.data.copy_(torch.from_numpy(weight.reshape(param.shape))) + + + \ No newline at end of file From a67d82433224dd9f0ab6ecb99a2809447b097e2e Mon Sep 17 00:00:00 2001 From: april-yyt Date: Mon, 26 Feb 2024 13:17:47 +0000 Subject: [PATCH 32/55] mpt models, minor errs to be fixed --- python/flexflow/serve/models/mpt.py | 94 ++++++++++++++++++++++++++++- 1 file changed, 93 insertions(+), 1 deletion(-) diff --git a/python/flexflow/serve/models/mpt.py b/python/flexflow/serve/models/mpt.py index 1d1837c478..306fc0222d 100644 --- a/python/flexflow/serve/models/mpt.py +++ b/python/flexflow/serve/models/mpt.py @@ -14,7 +14,8 @@ from flexflow.core import * from .base import FlexFlowModel -import random, torch, shutil +import random, torch, shutil, os, re +import numpy as np class MPTConfig: @@ -284,3 +285,94 @@ def convert_hf_model(model, dst_folder): os.path.join(dst_folder, "wte.weight"), os.path.join(dst_folder, "lm_head.weight"), ) + + + def convert_ff_weight_name(name): + # Reverses the conversion logic for MPT model weights + converted_name = name + if "norm_f" in converted_name or "wte" in converted_name: + converted_name = converted_name.replece("_", ".") + + converted_name = converted_name.replace("attention_wo", "attn.out_proj") + converted_name = converted_name.replace("ffn_", "ffn.") + converted_name = re.sub(r"layers_(\d+)_", r"transformer.blocks.\1.", converted_name) + converted_name = re.sub(r"_(bias|weight)$", r".\1", converted_name) + + return converted_name + + def load_weights_into_hf_model(model, src_folder): + """ + Load weights from a specified folder and apply them to a Hugging Face MPT model. + + Parameters: + - model: The instance of the Hugging Face model to load the weights into. + - src_folder: The path to the folder containing the weight files. 
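        # Illustrative note (not part of the original patch): assuming the
        # convert_ff_weight_name() defined above, a FlexFlow weight file such as
        #   "layers_0_attention_wo_weight" maps to "transformer.blocks.0.attn.out_proj.weight",
        # while the separate wq/wk/wv files are concatenated further down into the
        # model's packed attention Wqkv weight.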
+ """ + + d_model = model.config.d_model + print("dimension of the model is: ", d_model) + + qkv_weights = {} + + for file_name in os.listdir(src_folder): + weight_path = os.path.join(src_folder, file_name) + if weight_path.endswith("rev_sha.txt"): + print("skipping rev_sha.txt") + continue + elif "lm_head" in weight_path: + # todo: double check how to handle lm_head in uploading mpt models + print("skipping lm_head.weight") + continue + else: + original_name = FlexFlowMPT.convert_ff_weight_name(file_name.replace('.bin', '')) + print("\nconverting weights name of: ", file_name, "to ", original_name) + + if not os.path.exists(weight_path): + raise FileNotFoundError(f"No weight file found for {file_name}") + + weight_data = np.fromfile(weight_path, dtype=np.float32) + + # Special handling for combined QKV weights + if ("wq" in file_name) or ("wk" in file_name) or ("wv" in file_name): + layer_num_match = re.search(r"layers\.(\d+)", original_name) + layer_num = int(layer_num_match.group(1)) if layer_num_match else None + qkv_type = original_name.split("_")[-2] + + if layer_num is not None: + qkv_key = f"layers.{layer_num}.attn_Wqkv" + # initialize qkv layer in dict + if qkv_key not in qkv_weights: + qkv_weights[qkv_key] = {'wq': None, 'wk': None, 'wv': None} + print(f"Initialized QKV layer {layer_num}") + # assign weights into dict + qkv_weights[qkv_key][qkv_type] = weight_data + + continue + + # for weights that are not q,k,v, get the param names and assign weights accordingly + param = model.state_dict().get(original_name, None) + if weight_data.size != param.numel(): + raise ValueError(f"Shape mismatch for {original_name}, model expects {param.numel()} elements, got {weight_data.size}") + + weight_tensor = torch.from_numpy(weight_data).reshape(param.shape) + with torch.no_grad(): + model.state_dict()[original_name].copy_(weight_tensor) + + + for qkv_key, weights_dict in qkv_weights.items(): + wq, wk, wv = weights_dict['wq'], weights_dict['wk'], weights_dict['wv'] + if None in (wq, wk, wv): + raise ValueError(f"Missing weights for {qkv_key}") + + combined_qkv = np.concatenate([wq, wk, wv], axis=0) + qkv_name = qkv_key.replace("layers.", "transformer.blocks.")+".weight" + + param_shape = model.state_dict()[qkv_name].shape + combined_qkv_reshaped = combined_qkv.reshape(param_shape) + + model.state_dict()[qkv_name].copy_(torch.from_numpy(combined_qkv_reshaped)) + + print(f"Assigned combined QKV weights to {qkv_key}.") + + + \ No newline at end of file From 6382448903718daf0a244e73cfbd4096de33fff8 Mon Sep 17 00:00:00 2001 From: april-yyt Date: Mon, 26 Feb 2024 13:22:22 +0000 Subject: [PATCH 33/55] starcoder models, minor errs to be fixed --- python/flexflow/serve/models/starcoder.py | 108 ++++++++++++++++++++++ 1 file changed, 108 insertions(+) diff --git a/python/flexflow/serve/models/starcoder.py b/python/flexflow/serve/models/starcoder.py index 80b4be10bb..f642e3dd37 100644 --- a/python/flexflow/serve/models/starcoder.py +++ b/python/flexflow/serve/models/starcoder.py @@ -264,3 +264,111 @@ def convert_hf_model(model, dst_folder): model.lm_head.weight.detach().cpu().numpy().tofile( os.path.join(dst_folder, "lm_head.weight") ) + + + def convert_ff_weight_name(name): + """ + Convert weight names from FlexFlow format back to Hugging Face format. 
+ """ + # Example conversion logic, adjust as needed + if "attention_wq" in name or "attention_wk" in name or "attention_wv" in name: + converted_name = converted_name.replace("attention_wq", "attn.c_attn").replace("attention_wk", "attn.c_attn").replace("attention_wv", "attn.c_attn") + elif "attention_wo" in name: + converted_name = converted_name.replace("attention_wo", "attn.c_proj") + + converted_name = re.sub(r"layers_(\d+)_", r"transformer.h.\1.", converted_name) + + return converted_name + + + def load_weights_into_hf_model(model, src_folder): + """ + Load weights from a specified folder and apply them to a Hugging Face model. + + Parameters: + - model: The instance of the Hugging Face model to load the weights into. + - src_folder: The path to the folder containing the weight files. + """ + + for file_name in os.listdir(src_folder): + weight_path = os.path.join(src_folder, file_name) + if weight_path.endswith("rev_sha.txt"): + print("skipping rev_sha.txt") + continue + else: + original_name = FlexFlowLLAMA.convert_ff_weight_name(file_name.replace('.bin', '')) + print(f"Converting weight name: {file_name} to {original_name}") + + if not os.path.exists(weight_path): + raise FileNotFoundError(f"No weight file found for {file_name}") + + weight_data = np.fromfile(weight_path, dtype=np.float32) + + # Find the parameter in the model + param = model.state_dict().get(original_name) + if param is None: + print(f"Warning: {original_name} not found in model parameters.") + continue + + # Special handling for q, k, v weights + if ("attention_wq" in original_name) or ("attention_wk" in original_name) or ("attention_wv" in original_name): + qkv_match = re.search("(wq|wk|wv)", file_name) + qkv_type = qkv_match.group(0) if qkv_match else None + layer_num_match = re.search(r"transformer.h.(\d+)", original_name) + layer_num = int(layer_num_match.group(1)) if layer_num_match else None + print(f"QKV type: {qkv_type}, Layer number: {layer_num}") + + if layer_num is not None: + if layer_num not in qkv_weights: + + qkv_name = f"transformer.h.{layer_num}.self_attention.query_key_value.weight" + if qkv_name in model.state_dict(): + qkv_param_size = model.state_dict()[qkv_name].shape[0] + qkv_shape = (qkv_param_size, hidden_size) + qkv_weights[layer_num] = np.zeros(qkv_shape) + print(f"Initialized QKV shape for layer {layer_num}: {qkv_shape}") + + type_index = {"wq": 0, "wk": 1, "wv": 2}.get(qkv_type, 0) + ## dim 0 sizes: + dim_wq = hidden_size + dim_wk = hidden_size // n_head + dim_wv = hidden_size // n_head + + try: + expected_shape = (weight_data.size // hidden_size, hidden_size) + reshaped_data = weight_data.reshape(expected_shape) + print(f"Reshaped QKV weights for {qkv_type} in layer {layer_num} with shape {expected_shape}.") + except ValueError as e: + print(f"Error reshaping {qkv_type} weights for layer {layer_num}: {e}") + print(f"Attempting to reshape data of size {weight_data.size} into shape (-1, {hidden_size})") + + try: + if qkv_type == "wq": + qkv_weights[layer_num][0:dim_wq, :] = reshaped_data + elif qkv_type == "wk": + qkv_weights[layer_num][dim_wq:dim_wk+dim_wq, :] = reshaped_data + else: + qkv_weights[layer_num][dim_wq+dim_wk:, :] = reshaped_data + except ValueError as e: + print(f"Error assigning {qkv_type} weights for layer {layer_num}: {e}") + continue + + + # Handle other parameters + param = model.state_dict().get(original_name) + if param is None: + print(f"Warning: {original_name} not found in model parameters.") + continue + reshaped_weight_data = weight_data.reshape(param.shape) + 
param.data.copy_(torch.from_numpy(reshaped_weight_data)) + + + # Assign the combined QKV weights to the model + for layer_num, weight in qkv_weights.items(): + qkv_name = f"transformer.h.{layer_num}.self_attention.query_key_value.weight" + if qkv_name in model.state_dict(): + param = model.state_dict()[qkv_name] + # Ensure the combined weight is correctly reshaped to fit the model's expectations + param.data.copy_(torch.from_numpy(weight.reshape(param.shape))) + + From 614de32f86f58d16f3f7704943752e78e117108f Mon Sep 17 00:00:00 2001 From: april-yyt Date: Tue, 27 Feb 2024 01:32:21 +0000 Subject: [PATCH 34/55] fixed issues with mpt and starcoder models --- python/flexflow/serve/models/mpt.py | 4 +- python/flexflow/serve/models/starcoder.py | 126 +++++++++++----------- 2 files changed, 61 insertions(+), 69 deletions(-) diff --git a/python/flexflow/serve/models/mpt.py b/python/flexflow/serve/models/mpt.py index 306fc0222d..fd81ae6d19 100644 --- a/python/flexflow/serve/models/mpt.py +++ b/python/flexflow/serve/models/mpt.py @@ -291,7 +291,7 @@ def convert_ff_weight_name(name): # Reverses the conversion logic for MPT model weights converted_name = name if "norm_f" in converted_name or "wte" in converted_name: - converted_name = converted_name.replece("_", ".") + converted_name = converted_name.replace("_", ".").replace("norm.f", "norm_f") converted_name = converted_name.replace("attention_wo", "attn.out_proj") converted_name = converted_name.replace("ffn_", "ffn.") @@ -373,6 +373,4 @@ def load_weights_into_hf_model(model, src_folder): model.state_dict()[qkv_name].copy_(torch.from_numpy(combined_qkv_reshaped)) print(f"Assigned combined QKV weights to {qkv_key}.") - - \ No newline at end of file diff --git a/python/flexflow/serve/models/starcoder.py b/python/flexflow/serve/models/starcoder.py index f642e3dd37..11dfc1744e 100644 --- a/python/flexflow/serve/models/starcoder.py +++ b/python/flexflow/serve/models/starcoder.py @@ -14,7 +14,8 @@ from flexflow.core import * from .base import FlexFlowModel -import random, torch +import random, torch, re +import numpy as np class STARCODERConfig: @@ -270,13 +271,16 @@ def convert_ff_weight_name(name): """ Convert weight names from FlexFlow format back to Hugging Face format. """ + converted_name = name # Example conversion logic, adjust as needed - if "attention_wq" in name or "attention_wk" in name or "attention_wv" in name: - converted_name = converted_name.replace("attention_wq", "attn.c_attn").replace("attention_wk", "attn.c_attn").replace("attention_wv", "attn.c_attn") - elif "attention_wo" in name: + if "attention_wo" in name: converted_name = converted_name.replace("attention_wo", "attn.c_proj") + + converted_name = converted_name.replace("mlp_", "mlp.").replace("_ln_f", ".ln_f").replace("_wpe", ".wpe").replace("_wte", ".wte") converted_name = re.sub(r"layers_(\d+)_", r"transformer.h.\1.", converted_name) + converted_name = re.sub(r"_(bias|weight)$", r".\1", converted_name) + return converted_name @@ -289,86 +293,76 @@ def load_weights_into_hf_model(model, src_folder): - model: The instance of the Hugging Face model to load the weights into. - src_folder: The path to the folder containing the weight files. 
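        # Illustrative note (not part of the original patch): assuming the revised
        # convert_ff_weight_name() above, a FlexFlow weight file such as
        #   "layers_0_attention_wo_weight" maps to "transformer.h.0.attn.c_proj.weight",
        # while the wq/wk/wv files are concatenated below into the packed
        # transformer.h.<n>.attn.c_attn weight and bias parameters.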
""" + + hidden_size = model.config.hidden_size + n_head = ( + model.config.n_head + if "n_head" in model.config.__dict__ + else model.config.num_attention_heads + ) + + print("Model hidden size:", hidden_size) + print("Model num_attention_heads:", n_head) + + num_attention_heads = n_head + hidden_size_per_head = hidden_size // n_head + + qkv_weights = {} for file_name in os.listdir(src_folder): weight_path = os.path.join(src_folder, file_name) + print("\nProcessing weight file:", weight_path) if weight_path.endswith("rev_sha.txt"): print("skipping rev_sha.txt") continue else: - original_name = FlexFlowLLAMA.convert_ff_weight_name(file_name.replace('.bin', '')) - print(f"Converting weight name: {file_name} to {original_name}") + original_name = FlexFlowSTARCODER.convert_ff_weight_name(file_name.replace('.bin', '')) + print(f"Converted weight name: {file_name} to {original_name}") if not os.path.exists(weight_path): raise FileNotFoundError(f"No weight file found for {file_name}") weight_data = np.fromfile(weight_path, dtype=np.float32) - - # Find the parameter in the model - param = model.state_dict().get(original_name) - if param is None: - print(f"Warning: {original_name} not found in model parameters.") - continue - - # Special handling for q, k, v weights + print(f"Data type after conversion: {weight_data.dtype}, Size: {weight_data.size}") + + # Special handling for combined QKV weights if ("attention_wq" in original_name) or ("attention_wk" in original_name) or ("attention_wv" in original_name): - qkv_match = re.search("(wq|wk|wv)", file_name) - qkv_type = qkv_match.group(0) if qkv_match else None - layer_num_match = re.search(r"transformer.h.(\d+)", original_name) + weight_bias = ".weight" if ".weight" in original_name else ".bias" + layer_num_match = re.search(r"layers\_(\d+)", file_name) layer_num = int(layer_num_match.group(1)) if layer_num_match else None - print(f"QKV type: {qkv_type}, Layer number: {layer_num}") + qkv_type = file_name.split("_")[-2] + qkv_name = f"transformer.h.{layer_num}.attn.c_attn" + weight_bias if layer_num is not None: - if layer_num not in qkv_weights: - - qkv_name = f"transformer.h.{layer_num}.self_attention.query_key_value.weight" - if qkv_name in model.state_dict(): - qkv_param_size = model.state_dict()[qkv_name].shape[0] - qkv_shape = (qkv_param_size, hidden_size) - qkv_weights[layer_num] = np.zeros(qkv_shape) - print(f"Initialized QKV shape for layer {layer_num}: {qkv_shape}") - - type_index = {"wq": 0, "wk": 1, "wv": 2}.get(qkv_type, 0) - ## dim 0 sizes: - dim_wq = hidden_size - dim_wk = hidden_size // n_head - dim_wv = hidden_size // n_head - - try: - expected_shape = (weight_data.size // hidden_size, hidden_size) - reshaped_data = weight_data.reshape(expected_shape) - print(f"Reshaped QKV weights for {qkv_type} in layer {layer_num} with shape {expected_shape}.") - except ValueError as e: - print(f"Error reshaping {qkv_type} weights for layer {layer_num}: {e}") - print(f"Attempting to reshape data of size {weight_data.size} into shape (-1, {hidden_size})") - - try: - if qkv_type == "wq": - qkv_weights[layer_num][0:dim_wq, :] = reshaped_data - elif qkv_type == "wk": - qkv_weights[layer_num][dim_wq:dim_wk+dim_wq, :] = reshaped_data - else: - qkv_weights[layer_num][dim_wq+dim_wk:, :] = reshaped_data - except ValueError as e: - print(f"Error assigning {qkv_type} weights for layer {layer_num}: {e}") + # initialize qkv layer in dict + if qkv_name not in qkv_weights: + qkv_weights[qkv_name] = {'wq': None, 'wk': None, 'wv': None} + print(f"Initialized QKV 
layer {layer_num}") + # assign weights into dict + qkv_weights[qkv_name][qkv_type] = weight_data + print(f"attached qkv weight {qkv_name}") + continue - # Handle other parameters - param = model.state_dict().get(original_name) - if param is None: - print(f"Warning: {original_name} not found in model parameters.") - continue - reshaped_weight_data = weight_data.reshape(param.shape) - param.data.copy_(torch.from_numpy(reshaped_weight_data)) + # for weights that are not q,k,v, get the param names and assign weights accordingly + param = model.state_dict().get(original_name, None) + if weight_data.size != param.numel(): + raise ValueError(f"Shape mismatch for {original_name}, model expects {param.numel()} elements, got {weight_data.size}") - - # Assign the combined QKV weights to the model - for layer_num, weight in qkv_weights.items(): - qkv_name = f"transformer.h.{layer_num}.self_attention.query_key_value.weight" - if qkv_name in model.state_dict(): - param = model.state_dict()[qkv_name] - # Ensure the combined weight is correctly reshaped to fit the model's expectations - param.data.copy_(torch.from_numpy(weight.reshape(param.shape))) + weight_tensor = torch.from_numpy(weight_data).reshape(param.shape) + with torch.no_grad(): + model.state_dict()[original_name].copy_(weight_tensor) + print(f"Assigned weight {original_name} successfully!") + - + for qkv_name, weights_dict in qkv_weights.items(): + combined_qkv = np.concatenate([qkv_weights[qkv_name]['wq'], qkv_weights[qkv_name]['wk'], qkv_weights[qkv_name]['wv']], axis=0) + param_shape = model.state_dict()[qkv_name].shape + combined_qkv_reshaped = combined_qkv.reshape(param_shape) + print(f"reshaped qkv weights shape is: {combined_qkv_reshaped.shape}") + + model.state_dict()[qkv_name].copy_(torch.from_numpy(combined_qkv_reshaped)) + print(f"Assigned combined QKV weights to {qkv_name}.") + + \ No newline at end of file From 551e1198d167d2c9c4563d5195be62156bc272d6 Mon Sep 17 00:00:00 2001 From: april-yyt Date: Tue, 27 Feb 2024 01:34:15 +0000 Subject: [PATCH 35/55] modify hf uploading script --- inference/utils/download_upload_hf.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/inference/utils/download_upload_hf.py b/inference/utils/download_upload_hf.py index f4e6db7775..c3d8df841c 100644 --- a/inference/utils/download_upload_hf.py +++ b/inference/utils/download_upload_hf.py @@ -37,9 +37,13 @@ def upload_processed_model_to_hub(llm, new_model_id, cache_folder, private): def main(): - model_name = "mosaicml/mpt-7b" + model_name = "bigcode/starcoderbase-1b" + # model_name = "mosaicml/mpt-7b" + # new_model_id = "your_username/new-model-name" - new_model_id = "aprilyyt/upload-mpt" + new_model_id = "aprilyyt/upload-starcoder" + # new_model_id = "aprilyyt/upload-mpt" + cache_folder = "~/.cache/flexflow" private = True refresh_cache = False From 2da5aa1923c0c8ba5c091cf07f69ea53080738b1 Mon Sep 17 00:00:00 2001 From: april-yyt Date: Tue, 27 Feb 2024 01:35:22 +0000 Subject: [PATCH 36/55] modify hf uploading script --- inference/utils/download_upload_peft.py | 35 ++++++++++++++++++++++--- 1 file changed, 31 insertions(+), 4 deletions(-) diff --git a/inference/utils/download_upload_peft.py b/inference/utils/download_upload_peft.py index 6e174eacf7..6870b0e04c 100644 --- a/inference/utils/download_upload_peft.py +++ b/inference/utils/download_upload_peft.py @@ -34,10 +34,37 @@ def upload_peft_model_to_hub(peft, new_model_id, cache_folder, private): print("Upload completed successfully.") +# def main(): +# args = parse_args() +# 
peft = download_and_process_peft_model(args.peft_model_id, args.cache_folder, args.refresh_cache, args.full_precision) +# upload_peft_model_to_hub(peft, args.new_model_id, args.cache_folder, args.private) + +# if __name__ == "__main__": +# main() + + def main(): - args = parse_args() - peft = download_and_process_peft_model(args.peft_model_id, args.cache_folder, args.refresh_cache, args.full_precision) - upload_peft_model_to_hub(peft, args.new_model_id, args.cache_folder, args.private) + model_name = "meta-llama/Llama-2-7b" + new_model_id = "your_username/new-model-name" + cache_folder = "~/.cache/flexflow" + private = True + refresh_cache = False + full_precision = True + + data_type = ff.DataType.DT_FLOAT if full_precision else ff.DataType.DT_HALF + print(f"Downloading and processing peft model: {peft_model_id}") + peft = ff.PEFT( + peft_model_id, + data_type=data_type, + cache_path=args.cache_folder, + refresh_cache=args.refresh_cache, + ) + peft.download_hf_weights_if_needed() + peft.download_hf_config() + + print(f"Uploading processed model to Hugging Face Hub: {peft_model_id}") + peft.upload_hf_model(peft_model_id, cache_folder, private=private) + print("Upload completed successfully.") if __name__ == "__main__": - main() + main() \ No newline at end of file From ef471effa465da2e7d686264ff158c493118f9bb Mon Sep 17 00:00:00 2001 From: april-yyt Date: Tue, 27 Feb 2024 01:37:55 +0000 Subject: [PATCH 37/55] modify hf uploading main --- inference/utils/download_upload_hf.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/inference/utils/download_upload_hf.py b/inference/utils/download_upload_hf.py index c3d8df841c..40551c66b8 100644 --- a/inference/utils/download_upload_hf.py +++ b/inference/utils/download_upload_hf.py @@ -37,13 +37,8 @@ def upload_processed_model_to_hub(llm, new_model_id, cache_folder, private): def main(): - model_name = "bigcode/starcoderbase-1b" - # model_name = "mosaicml/mpt-7b" - - # new_model_id = "your_username/new-model-name" - new_model_id = "aprilyyt/upload-starcoder" - # new_model_id = "aprilyyt/upload-mpt" - + model_name = "meta-llama/Llama-2-7b" + new_model_id = "your_username/new-model-name" cache_folder = "~/.cache/flexflow" private = True refresh_cache = False From edb2238af0ebb6dbc3779caa4dfdf75ca3488b94 Mon Sep 17 00:00:00 2001 From: april-yyt Date: Fri, 1 Mar 2024 16:02:40 +0000 Subject: [PATCH 38/55] add assertion for base model --- python/flexflow/serve/models/base.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/python/flexflow/serve/models/base.py b/python/flexflow/serve/models/base.py index 17bb894250..fe39fd30bf 100644 --- a/python/flexflow/serve/models/base.py +++ b/python/flexflow/serve/models/base.py @@ -37,3 +37,9 @@ def convert_hf_weight_name(name): def convert_hf_model(model, dst_folder): assert False, "Not implemented yet" + + def convert_ff_weight_name(name): + assert False, "Not implemented yet" + + def load_weights_into_hf_model(model, src_folder): + assert False, "Not implemented yet" From 44f43f5adfb72325876d0f8a49c6ba08ffd56d58 Mon Sep 17 00:00:00 2001 From: april-yyt Date: Mon, 19 Feb 2024 13:55:57 +0000 Subject: [PATCH 39/55] rebase on peft --- python/flexflow/serve/serve.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py index 42cee2aa80..2d61461421 100644 --- a/python/flexflow/serve/serve.py +++ b/python/flexflow/serve/serve.py @@ -406,14 +406,21 @@ def __load_hf_weights(self): def upload_hf_model(self, new_model_id: 
str, model_path:str, private: bool = False): """ +<<<<<<< HEAD Uploads the model to the Hugging Face Hub, with reverse conversion of weights. +======= + Uploads the model weights to the Hugging Face Hub, with reverse conversion of weights. +>>>>>>> cdf24eb0 (modify upload logic and add reconvert functions for opt models) :param new_model_id: The new model ID for the Hugging Face Hub. :param model_path: The path where the FlexFlow weights are stored. :param private: Whether to upload the model as a private model. """ print(f"Preparing model for upload to Hugging Face Hub: {new_model_id}") +<<<<<<< HEAD print("tokenizer path is: ", self.tokenizer_path) +======= +>>>>>>> cdf24eb0 (modify upload logic and add reconvert functions for opt models) # Initialize a new Hugging Face model instance hf_model = AutoModelForCausalLM.from_config(self.hf_config) @@ -430,6 +437,7 @@ def upload_hf_model(self, new_model_id: str, model_path:str, private: bool = Fal temp_dir = tempfile.mkdtemp() hf_model.save_pretrained(temp_dir) +<<<<<<< HEAD # Copy the tokenizer files to the temporary directory tokenizer_files = [f for f in os.listdir(self.tokenizer_path)] for file_name in tokenizer_files: @@ -440,6 +448,8 @@ def upload_hf_model(self, new_model_id: str, model_path:str, private: bool = Fal if os.path.exists(rev_sha_path): os.remove(rev_sha_path) +======= +>>>>>>> cdf24eb0 (modify upload logic and add reconvert functions for opt models) # Ensure Hugging Face CLI is logged in if not HfFolder.get_token(): print("Hugging Face token not found. Please login using `huggingface-cli login`.") From f051efab6bc30dabd5e8489e562e6a9fff8c0c32 Mon Sep 17 00:00:00 2001 From: april-yyt Date: Tue, 2 Apr 2024 15:22:00 -0700 Subject: [PATCH 40/55] some fixes --- inference/utils/download_upload_hf.py | 5 +- inference/utils/download_upload_peft.py | 20 ++-- python/flexflow/serve/serve.py | 125 ++++-------------------- 3 files changed, 30 insertions(+), 120 deletions(-) diff --git a/inference/utils/download_upload_hf.py b/inference/utils/download_upload_hf.py index 40551c66b8..0432dd2cd9 100644 --- a/inference/utils/download_upload_hf.py +++ b/inference/utils/download_upload_hf.py @@ -40,9 +40,9 @@ def main(): model_name = "meta-llama/Llama-2-7b" new_model_id = "your_username/new-model-name" cache_folder = "~/.cache/flexflow" - private = True + private = True refresh_cache = False - full_precision = True + full_precision = True data_type = ff.DataType.DT_FLOAT if full_precision else ff.DataType.DT_HALF print(f"Downloading and processing model: {model_name}") @@ -62,4 +62,3 @@ def main(): if __name__ == "__main__": main() - diff --git a/inference/utils/download_upload_peft.py b/inference/utils/download_upload_peft.py index 6870b0e04c..5faf87f359 100644 --- a/inference/utils/download_upload_peft.py +++ b/inference/utils/download_upload_peft.py @@ -26,13 +26,13 @@ def download_and_process_peft_model(peft_model_id, cache_folder, refresh_cache, peft.download_hf_config() # any necessary conversion or processing by FlexFlow happens here return peft - - + + def upload_peft_model_to_hub(peft, new_model_id, cache_folder, private): print(f"Uploading peft model to HuggingFace Hub: {new_model_id}") peft.upload_hf_model(new_model_id, cache_folder, private=private) print("Upload completed successfully.") - + # def main(): # args = parse_args() @@ -47,18 +47,18 @@ def main(): model_name = "meta-llama/Llama-2-7b" new_model_id = "your_username/new-model-name" cache_folder = "~/.cache/flexflow" - private = True + private = True refresh_cache = False - 
full_precision = True + full_precision = True data_type = ff.DataType.DT_FLOAT if full_precision else ff.DataType.DT_HALF print(f"Downloading and processing peft model: {peft_model_id}") peft = ff.PEFT( - peft_model_id, - data_type=data_type, - cache_path=args.cache_folder, - refresh_cache=args.refresh_cache, - ) + peft_model_id, + data_type=data_type, + cache_path=args.cache_folder, + refresh_cache=args.refresh_cache, + ) peft.download_hf_weights_if_needed() peft.download_hf_config() diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py index 2d61461421..c9b7a729e6 100644 --- a/python/flexflow/serve/serve.py +++ b/python/flexflow/serve/serve.py @@ -403,67 +403,57 @@ def __load_hf_weights(self): ) self.fileloader.load_weights(self.model.ffmodel, self.data_type) - + def upload_hf_model(self, new_model_id: str, model_path:str, private: bool = False): """ -<<<<<<< HEAD Uploads the model to the Hugging Face Hub, with reverse conversion of weights. -======= - Uploads the model weights to the Hugging Face Hub, with reverse conversion of weights. ->>>>>>> cdf24eb0 (modify upload logic and add reconvert functions for opt models) - + :param new_model_id: The new model ID for the Hugging Face Hub. :param model_path: The path where the FlexFlow weights are stored. :param private: Whether to upload the model as a private model. """ print(f"Preparing model for upload to Hugging Face Hub: {new_model_id}") -<<<<<<< HEAD print("tokenizer path is: ", self.tokenizer_path) -======= ->>>>>>> cdf24eb0 (modify upload logic and add reconvert functions for opt models) - + # Initialize a new Hugging Face model instance hf_model = AutoModelForCausalLM.from_config(self.hf_config) weights_path = self.weights_path - + # Load FlexFlow weights into the Hugging Face model instance try: self.model_class.load_weights_into_hf_model(hf_model, weights_path) except Exception as e: print(f"Error loading weights into model: {e}") return - + # Save the model with converted weights to a temporary directory temp_dir = tempfile.mkdtemp() hf_model.save_pretrained(temp_dir) - -<<<<<<< HEAD + # Copy the tokenizer files to the temporary directory tokenizer_files = [f for f in os.listdir(self.tokenizer_path)] for file_name in tokenizer_files: shutil.copy(os.path.join(self.tokenizer_path, file_name), temp_dir) - + # Delete rev_sha.txt from the temporary directory if it exists rev_sha_path = os.path.join(temp_dir, 'rev_sha.txt') if os.path.exists(rev_sha_path): os.remove(rev_sha_path) - -======= ->>>>>>> cdf24eb0 (modify upload logic and add reconvert functions for opt models) + # Ensure Hugging Face CLI is logged in if not HfFolder.get_token(): print("Hugging Face token not found. Please login using `huggingface-cli login`.") return - + # Upload the model api = HfApi() print(f"Uploading processed model to Hugging Face Hub: {new_model_id}") api.create_repo(repo_id=new_model_id, private=private, exist_ok=True) api.upload_folder(folder_path=temp_dir, repo_id=new_model_id) - + # Cleanup temporary directory shutil.rmtree(temp_dir) - + print("Upload completed successfully.") def compile( @@ -872,113 +862,34 @@ def upload_hf_model(self, new_model_id: str, model_path:str, private: bool = Fal :param private: Whether to upload the model as a private model. 
""" print(f"Preparing model for upload to Hugging Face Hub: {new_model_id}") - - # Initialize a new Hugging Face model instance - hf_model = AutoModelForCausalLM.from_config(self.hf_config) - weights_path = self.weights_path - - # Load FlexFlow weights into the Hugging Face model instance - try: - self.model_class.load_weights_into_hf_model(hf_model, weights_path) - except Exception as e: - print(f"Error loading weights into model: {e}") - return - - # Save the model with converted weights to a temporary directory - temp_dir = tempfile.mkdtemp() - hf_model.save_pretrained(temp_dir) - - # Ensure Hugging Face CLI is logged in - if not HfFolder.get_token(): - print("Hugging Face token not found. Please login using `huggingface-cli login`.") - return - - # Upload the model - api = HfApi() - print(f"Uploading processed model to Hugging Face Hub: {new_model_id}") - api.create_repo(repo_id=new_model_id, private=private, exist_ok=True) - api.upload_folder(folder_path=temp_dir, repo_id=new_model_id) - - # Cleanup temporary directory - shutil.rmtree(temp_dir) - - print("Upload completed successfully.") - def process_and_upload_hf_model(self, model_id: str, private: bool = False): - """ - Processes the PEFT model and uploads it to the Hugging Face Hub. - - Args: - - model_id (str): The desired model ID on the Hugging Face Hub (e.g., "username/model_name"). - - private (bool): If True, the model will be uploaded as a private model. - """ -<<<<<<< HEAD - try: - # Check for Hugging Face CLI authentication - if not HfFolder.get_token(): - raise ValueError("Hugging Face token not found. Please log in using `huggingface-cli login`.") - - # Ensure the specified directory contains model files - if not os.listdir(model_directory): - raise FileNotFoundError(f"No files found in {model_directory}. Please check the path and try again.") - - # Create or get the repository - repo_url = HfApi().create_repo(name=model_id, private=private, exist_ok=True, use_auth_token=True) - print(f"Repository URL: {repo_url}") - - # Initialize the repository, add files, commit, and push - repo = Repository(local_dir=model_directory, clone_from=repo_url, use_auth_token=True) - repo.git_add() - repo.git_commit("Upload model to Hugging Face Hub") - repo.git_push() - - print(f"Model '{model_id}' successfully uploaded to the Hugging Face Hub.") - except Exception as e: - print(f"Failed to upload the model: {e}") - -======= - self.download_hf_weights_if_needed() - model_directory = self.weights_path - self.upload_model_to_hf(model_directory, model_id, private) - - def upload_hf_model(self, new_model_id: str, model_path:str, private: bool = False): - """ - Uploads the PEFT model to the Hugging Face Hub, with reverse conversion of weights. - - :param new_model_id: The new model ID for the Hugging Face Hub. - :param model_path: The path where the FlexFlow weights are stored. - :param private: Whether to upload the model as a private model. 
- """ - print(f"Preparing model for upload to Hugging Face Hub: {new_model_id}") - # Initialize a new Hugging Face model instance hf_model = AutoModelForCausalLM.from_config(self.hf_config) weights_path = self.weights_path - + # Load FlexFlow weights into the Hugging Face model instance try: self.model_class.load_weights_into_hf_model(hf_model, weights_path) except Exception as e: print(f"Error loading weights into model: {e}") return - + # Save the model with converted weights to a temporary directory temp_dir = tempfile.mkdtemp() hf_model.save_pretrained(temp_dir) - + # Ensure Hugging Face CLI is logged in if not HfFolder.get_token(): print("Hugging Face token not found. Please login using `huggingface-cli login`.") return - + # Upload the model api = HfApi() print(f"Uploading processed model to Hugging Face Hub: {new_model_id}") api.create_repo(repo_id=new_model_id, private=private, exist_ok=True) api.upload_folder(folder_path=temp_dir, repo_id=new_model_id) - + # Cleanup temporary directory shutil.rmtree(temp_dir) - - print("Upload completed successfully.") ->>>>>>> 4b760ac8 (refactor uploading peft) + + print("Upload completed successfully.") \ No newline at end of file From c7e242929c791d50e978871215bf83c119979041 Mon Sep 17 00:00:00 2001 From: April Yang Date: Wed, 3 Apr 2024 18:48:56 +0000 Subject: [PATCH 41/55] fix issues for opt model conversion --- python/flexflow/serve/models/opt.py | 48 ++++++++++++++++++++--------- 1 file changed, 34 insertions(+), 14 deletions(-) diff --git a/python/flexflow/serve/models/opt.py b/python/flexflow/serve/models/opt.py index 982ac156d1..3f2db66c01 100644 --- a/python/flexflow/serve/models/opt.py +++ b/python/flexflow/serve/models/opt.py @@ -306,12 +306,13 @@ def convert_ff_weight_name(name): # Reverse the previous conversion rules converted_name = ( name - .replace("add_bias_residual_layer_norm_attn_bias", "attention_wo_bias") - .replace("_add_bias_residual_layer_norm", "_final_layer_norm") + .replace("add_bias_residual_layer_norm.attn_bias", "attention_wo_bias") + .replace(".add_bias_residual_layer_norm", ".final_layer_norm") .replace("wq", "q_proj") .replace("wk", "k_proj") .replace("wv", "v_proj") .replace("wo", "out_proj") + .replace("self_attn.o_proj", "self_attn.out_proj") .replace("attention", "self_attn") ) @@ -321,7 +322,8 @@ def convert_ff_weight_name(name): converted_name = converted_name.replace("embed_tokens_weight_lm_head", "embed_tokens.weight") # Prepend "model.decoder." to the weight name - converted_name = "model.decoder." + converted_name + if not converted_name.startswith("model.decoder.") and "lm_head" not in converted_name: + converted_name = "model.decoder." + converted_name return converted_name @@ -330,33 +332,51 @@ def load_weights_into_hf_model(model, src_folder): """ Load weights from a specified folder and apply them to a Hugging Face model. + This function iterates through the weight files in the specified folder, + converts the FlexFlow weight names to Hugging Face format, and loads the + weights into the Hugging Face model. It handles special cases like shape + mismatches by adjusting the weights accordingly. + Parameters: - model: The instance of the Hugging Face model to load the weights into. - src_folder: The path to the folder containing the weight files. 
""" + for file_name in os.listdir(src_folder): weight_path = os.path.join(src_folder, file_name) - print("converting weight name: ", weight_path) + print("Converting weight name:", weight_path) + if weight_path.endswith("rev_sha.txt"): - print("skipping rev_sha.txt") + print("Skipping rev_sha.txt") continue - else: - original_name = FlexFlowOPT.convert_ff_weight_name(file_name.replace('.bin', '')) - print("original name of the weights is: ", original_name) - + + original_name = FlexFlowOPT.convert_ff_weight_name(file_name.replace('.bin', '')) + print("Original name of the weights is:", original_name) if not os.path.exists(weight_path): raise FileNotFoundError(f"No weight file found for {file_name}") - # weight_data = np.fromfile(weight_path, dtype=np.float32) weight_data = np.fromfile(weight_path, dtype=np.float16).astype(np.float32) if original_name not in model.state_dict(): raise KeyError(f"Parameter {original_name} not found in model.") param = model.state_dict()[original_name] - if weight_data.size != param.numel(): - raise ValueError(f"Shape mismatch for {original_name}, model expects {param.numel()} elements, got {weight_data.size}") + # Calculate the reshape size automatically based on expected parameter size + expected_numel = param.numel() + if weight_data.size != expected_numel: + print(f"Adjusting shape for {original_name} from {weight_data.size} to {expected_numel}") + # Check if weight_data can be evenly divided by expected_numel + if weight_data.size % expected_numel == 0: + # Determine the reshape size + factor = weight_data.size // expected_numel + # Assume the extra dimension is at the first dimension (e.g., for embedding matrices) + new_shape = (factor, ) + tuple(param.shape) + weight_data_reshaped = weight_data.reshape(new_shape) + # Use only the first part of the reshaped data if it matches the expected size + weight_tensor = torch.from_numpy(weight_data_reshaped[0]) + else: + raise ValueError(f"Cannot adjust shape for {original_name} due to incompatible size.") + else: + weight_tensor = torch.from_numpy(weight_data).reshape(param.shape) - weight_tensor = torch.from_numpy(weight_data).reshape(param.shape) with torch.no_grad(): - # Update the model's state dict directly since param.copy_ doesn't work on tensor slices or elements not in place model.state_dict()[original_name].copy_(weight_tensor) \ No newline at end of file From e3be6b25787be50b5762fb3a92ac74ab236a8dc5 Mon Sep 17 00:00:00 2001 From: April Yang Date: Thu, 4 Apr 2024 19:20:26 +0000 Subject: [PATCH 42/55] fix issues for llama models q --- python/flexflow/serve/models/llama.py | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/python/flexflow/serve/models/llama.py b/python/flexflow/serve/models/llama.py index 9e32fffdfa..a5557ed467 100644 --- a/python/flexflow/serve/models/llama.py +++ b/python/flexflow/serve/models/llama.py @@ -271,7 +271,7 @@ def convert_ff_weight_name(name): .replace("wv", "v_proj") .replace("wo", "o_proj") .replace("feed_forward_", "mlp.") - .replace("self_attn", "attention") + .replace("post_self_attn", "post_attention") .replace("attention_norm", "input_layernorm") .replace("tok_embeddings", "embed_tokens") .replace("output", "lm_head") @@ -280,7 +280,7 @@ def convert_ff_weight_name(name): converted_name = re.sub(r"layers_(\d+)_", r"layers.\1.", converted_name) converted_name = re.sub(r"_(bias|weight)$", r".\1", converted_name) - converted_name = re.sub(r"attention_(?!norm)", "self_attn.", converted_name) + # converted_name = 
re.sub(r"attention_(?!norm)", "self_attn.", converted_name) converted_name = converted_name.replace("ffn_norm", "post_attention_layernorm") @@ -289,7 +289,6 @@ def convert_ff_weight_name(name): return converted_name - def load_weights_into_hf_model(model, src_folder): """ Load weights from a specified folder and apply them to a Hugging Face model. @@ -313,11 +312,23 @@ def load_weights_into_hf_model(model, src_folder): weight_data = np.fromfile(weight_path, dtype=np.float16).astype(np.float32) if original_name not in model.state_dict(): raise KeyError(f"Parameter {original_name} not found in model.") - param = model.state_dict()[original_name] - if weight_data.size != param.numel(): - raise ValueError(f"Shape mismatch for {original_name}, model expects {param.numel()} elements, got {weight_data.size}") + param = model.state_dict()[original_name] + expected_numel = param.numel() + if weight_data.size != expected_numel: + print(f"Adjusting shape for {original_name} from {weight_data.size} to {expected_numel}.") + if weight_data.size % expected_numel == 0: + # If the weight data is an exact multiple of the expected size, + # it's likely that the data includes redundant dimensions. + # We'll reshape it by keeping only the first segment that matches the expected shape. + factor = weight_data.size // expected_numel + new_shape = (factor,) + tuple(param.shape) + weight_data_reshaped = weight_data.reshape(new_shape)[0] # Keep only the first segment + weight_tensor = torch.from_numpy(weight_data_reshaped) + else: + raise ValueError(f"Cannot adjust shape for {original_name} due to incompatible size.") + else: + weight_tensor = torch.from_numpy(weight_data).reshape(param.shape) - weight_tensor = torch.from_numpy(weight_data).reshape(param.shape) with torch.no_grad(): - model.state_dict()[original_name].copy_(weight_tensor) + param.copy_(weight_tensor) \ No newline at end of file From ee41f3a43ba545e906f54969ab4929c33736b197 Mon Sep 17 00:00:00 2001 From: April Yang Date: Thu, 4 Apr 2024 21:21:14 +0000 Subject: [PATCH 43/55] models/starcoder.py --- python/flexflow/serve/models/llama.py | 5 +--- python/flexflow/serve/models/mpt.py | 2 +- python/flexflow/serve/models/opt.py | 2 -- python/flexflow/serve/models/starcoder.py | 31 +++++++++++++---------- python/flexflow/serve/serve.py | 1 + 5 files changed, 20 insertions(+), 21 deletions(-) diff --git a/python/flexflow/serve/models/llama.py b/python/flexflow/serve/models/llama.py index a5557ed467..71818a7984 100644 --- a/python/flexflow/serve/models/llama.py +++ b/python/flexflow/serve/models/llama.py @@ -318,12 +318,9 @@ def load_weights_into_hf_model(model, src_folder): if weight_data.size != expected_numel: print(f"Adjusting shape for {original_name} from {weight_data.size} to {expected_numel}.") if weight_data.size % expected_numel == 0: - # If the weight data is an exact multiple of the expected size, - # it's likely that the data includes redundant dimensions. - # We'll reshape it by keeping only the first segment that matches the expected shape. 
factor = weight_data.size // expected_numel new_shape = (factor,) + tuple(param.shape) - weight_data_reshaped = weight_data.reshape(new_shape)[0] # Keep only the first segment + weight_data_reshaped = weight_data.reshape(new_shape)[0] weight_tensor = torch.from_numpy(weight_data_reshaped) else: raise ValueError(f"Cannot adjust shape for {original_name} due to incompatible size.") diff --git a/python/flexflow/serve/models/mpt.py b/python/flexflow/serve/models/mpt.py index fd81ae6d19..e67ccc42d4 100644 --- a/python/flexflow/serve/models/mpt.py +++ b/python/flexflow/serve/models/mpt.py @@ -295,7 +295,7 @@ def convert_ff_weight_name(name): converted_name = converted_name.replace("attention_wo", "attn.out_proj") converted_name = converted_name.replace("ffn_", "ffn.") - converted_name = re.sub(r"layers_(\d+)_", r"transformer.blocks.\1.", converted_name) + converted_name = re.sub(r"layers.(\d+).", r"transformer.blocks.\1.", converted_name) converted_name = re.sub(r"_(bias|weight)$", r".\1", converted_name) return converted_name diff --git a/python/flexflow/serve/models/opt.py b/python/flexflow/serve/models/opt.py index 3f2db66c01..76a78bb466 100644 --- a/python/flexflow/serve/models/opt.py +++ b/python/flexflow/serve/models/opt.py @@ -368,10 +368,8 @@ def load_weights_into_hf_model(model, src_folder): if weight_data.size % expected_numel == 0: # Determine the reshape size factor = weight_data.size // expected_numel - # Assume the extra dimension is at the first dimension (e.g., for embedding matrices) new_shape = (factor, ) + tuple(param.shape) weight_data_reshaped = weight_data.reshape(new_shape) - # Use only the first part of the reshaped data if it matches the expected size weight_tensor = torch.from_numpy(weight_data_reshaped[0]) else: raise ValueError(f"Cannot adjust shape for {original_name} due to incompatible size.") diff --git a/python/flexflow/serve/models/starcoder.py b/python/flexflow/serve/models/starcoder.py index 11dfc1744e..f55406ee91 100644 --- a/python/flexflow/serve/models/starcoder.py +++ b/python/flexflow/serve/models/starcoder.py @@ -272,13 +272,12 @@ def convert_ff_weight_name(name): Convert weight names from FlexFlow format back to Hugging Face format. 
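        Example (illustrative, following the replacement rules below):
            "layers.0.attn.c_attn.o_proj.weight" -> "transformer.h.0.attn.c_proj.weight"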
""" converted_name = name - # Example conversion logic, adjust as needed - if "attention_wo" in name: - converted_name = converted_name.replace("attention_wo", "attn.c_proj") - - converted_name = converted_name.replace("mlp_", "mlp.").replace("_ln_f", ".ln_f").replace("_wpe", ".wpe").replace("_wte", ".wte") + converted_name = converted_name.replace("attn.c_attn.o_proj", "attn.c_proj") - converted_name = re.sub(r"layers_(\d+)_", r"transformer.h.\1.", converted_name) + converted_name = converted_name.replace("mlp_", "mlp.").replace("_ln_f", ".ln_f").replace("_wpe", ".wpe").replace("_wte", ".wte") + if ("ln_f" in converted_name) or ("wpe" in converted_name) or ("wte" in converted_name): + converted_name = "transformer"+converted_name + converted_name = re.sub(r"layers.(\d+).", r"transformer.h.\1.", converted_name) converted_name = re.sub(r"_(bias|weight)$", r".\1", converted_name) @@ -326,38 +325,42 @@ def load_weights_into_hf_model(model, src_folder): print(f"Data type after conversion: {weight_data.dtype}, Size: {weight_data.size}") # Special handling for combined QKV weights - if ("attention_wq" in original_name) or ("attention_wk" in original_name) or ("attention_wv" in original_name): + if ("q_proj" in original_name) or ("k_proj" in original_name) or ("v_proj" in original_name): weight_bias = ".weight" if ".weight" in original_name else ".bias" - layer_num_match = re.search(r"layers\_(\d+)", file_name) + layer_num_match = re.search(r"layers.(\d+)", file_name) layer_num = int(layer_num_match.group(1)) if layer_num_match else None + print(f"layer_num is {layer_num}") qkv_type = file_name.split("_")[-2] qkv_name = f"transformer.h.{layer_num}.attn.c_attn" + weight_bias if layer_num is not None: # initialize qkv layer in dict if qkv_name not in qkv_weights: - qkv_weights[qkv_name] = {'wq': None, 'wk': None, 'wv': None} + qkv_weights[qkv_name] = {'attn.q': None, 'attn.k': None, 'attn.v': None} print(f"Initialized QKV layer {layer_num}") # assign weights into dict qkv_weights[qkv_name][qkv_type] = weight_data - print(f"attached qkv weight {qkv_name}") + print(f"attached qkv weight {qkv_name} for type {qkv_type}, weight data dimension is {weight_data.shape}") continue - - + + # Handling for other parameters # for weights that are not q,k,v, get the param names and assign weights accordingly param = model.state_dict().get(original_name, None) + print(f"Param name: {original_name}") if weight_data.size != param.numel(): raise ValueError(f"Shape mismatch for {original_name}, model expects {param.numel()} elements, got {weight_data.size}") weight_tensor = torch.from_numpy(weight_data).reshape(param.shape) + print(f"shape of the weight tensor is: {weight_tensor.shape}") with torch.no_grad(): model.state_dict()[original_name].copy_(weight_tensor) - print(f"Assigned weight {original_name} successfully!") + print(f"Assigned weight {original_name} successfully!\n") for qkv_name, weights_dict in qkv_weights.items(): - combined_qkv = np.concatenate([qkv_weights[qkv_name]['wq'], qkv_weights[qkv_name]['wk'], qkv_weights[qkv_name]['wv']], axis=0) + print(f"qkv name is {qkv_name}, with weight {weights_dict}") + combined_qkv = np.concatenate([qkv_weights[qkv_name]['attn.q'], qkv_weights[qkv_name]['attn.k'], qkv_weights[qkv_name]['attn.v']], axis=0) param_shape = model.state_dict()[qkv_name].shape combined_qkv_reshaped = combined_qkv.reshape(param_shape) print(f"reshaped qkv weights shape is: {combined_qkv_reshaped.shape}") diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py index 
c9b7a729e6..13917c56d4 100644 --- a/python/flexflow/serve/serve.py +++ b/python/flexflow/serve/serve.py @@ -418,6 +418,7 @@ def upload_hf_model(self, new_model_id: str, model_path:str, private: bool = Fal # Initialize a new Hugging Face model instance hf_model = AutoModelForCausalLM.from_config(self.hf_config) weights_path = self.weights_path + print(f"model class is: {self.model_class}") # Load FlexFlow weights into the Hugging Face model instance try: From 8efb92bd7650dbe2219c0dbf46bbc936c40ab626 Mon Sep 17 00:00:00 2001 From: April Yang Date: Fri, 5 Apr 2024 00:23:01 +0000 Subject: [PATCH 44/55] fix issues for mpt models --- python/flexflow/serve/models/mpt.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/python/flexflow/serve/models/mpt.py b/python/flexflow/serve/models/mpt.py index e67ccc42d4..9d4d0f31bb 100644 --- a/python/flexflow/serve/models/mpt.py +++ b/python/flexflow/serve/models/mpt.py @@ -250,7 +250,6 @@ def build_model(self, max_tokens_per_batch): self.ffmodel = ffmodel - # TODO: finish this def convert_hf_weight_name(name): return ( name.replace("transformer.blocks.", "layers.") @@ -293,11 +292,14 @@ def convert_ff_weight_name(name): if "norm_f" in converted_name or "wte" in converted_name: converted_name = converted_name.replace("_", ".").replace("norm.f", "norm_f") - converted_name = converted_name.replace("attention_wo", "attn.out_proj") + converted_name = converted_name.replace("attn.o_proj", "attn.out_proj") converted_name = converted_name.replace("ffn_", "ffn.") converted_name = re.sub(r"layers.(\d+).", r"transformer.blocks.\1.", converted_name) converted_name = re.sub(r"_(bias|weight)$", r".\1", converted_name) + if ("wte" in converted_name) or ("norm_f" in converted_name): + converted_name = "transformer." 
+ converted_name + return converted_name def load_weights_into_hf_model(model, src_folder): @@ -320,7 +322,6 @@ def load_weights_into_hf_model(model, src_folder): print("skipping rev_sha.txt") continue elif "lm_head" in weight_path: - # todo: double check how to handle lm_head in uploading mpt models print("skipping lm_head.weight") continue else: @@ -331,9 +332,10 @@ def load_weights_into_hf_model(model, src_folder): raise FileNotFoundError(f"No weight file found for {file_name}") weight_data = np.fromfile(weight_path, dtype=np.float32) + print(f"Data type after conversion: {weight_data.dtype}, Size: {weight_data.size}") # Special handling for combined QKV weights - if ("wq" in file_name) or ("wk" in file_name) or ("wv" in file_name): + if ("q_proj" in file_name) or ("k_proj" in file_name) or ("v_proj" in file_name): layer_num_match = re.search(r"layers\.(\d+)", original_name) layer_num = int(layer_num_match.group(1)) if layer_num_match else None qkv_type = original_name.split("_")[-2] From 41e0bee19dff29eaaa091bb213212b82c0ec4e43 Mon Sep 17 00:00:00 2001 From: April Yang Date: Fri, 5 Apr 2024 10:00:23 +0000 Subject: [PATCH 45/55] some fixes --- inference/python/ff_peft.py | 7 +- inference/utils/download_upload_hf.py | 27 +----- inference/utils/download_upload_peft.py | 29 ------ python/flexflow/serve/models/falcon.py | 112 ++++++++++++------------ 4 files changed, 64 insertions(+), 111 deletions(-) diff --git a/inference/python/ff_peft.py b/inference/python/ff_peft.py index 38a25fb614..ccbab41356 100644 --- a/inference/python/ff_peft.py +++ b/inference/python/ff_peft.py @@ -137,8 +137,13 @@ def main(): dataset_filepath=configs.finetuning_dataset, ) requests.append(finetuning_request) - + + # use the (finetuned) llm to generate some responses llm.generate(requests) + + # upload the model back to huggingface after finetuning + # the model format would be converted from flexflow format back to huggingface format + llm.upload_hf_model(peft_model_id, cache_folder, private=private) llm.stop_server() diff --git a/inference/utils/download_upload_hf.py b/inference/utils/download_upload_hf.py index 0432dd2cd9..f75df7290d 100644 --- a/inference/utils/download_upload_hf.py +++ b/inference/utils/download_upload_hf.py @@ -2,7 +2,9 @@ import argparse from huggingface_hub import HfApi, HfFolder import flexflow.serve as ff +import warnings +warnings.filterwarnings("ignore") def parse_args(): parser = argparse.ArgumentParser(description="Download a model with FlexFlow, process it, and upload it to the Hugging Face Hub.") @@ -15,30 +17,9 @@ def parse_args(): return parser.parse_args() -def download_and_process_model(model_name, cache_folder, refresh_cache, full_precision): - data_type = ff.DataType.DT_FLOAT if full_precision else ff.DataType.DT_HALF - print(f"Downloading and processing model: {model_name}") - llm = ff.LLM( - model_name=model_name, - data_type=data_type, - cache_path=cache_folder, - refresh_cache=refresh_cache, - ) - llm.download_hf_weights_if_needed() - llm.download_hf_tokenizer_if_needed() - llm.download_hf_config() - return llm - - -def upload_processed_model_to_hub(llm, new_model_id, cache_folder, private): - print(f"Uploading processed model to Hugging Face Hub: {new_model_id}") - llm.upload_hf_model(new_model_id, cache_folder, private=private) - print("Upload completed successfully.") - - def main(): - model_name = "meta-llama/Llama-2-7b" - new_model_id = "your_username/new-model-name" + model_name = "tiiuae/falcon-7b" + new_model_id = "aprilyyt/falcon-upload-test-new" cache_folder = 
"~/.cache/flexflow" private = True refresh_cache = False diff --git a/inference/utils/download_upload_peft.py b/inference/utils/download_upload_peft.py index 5faf87f359..c918c324c6 100644 --- a/inference/utils/download_upload_peft.py +++ b/inference/utils/download_upload_peft.py @@ -13,35 +13,6 @@ def parse_args(): parser.add_argument("--full-precision", action="store_true", help="Download the full precision version of the weights for the PEFT model.") return parser.parse_args() -def download_and_process_peft_model(peft_model_id, cache_folder, refresh_cache, full_precision): - data_type = ff.DataType.DT_FLOAT if full_precision else ff.DataType.DT_HALF - print(f"Downloading and processing PEFT model: {peft_model_id}") - peft = ff.PEFT( - peft_model_id=peft_model_id, - data_type=data_type, - cache_path=cache_folder, - refresh_cache=refresh_cache, - ) - peft.download_hf_weights_if_needed() - peft.download_hf_config() - # any necessary conversion or processing by FlexFlow happens here - return peft - - -def upload_peft_model_to_hub(peft, new_model_id, cache_folder, private): - print(f"Uploading peft model to HuggingFace Hub: {new_model_id}") - peft.upload_hf_model(new_model_id, cache_folder, private=private) - print("Upload completed successfully.") - - -# def main(): -# args = parse_args() -# peft = download_and_process_peft_model(args.peft_model_id, args.cache_folder, args.refresh_cache, args.full_precision) -# upload_peft_model_to_hub(peft, args.new_model_id, args.cache_folder, args.private) - -# if __name__ == "__main__": -# main() - def main(): model_name = "meta-llama/Llama-2-7b" diff --git a/python/flexflow/serve/models/falcon.py b/python/flexflow/serve/models/falcon.py index 90d7fcfb52..e84d7704f9 100644 --- a/python/flexflow/serve/models/falcon.py +++ b/python/flexflow/serve/models/falcon.py @@ -239,7 +239,6 @@ def build_model(self, max_tokens_per_batch): self.ffmodel = ffmodel - # TODO: finish this def convert_hf_weight_name(name): return (name.replace("transformer.h.", "layers.") .replace("transformer.", "") @@ -278,17 +277,14 @@ def convert_hf_model(model, dst_folder): model.lm_head.weight.detach().cpu().numpy().tofile( os.path.join(dst_folder, "lm_head.weight") ) - def convert_ff_weight_name(name): converted_name = name - converted_name = converted_name.replace("mlp_dense_h_to_4h", "mlp.dense_h_to_4h") - converted_name = converted_name.replace("mlp_dense_4h_to_h", "mlp.dense_4h_to_h") - converted_name = converted_name.replace("attention_wo", "self_attention.dense") + converted_name = converted_name.replace("self_attention.o_proj", "self_attention.dense") if name.startswith("ln") or name.startswith("word_embeddings"): converted_name = "transformer." + converted_name - converted_name = re.sub(r"layers_(\d+)_", r"transformer.h.\1.", converted_name) + converted_name = re.sub(r"layers.(\d+).", r"transformer.h.\1.", converted_name) converted_name = re.sub(r"_(bias|weight)$", r".\1", converted_name) return converted_name @@ -304,6 +300,8 @@ def load_weights_into_hf_model(model, src_folder): - config: The configuration object for the model. 
""" + print(f"loading weights from {model} into {src_folder}") + hidden_size = model.config.hidden_size n_head = ( model.config.n_head @@ -314,9 +312,9 @@ def load_weights_into_hf_model(model, src_folder): print("Model hidden size:", hidden_size) print("Model num_attention_heads:", n_head) - num_attention_heads = n_head - hidden_size_per_head = hidden_size // n_head - intermediate_size = hidden_size * 4 + # num_attention_heads = n_head + # hidden_size_per_head = hidden_size // n_head + # intermediate_size = hidden_size * 4 qkv_weights = {} @@ -328,7 +326,7 @@ def load_weights_into_hf_model(model, src_folder): continue else: original_name = FlexFlowFalcon.convert_ff_weight_name(file_name.replace('.bin', '')) - print("Converted weight name:", original_name) + print(f"Converted weight name from {file_name} to {original_name}") if not os.path.exists(weight_path): raise FileNotFoundError(f"No weight file found for {file_name}") @@ -337,62 +335,60 @@ def load_weights_into_hf_model(model, src_folder): print(f"Data type after conversion: {weight_data.dtype}, Size: {weight_data.size}") # for q,k,v weights, store in dict - if ("attention_wq" in original_name) or ("attention_wk" in original_name) or ("attention_wv" in original_name): - qkv_match = re.search("(wq|wk|wv)", file_name) - qkv_type = qkv_match.group(0) if qkv_match else None + if ("q_proj" in original_name) or ("k_proj" in original_name) or ("v_proj" in original_name): + layer_num_match = re.search(r"transformer.h.(\d+)", original_name) layer_num = int(layer_num_match.group(1)) if layer_num_match else None - print(f"QKV type: {qkv_type}, Layer number: {layer_num}") + qkv_type = file_name.split(".")[-2] + print(f"qkv type for this weight is {qkv_type}") if layer_num is not None: - if layer_num not in qkv_weights: - - qkv_name = f"transformer.h.{layer_num}.self_attention.query_key_value.weight" - if qkv_name in model.state_dict(): - qkv_param_size = model.state_dict()[qkv_name].shape[0] - qkv_shape = (qkv_param_size, hidden_size) - qkv_weights[layer_num] = np.zeros(qkv_shape) - print(f"Initialized QKV shape for layer {layer_num}: {qkv_shape}") - - type_index = {"wq": 0, "wk": 1, "wv": 2}.get(qkv_type, 0) - ## dim 0 sizes: - dim_wq = hidden_size - dim_wk = hidden_size // n_head - dim_wv = hidden_size // n_head + qkv_key = f"transformer.h.{layer_num}.self_attention.query_key_value" + if qkv_key not in qkv_weights: + qkv_weights[qkv_key] = {'q_proj': None, 'k_proj': None, 'v_proj': None} - try: - expected_shape = (weight_data.size // hidden_size, hidden_size) - reshaped_data = weight_data.reshape(expected_shape) - print(f"Reshaped QKV weights for {qkv_type} in layer {layer_num} with shape {expected_shape}.") - except ValueError as e: - print(f"Error reshaping {qkv_type} weights for layer {layer_num}: {e}") - print(f"Attempting to reshape data of size {weight_data.size} into shape (-1, {hidden_size})") - - try: - if qkv_type == "wq": - qkv_weights[layer_num][0:dim_wq, :] = reshaped_data - elif qkv_type == "wk": - qkv_weights[layer_num][dim_wq:dim_wk+dim_wq, :] = reshaped_data - else: - qkv_weights[layer_num][dim_wq+dim_wk:, :] = reshaped_data - except ValueError as e: - print(f"Error assigning {qkv_type} weights for layer {layer_num}: {e}") + qkv_weights[qkv_key][qkv_type] = weight_data continue - # for weights that are not q,k,v, get the param names and assign weights accordingly - param = model.state_dict().get(original_name, None) + # Handle non-QKV weights normally + param = model.state_dict()[original_name] + expected_numel = param.numel() 
+ print(f"expected param shape is {expected_numel}") if param is None: - print(f"Warning: {original_name} not found in directory") - reshaped_data = weight_data.reshape(param.shape) - param.data.copy_(torch.from_numpy(reshaped_data)) + # raise ValueError(f"Warning: {original_name} not found!") + print(f"Warning: {original_name} not found!") + continue - # Assign the combined QKV weights to the model - for layer_num, weight in qkv_weights.items(): - qkv_name = f"transformer.h.{layer_num}.self_attention.query_key_value.weight" - if qkv_name in model.state_dict(): - param = model.state_dict()[qkv_name] - # Ensure the combined weight is correctly reshaped to fit the model's expectations - param.data.copy_(torch.from_numpy(weight.reshape(param.shape))) + if weight_data.size != param.numel(): + # print(f"shape mismatch for {original_name}, model expects {param.numel()} elements, got {weight_data.size}") + expected_shape = param.shape + if weight_data.size % param.numel() == 0: + factor = weight_data.size // np.prod(expected_shape) + new_shape = (factor,) + tuple(expected_shape) + weight_data_reshaped = weight_data.reshape(new_shape)[0] + weight_tensor = torch.from_numpy(weight_data_reshaped) + else: + raise ValueError(f"Shape mismatch and cannot convert for {original_name}") + else: + weight_tensor = torch.from_numpy(weight_data).reshape(param.shape) + + print(f"shape of the weight tensor is: {weight_tensor.shape}") + with torch.no_grad(): + model.state_dict()[original_name].copy_(weight_tensor) + print(f"Assigned weight {original_name} successfully!\n") + + # Assign combined QKV weights + for qkv_name, weights_dict in qkv_weights.items(): + print(f"qkv name is {qkv_name}") + print(f"the weights dimensions are: {weights_dict['q_proj'].shape}, {weights_dict['k_proj'].shape}, {weights_dict['v_proj'].shape}") + combined_qkv = np.concatenate([weights_dict['q_proj'], weights_dict['k_proj'], weights_dict['v_proj']], axis=0) + qkv_weight_name = qkv_name+".weight" + param_shape = model.state_dict()[qkv_weight_name].shape + print(f"param shape expected to be {param_shape}, qkv weights combined with weights size {combined_qkv.shape}") + combined_qkv_reshaped = combined_qkv.reshape(param_shape) + print(f"reshaped qkv weights shape is: {combined_qkv_reshaped.shape}") + model.state_dict()[qkv_weight_name].copy_(torch.from_numpy(combined_qkv_reshaped)) + print(f"Assigned combined QKV weights to {qkv_weight_name}.") - \ No newline at end of file + \ No newline at end of file From 3354630ecdd3fc9d3dbd1326c40ce46e6820774d Mon Sep 17 00:00:00 2001 From: april-yyt Date: Wed, 10 Apr 2024 01:29:43 +0000 Subject: [PATCH 46/55] some fixes for falcon, qkv weights issues remains --- python/flexflow/serve/models/falcon.py | 37 +++++++++++++++++++------- python/flexflow/serve/serve.py | 4 +-- 2 files changed, 30 insertions(+), 11 deletions(-) diff --git a/python/flexflow/serve/models/falcon.py b/python/flexflow/serve/models/falcon.py index e84d7704f9..c56be9de61 100644 --- a/python/flexflow/serve/models/falcon.py +++ b/python/flexflow/serve/models/falcon.py @@ -379,16 +379,35 @@ def load_weights_into_hf_model(model, src_folder): # Assign combined QKV weights for qkv_name, weights_dict in qkv_weights.items(): - print(f"qkv name is {qkv_name}") + print("\n========= Processing combined QKV weights ==========") + print(f"qkv name is {qkv_name}, hidden size is {hidden_size}, number of attention heads is {n_head}") print(f"the weights dimensions are: {weights_dict['q_proj'].shape}, {weights_dict['k_proj'].shape}, 
{weights_dict['v_proj'].shape}") - combined_qkv = np.concatenate([weights_dict['q_proj'], weights_dict['k_proj'], weights_dict['v_proj']], axis=0) - qkv_weight_name = qkv_name+".weight" + + q_proj_weight = weights_dict['q_proj'] + k_proj_weight = weights_dict['k_proj'] + v_proj_weight = weights_dict['v_proj'] + + print("Original QKV weights dimensions:") + print("Q:", q_proj_weight.shape) + print("K:", k_proj_weight.shape) + print("V:", v_proj_weight.shape) + + # Reshape the weights to match the expected shape + q_proj_weight_reshaped = q_proj_weight.reshape(-1, hidden_size) + k_proj_weight_reshaped = k_proj_weight.reshape(-1, hidden_size // n_head) + v_proj_weight_reshaped = v_proj_weight.reshape(-1, hidden_size // n_head) + # q_proj_weight_reshaped = q_proj_weight.reshape(k_proj_weight_reshaped.shape[0], -1) + + print("Reshaped QKV weights dimensions:") + print("Q:", q_proj_weight_reshaped.shape) + print("K:", k_proj_weight_reshaped.shape) + print("V:", v_proj_weight_reshaped.shape) + + combined_qkv = np.concatenate([q_proj_weight_reshaped, k_proj_weight_reshaped, v_proj_weight_reshaped], axis=1) + qkv_weight_name = qkv_name + ".weight" param_shape = model.state_dict()[qkv_weight_name].shape print(f"param shape expected to be {param_shape}, qkv weights combined with weights size {combined_qkv.shape}") - combined_qkv_reshaped = combined_qkv.reshape(param_shape) - print(f"reshaped qkv weights shape is: {combined_qkv_reshaped.shape}") - model.state_dict()[qkv_weight_name].copy_(torch.from_numpy(combined_qkv_reshaped)) + + model.state_dict()[qkv_weight_name].copy_(torch.from_numpy(combined_qkv)) print(f"Assigned combined QKV weights to {qkv_weight_name}.") - - - \ No newline at end of file + \ No newline at end of file diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py index 13917c56d4..294ebac843 100644 --- a/python/flexflow/serve/serve.py +++ b/python/flexflow/serve/serve.py @@ -413,12 +413,12 @@ def upload_hf_model(self, new_model_id: str, model_path:str, private: bool = Fal :param private: Whether to upload the model as a private model. 
""" print(f"Preparing model for upload to Hugging Face Hub: {new_model_id}") - print("tokenizer path is: ", self.tokenizer_path) + print("Tokenizer path is: ", self.tokenizer_path) # Initialize a new Hugging Face model instance hf_model = AutoModelForCausalLM.from_config(self.hf_config) weights_path = self.weights_path - print(f"model class is: {self.model_class}") + print(f"Model class is: {self.model_class}") # Load FlexFlow weights into the Hugging Face model instance try: From 89b6e56abfd21ac565d571beb9421de02023fc65 Mon Sep 17 00:00:00 2001 From: April Yang Date: Wed, 10 Apr 2024 11:33:59 +0000 Subject: [PATCH 47/55] peft-upload-example --- inference/python/ff_peft.py | 38 ++++++++++++++---- inference/utils/download_upload_hf.py | 2 +- python/flexflow/serve/serve.py | 57 ++++++++++++++++++++++++++- 3 files changed, 87 insertions(+), 10 deletions(-) diff --git a/inference/python/ff_peft.py b/inference/python/ff_peft.py index ccbab41356..f51c153c21 100644 --- a/inference/python/ff_peft.py +++ b/inference/python/ff_peft.py @@ -25,8 +25,22 @@ def get_configs(): type=str, default="", ) - args = parser.parse_args() + parser.add_argument( + "--publish-peft-with-id", + help="The Hugging Face model ID to upload the trained model with", + type=str, + default="" + ) + args = parser.parse_args() + publish_peft_with_id = args.publish_peft_with_id + if len(publish_peft_with_id) == 0: + print( + "Please pass a --publish-peft-with-id if you want to upload the trained model" + ) + else: + print(f"The trained model will be uploaded with id: {publish_peft_with_id}") + # Load configs from JSON file (if specified) if len(args.config_file) > 0: if not os.path.isfile(args.config_file): @@ -68,17 +82,19 @@ def get_configs(): "goliaro/llama-160m-lora-full", ], # optional parameters - "cache_path": "", + "cache_path": "~/.cache/flexflow", "refresh_cache": False, "full_precision": False, "prompt": "", "finetuning_dataset": os.path.join( os.path.dirname(os.path.abspath(__file__)), "../prompt/peft.json" + # peft.json is a sample dataset for finetuning, should contain a list of strings ), - "output_file": "", + "output_file": "" } # Merge dictionaries ff_init_configs.update(model_configs) + ff_init_configs["publish_peft_with_id"] = publish_peft_with_id return ff_init_configs @@ -98,7 +114,7 @@ def main(): data_type=ff_data_type, cache_path=configs.cache_path, refresh_cache=configs.refresh_cache, - output_file=configs.output_file, + output_file=configs.output_file ) for peft_model_id in configs.peft_model_ids: llm.add_peft(peft_model_id) @@ -115,6 +131,8 @@ def main(): ) llm.start_server() + + print(f"LLM model class is: {llm.model_class}") requests = [] # Serving @@ -141,12 +159,16 @@ def main(): # use the (finetuned) llm to generate some responses llm.generate(requests) + llm.stop_server() + # upload the model back to huggingface after finetuning # the model format would be converted from flexflow format back to huggingface format - llm.upload_hf_model(peft_model_id, cache_folder, private=private) - - llm.stop_server() - + if len(configs.publish_peft_with_id) > 0: + print( + f"Done training! Uploading the model to HF hub with id: {configs.publish_peft_with_id}..." 
+ ) + llm.upload_peft_model(configs.publish_peft_with_id, private=True) + if __name__ == "__main__": print("flexflow PEFT example") diff --git a/inference/utils/download_upload_hf.py b/inference/utils/download_upload_hf.py index f75df7290d..28b70bf565 100644 --- a/inference/utils/download_upload_hf.py +++ b/inference/utils/download_upload_hf.py @@ -38,7 +38,7 @@ def main(): llm.download_hf_config() print(f"Uploading processed model to Hugging Face Hub: {new_model_id}") - llm.upload_hf_model(new_model_id, cache_folder, private=private) + llm.upload_hf_model(new_model_id, private=private) print("Upload completed successfully.") if __name__ == "__main__": diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py index 294ebac843..06d8262360 100644 --- a/python/flexflow/serve/serve.py +++ b/python/flexflow/serve/serve.py @@ -404,7 +404,7 @@ def __load_hf_weights(self): self.fileloader.load_weights(self.model.ffmodel, self.data_type) - def upload_hf_model(self, new_model_id: str, model_path:str, private: bool = False): + def upload_hf_model(self, new_model_id: str, private: bool = False): """ Uploads the model to the Hugging Face Hub, with reverse conversion of weights. @@ -456,6 +456,61 @@ def upload_hf_model(self, new_model_id: str, model_path:str, private: bool = Fal shutil.rmtree(temp_dir) print("Upload completed successfully.") + + + + def upload_peft_model(self, new_model_id: str, private: bool = False): + """ + Uploads the peft model to the Hugging Face Hub, with reverse conversion of weights. + + :param new_model_id: The new model ID for the Hugging Face Hub. + :param model_path: The path where the FlexFlow weights are stored. + :param private: Whether to upload the model as a private model. + """ + print(f"Preparing model for upload to Hugging Face Hub: {new_model_id}") + print("Tokenizer path is: ", self.tokenizer_path) + + # Initialize a new Hugging Face model instance + hf_model = AutoModelForCausalLM.from_config(self.hf_config) + weights_path = self.weights_path + print(f"Model class is: {self.model_class}") + + # Load FlexFlow weights into the Hugging Face model instance + try: + self.model_class.load_weights_into_hf_model(hf_model, weights_path) + except Exception as e: + print(f"Error loading weights into model: {e}") + return + + # Save the model with converted weights to a temporary directory + temp_dir = tempfile.mkdtemp() + hf_model.save_pretrained(temp_dir) + + # Copy the tokenizer files to the temporary directory + tokenizer_files = [f for f in os.listdir(self.tokenizer_path)] + for file_name in tokenizer_files: + shutil.copy(os.path.join(self.tokenizer_path, file_name), temp_dir) + + # Delete rev_sha.txt from the temporary directory if it exists + rev_sha_path = os.path.join(temp_dir, 'rev_sha.txt') + if os.path.exists(rev_sha_path): + os.remove(rev_sha_path) + + # Ensure Hugging Face CLI is logged in + if not HfFolder.get_token(): + print("Hugging Face token not found. 
Please login using `huggingface-cli login`.") + return + + # Upload the model + api = HfApi() + print(f"Uploading processed model to Hugging Face Hub: {new_model_id}") + api.create_repo(repo_id=new_model_id, private=private, exist_ok=True) + api.upload_folder(folder_path=temp_dir, repo_id=new_model_id) + + # Cleanup temporary directory + shutil.rmtree(temp_dir) + + print("Upload completed successfully.") def compile( self, From a2ab5bafe50af53867233bf7f06260a1f1f2f887 Mon Sep 17 00:00:00 2001 From: April Yang Date: Wed, 10 Apr 2024 11:53:56 +0000 Subject: [PATCH 48/55] remove redundant code & metrics file --- inference/python/peft_metrics.py | 273 ++++++++++++++++++++++++++ inference/utils/download_upload_hf.py | 45 ----- python/flexflow/serve/serve.py | 28 ++- 3 files changed, 292 insertions(+), 54 deletions(-) create mode 100644 inference/python/peft_metrics.py delete mode 100644 inference/utils/download_upload_hf.py diff --git a/inference/python/peft_metrics.py b/inference/python/peft_metrics.py new file mode 100644 index 0000000000..2d6d969b01 --- /dev/null +++ b/inference/python/peft_metrics.py @@ -0,0 +1,273 @@ +# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import flexflow.serve as ff +import argparse, json, os +from types import SimpleNamespace +import time +import subprocess +import psutil +import time +import json + + +def get_gpu_utilization(): + try: + result = subprocess.run(['nvidia-smi', '--query-gpu=utilization.gpu,memory.used', '--format=csv,noheader,nounits'], stdout=subprocess.PIPE) + output = result.stdout.decode('utf-8').strip() + lines = output.split('\n') + + total_gpu_utilization = 0.0 + total_memory_used = 0.0 + num_gpus = len(lines) + + for line in lines: + try: + gpu_utilization, memory_used = line.split(', ') + total_gpu_utilization += float(gpu_utilization) + total_memory_used += float(memory_used) + except ValueError: + print("Error parsing line:", line) + num_gpus -= 1 # Adjust num_gpus in case of parsing failure + + # Handle division by zero if no GPUs are found or parsed successfully + if num_gpus > 0: + avg_gpu_utilization = total_gpu_utilization / num_gpus + avg_memory_used = total_memory_used / num_gpus + else: + avg_gpu_utilization = 0.0 + avg_memory_used = 0.0 + + + # print(f"GPU Utilization: {avg_gpu_utilization}%") + # print(f"Memory Used: {avg_memory_used} MiB") + + return avg_gpu_utilization, avg_memory_used + except Exception as e: + print(f"Failed to get GPU utilization: {e}") + return 0, 0 + + + +def get_cpu_utilization(): + # Gets the system-wide CPU utilization + return psutil.cpu_percent(interval=1) + +def get_memory_usage(): + # Gets the system-wide memory usage + memory_info = psutil.virtual_memory() + return memory_info.used / (1024 * 1024) # Convert to MB + +def monitor_resources(start_time, interval=5, duration=60): + """ + Monitors and collects resource usage metrics over a specified duration and interval. 
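+
+    Example (illustrative):
+        metrics = monitor_resources(time.time(), interval=5, duration=60)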
+ + :param start_time: The time when the monitoring started, to calculate total duration. + :param interval: Time in seconds between each metric collection. + :param duration: Total duration to monitor resources. + :return: A dictionary containing the collected metrics. + """ + metrics = { + 'max_gpu_utilization': 0, + 'max_memory_usage_gpu': 0, + 'cpu_utilization': [], + 'peak_memory_usage_system': 0, + } + + while True: + current_time = time.time() + if current_time - start_time > duration: + break + + gpu_utilization, memory_usage_gpu = get_gpu_utilization() + cpu_utilization = get_cpu_utilization() + memory_usage_system = get_memory_usage() + + metrics['max_gpu_utilization'] = max(metrics['max_gpu_utilization'], gpu_utilization) + metrics['max_memory_usage_gpu'] = max(metrics['max_memory_usage_gpu'], memory_usage_gpu) + metrics['cpu_utilization'].append(cpu_utilization) + metrics['peak_memory_usage_system'] = max(metrics['peak_memory_usage_system'], memory_usage_system) + + time.sleep(interval) + + return metrics + +def get_configs(): + parser = argparse.ArgumentParser() + parser.add_argument( + "-config-file", + help="The path to a JSON file with the configs. If omitted, a sample model and configs will be used instead.", + type=str, + default="", + ) + parser.add_argument( + "--publish-peft-with-id", + help="The Hugging Face model ID to upload the trained model with", + type=str, + default="" + ) + + args = parser.parse_args() + publish_peft_with_id = args.publish_peft_with_id + if len(publish_peft_with_id) == 0: + print( + "Please pass a --publish-peft-with-id if you want to upload the trained model" + ) + else: + print(f"The trained model will be uploaded with id: {publish_peft_with_id}") + + # Load configs from JSON file (if specified) + if len(args.config_file) > 0: + if not os.path.isfile(args.config_file): + raise FileNotFoundError(f"Config file {args.config_file} not found.") + try: + with open(args.config_file) as f: + return json.load(f) + except json.JSONDecodeError as e: + print("JSON format error:") + print(e) + else: + # Define sample configs + ff_init_configs = { + # required parameters + "num_gpus": 1, + "memory_per_gpu": 8192, + "zero_copy_memory_per_node": 12000, + # optional parameters + "num_cpus": 4, + "legion_utility_processors": 4, + "data_parallelism_degree": 1, + "tensor_parallelism_degree": 1, + "pipeline_parallelism_degree": 1, + "offload": False, + "offload_reserve_space_size": 8 * 1024, # 8GB + "use_4bit_quantization": False, + "use_8bit_quantization": False, + "enable_peft": True, + "peft_activation_reserve_space_size": 1024, # 1GB + "peft_weight_reserve_space_size": 1024, # 1GB + "profiling": False, + "inference_debugging": True, + "fusion": True, + } + model_configs = { + # required parameters + "base_model": "JackFram/llama-160m", + "peft_model_ids": [ + "goliaro/llama-160m-lora-full", + ], + # optional parameters + "cache_path": "~/.cache/flexflow", + "refresh_cache": False, + "full_precision": False, + "prompt": "", + "finetuning_dataset": os.path.join( + os.path.dirname(os.path.abspath(__file__)), "../prompt/peft.json" + # peft.json is a sample dataset for finetuning, should contain a list of strings + ), + "output_file": "" + } + # Merge dictionaries + ff_init_configs.update(model_configs) + ff_init_configs["publish_peft_with_id"] = publish_peft_with_id + return ff_init_configs + + +def main(): + start_time = time.time() + configs_dict = get_configs() + configs = SimpleNamespace(**configs_dict) + + # Initialize the FlexFlow runtime. 
ff.init() takes a dictionary or the path to a JSON file with the configs + ff.init(configs_dict) + + # Create the FlexFlow LLM + ff_data_type = ( + ff.DataType.DT_FLOAT if configs.full_precision else ff.DataType.DT_HALF + ) + llm = ff.LLM( + configs.base_model, + data_type=ff_data_type, + cache_path=configs.cache_path, + refresh_cache=configs.refresh_cache, + output_file=configs.output_file + ) + for peft_model_id in configs.peft_model_ids: + llm.add_peft(peft_model_id) + + # Compile the LLM for inference and load the weights into memory + generation_config = ff.GenerationConfig( + do_sample=False, temperature=0.9, topp=0.8, topk=1 + ) + llm.compile( + generation_config, + max_requests_per_batch=1, + max_seq_length=256, + max_tokens_per_batch=64, + ) + + resource_metrics = monitor_resources(start_time, interval=5, duration=360) + + llm.start_server() + + print(f"LLM model class is: {llm.model_class}") + + requests = [] + # Serving + if len(configs.prompt) > 0: + prompts = [s for s in json.load(open(configs.prompt))] + inference_requests = [ + ff.Request( + ff.RequestType.REQ_INFERENCE, prompt=prompt, max_sequence_length=128 + ) + for prompt in prompts + ] + requests += inference_requests + # Finetuning + if len(configs.finetuning_dataset) > 0: + for peft_model_id in configs.peft_model_ids: + finetuning_request = ff.Request( + ff.RequestType.REQ_FINETUNING, + max_sequence_length=128, + peft_model_id=llm.get_ff_peft_id(peft_model_id), + dataset_filepath=configs.finetuning_dataset, + ) + requests.append(finetuning_request) + + # use the (finetuned) llm to generate some responses + llm.generate(requests) + + # After finishing the main workload, print the collected metrics. + avg_cpu_utilization = sum(resource_metrics['cpu_utilization']) / len(resource_metrics['cpu_utilization']) + print(f"Max GPU Utilization: {resource_metrics['max_gpu_utilization']}%") + print(f"Max GPU Memory Usage: {resource_metrics['max_memory_usage_gpu']} MiB") + print(f"Average CPU Utilization: {avg_cpu_utilization}%") + print(f"Peak System Memory Usage: {resource_metrics['peak_memory_usage_system']} MiB") + + + llm.stop_server() + + # upload the model back to huggingface after finetuning + # the model format would be converted from flexflow format back to huggingface format + if len(configs.publish_peft_with_id) > 0: + print( + f"Done training! Uploading the model to HF hub with id: {configs.publish_peft_with_id}..." 
+ ) + llm.upload_peft_model(configs.publish_peft_with_id, private=True) + + +if __name__ == "__main__": + print("flexflow PEFT example") + + main() \ No newline at end of file diff --git a/inference/utils/download_upload_hf.py b/inference/utils/download_upload_hf.py deleted file mode 100644 index 28b70bf565..0000000000 --- a/inference/utils/download_upload_hf.py +++ /dev/null @@ -1,45 +0,0 @@ -#!/usr/bin/env python -import argparse -from huggingface_hub import HfApi, HfFolder -import flexflow.serve as ff -import warnings - -warnings.filterwarnings("ignore") - -def parse_args(): - parser = argparse.ArgumentParser(description="Download a model with FlexFlow, process it, and upload it to the Hugging Face Hub.") - parser.add_argument("model_name", type=str, help="Original Hugging Face model ID to download and process (e.g., 'facebook/opt-125m').") - parser.add_argument("--new-model-id", type=str, required=True, help="New Hugging Face Hub model ID for upload (e.g., 'your_username/new-model-name').") - parser.add_argument("--cache-folder", type=str, default="~/.cache/flexflow", help="Folder to use to store and process the model(s) assets in FlexFlow format.") - parser.add_argument("--private", action="store_true", help="Whether to upload the processed model as a private model on Hugging Face Hub.") - parser.add_argument("--refresh-cache", action="store_true", help="Use this flag to force the refresh of the model(s) weights/tokenizer cache.") - parser.add_argument("--full-precision", action="store_true", help="Download the full precision version of the weights.") - return parser.parse_args() - - -def main(): - model_name = "tiiuae/falcon-7b" - new_model_id = "aprilyyt/falcon-upload-test-new" - cache_folder = "~/.cache/flexflow" - private = True - refresh_cache = False - full_precision = True - - data_type = ff.DataType.DT_FLOAT if full_precision else ff.DataType.DT_HALF - print(f"Downloading and processing model: {model_name}") - llm = ff.LLM( - model_name=model_name, - data_type=data_type, - cache_path=cache_folder, - refresh_cache=refresh_cache, - ) - llm.download_hf_weights_if_needed() - llm.download_hf_tokenizer_if_needed() - llm.download_hf_config() - - print(f"Uploading processed model to Hugging Face Hub: {new_model_id}") - llm.upload_hf_model(new_model_id, private=private) - print("Upload completed successfully.") - -if __name__ == "__main__": - main() diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py index 06d8262360..8ddface2ec 100644 --- a/python/flexflow/serve/serve.py +++ b/python/flexflow/serve/serve.py @@ -409,7 +409,6 @@ def upload_hf_model(self, new_model_id: str, private: bool = False): Uploads the model to the Hugging Face Hub, with reverse conversion of weights. :param new_model_id: The new model ID for the Hugging Face Hub. - :param model_path: The path where the FlexFlow weights are stored. :param private: Whether to upload the model as a private model. """ print(f"Preparing model for upload to Hugging Face Hub: {new_model_id}") @@ -455,7 +454,7 @@ def upload_hf_model(self, new_model_id: str, private: bool = False): # Cleanup temporary directory shutil.rmtree(temp_dir) - print("Upload completed successfully.") + print("Upload process completed.") @@ -464,7 +463,6 @@ def upload_peft_model(self, new_model_id: str, private: bool = False): Uploads the peft model to the Hugging Face Hub, with reverse conversion of weights. :param new_model_id: The new model ID for the Hugging Face Hub. - :param model_path: The path where the FlexFlow weights are stored. 
:param private: Whether to upload the model as a private model. """ print(f"Preparing model for upload to Hugging Face Hub: {new_model_id}") @@ -510,7 +508,8 @@ def upload_peft_model(self, new_model_id: str, private: bool = False): # Cleanup temporary directory shutil.rmtree(temp_dir) - print("Upload completed successfully.") + print("Upload process completed.") + def compile( self, @@ -909,19 +908,20 @@ def download_hf_weights_if_needed(self): else: print(f"Loading '{self.peft_model_id}' model weights from the cache...") - def upload_hf_model(self, new_model_id: str, model_path:str, private: bool = False): + def upload_hf_model(self, new_model_id: str, private: bool = False): """ - Uploads the PEFT model to the Hugging Face Hub, with reverse conversion of weights. - + Uploads the model to the Hugging Face Hub, with reverse conversion of weights. + :param new_model_id: The new model ID for the Hugging Face Hub. - :param model_path: The path where the FlexFlow weights are stored. :param private: Whether to upload the model as a private model. """ print(f"Preparing model for upload to Hugging Face Hub: {new_model_id}") + print("Tokenizer path is: ", self.tokenizer_path) # Initialize a new Hugging Face model instance hf_model = AutoModelForCausalLM.from_config(self.hf_config) weights_path = self.weights_path + print(f"Model class is: {self.model_class}") # Load FlexFlow weights into the Hugging Face model instance try: @@ -934,6 +934,16 @@ def upload_hf_model(self, new_model_id: str, model_path:str, private: bool = Fal temp_dir = tempfile.mkdtemp() hf_model.save_pretrained(temp_dir) + # Copy the tokenizer files to the temporary directory + tokenizer_files = [f for f in os.listdir(self.tokenizer_path)] + for file_name in tokenizer_files: + shutil.copy(os.path.join(self.tokenizer_path, file_name), temp_dir) + + # Delete rev_sha.txt from the temporary directory if it exists + rev_sha_path = os.path.join(temp_dir, 'rev_sha.txt') + if os.path.exists(rev_sha_path): + os.remove(rev_sha_path) + # Ensure Hugging Face CLI is logged in if not HfFolder.get_token(): print("Hugging Face token not found. 
Please login using `huggingface-cli login`.") @@ -948,4 +958,4 @@ def upload_hf_model(self, new_model_id: str, model_path:str, private: bool = Fal # Cleanup temporary directory shutil.rmtree(temp_dir) - print("Upload completed successfully.") \ No newline at end of file + print("Upload process completed.") \ No newline at end of file From 2792e254aa4066e3f2b40056b79d9074c1781d58 Mon Sep 17 00:00:00 2001 From: April Yang Date: Mon, 6 May 2024 17:23:05 +0000 Subject: [PATCH 49/55] add back utils script --- FlexFlow | 1 + inference/utils/download_upload_hf.py | 51 +++++++++++++++++++++++++ inference/utils/download_upload_peft.py | 2 +- 3 files changed, 53 insertions(+), 1 deletion(-) create mode 160000 FlexFlow create mode 100644 inference/utils/download_upload_hf.py diff --git a/FlexFlow b/FlexFlow new file mode 160000 index 0000000000..d54e4b6a74 --- /dev/null +++ b/FlexFlow @@ -0,0 +1 @@ +Subproject commit d54e4b6a747f3940a19989a56095a71540e4c0d8 diff --git a/inference/utils/download_upload_hf.py b/inference/utils/download_upload_hf.py new file mode 100644 index 0000000000..1588ef064d --- /dev/null +++ b/inference/utils/download_upload_hf.py @@ -0,0 +1,51 @@ + +# this script is for testing downloading a model from huggingface and uploading it back to huggingface +# after the model is downloaded it will be transformed into flexflow format +# before uploading it back to huggingface, we need to convert it back to huggingface format +# which is done by calling llm.upload_hf_model() + +#!/usr/bin/env python +import argparse +from huggingface_hub import HfApi, HfFolder +import flexflow.serve as ff +import warnings + +warnings.filterwarnings("ignore") + +def parse_args(): + parser = argparse.ArgumentParser(description="Download a model with FlexFlow, process it, and upload it to the Hugging Face Hub.") + parser.add_argument("model_name", type=str, help="Original Hugging Face model ID to download and process (e.g., 'facebook/opt-125m').") + parser.add_argument("--new-model-id", type=str, required=True, help="New Hugging Face Hub model ID for upload (e.g., 'your_username/new-model-name').") + parser.add_argument("--cache-folder", type=str, default="~/.cache/flexflow", help="Folder to use to store and process the model(s) assets in FlexFlow format.") + parser.add_argument("--private", action="store_true", help="Whether to upload the processed model as a private model on Hugging Face Hub.") + parser.add_argument("--refresh-cache", action="store_true", help="Use this flag to force the refresh of the model(s) weights/tokenizer cache.") + parser.add_argument("--full-precision", action="store_true", help="Download the full precision version of the weights.") + return parser.parse_args() + + +def main(): + model_name = "tiiuae/falcon-7b" + new_model_id = "aprilyyt/falcon-upload-test-new" + cache_folder = "~/.cache/flexflow" + private = True + refresh_cache = False + full_precision = True + + data_type = ff.DataType.DT_FLOAT if full_precision else ff.DataType.DT_HALF + print(f"Downloading and processing model: {model_name}") + llm = ff.LLM( + model_name=model_name, + data_type=data_type, + cache_path=cache_folder, + refresh_cache=refresh_cache, + ) + llm.download_hf_weights_if_needed() + llm.download_hf_tokenizer_if_needed() + llm.download_hf_config() + + print(f"Uploading processed model to Hugging Face Hub: {new_model_id}") + llm.upload_hf_model(new_model_id, private=private) + print("Upload completed successfully.") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git 
a/inference/utils/download_upload_peft.py b/inference/utils/download_upload_peft.py index c918c324c6..27dd1e5607 100644 --- a/inference/utils/download_upload_peft.py +++ b/inference/utils/download_upload_peft.py @@ -15,7 +15,7 @@ def parse_args(): def main(): - model_name = "meta-llama/Llama-2-7b" + model_name = "tiiuae/falcon-7b" new_model_id = "your_username/new-model-name" cache_folder = "~/.cache/flexflow" private = True From a1b5db88b5c0876808668c8b0ce5d4abcd8cc582 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 29 May 2024 08:10:26 +0000 Subject: [PATCH 50/55] cleanup --- python/flexflow/serve/__init__.py | 11 +- python/flexflow/serve/models/base.py | 7 +- python/flexflow/serve/models/falcon.py | 128 ++++++---- python/flexflow/serve/models/llama.py | 70 ++---- python/flexflow/serve/models/mpt.py | 74 +++--- python/flexflow/serve/models/opt.py | 79 +++--- python/flexflow/serve/models/starcoder.py | 86 +++---- python/flexflow/serve/serve.py | 280 +--------------------- 8 files changed, 228 insertions(+), 507 deletions(-) diff --git a/python/flexflow/serve/__init__.py b/python/flexflow/serve/__init__.py index fd29080a6a..6c0296768a 100644 --- a/python/flexflow/serve/__init__.py +++ b/python/flexflow/serve/__init__.py @@ -15,16 +15,7 @@ from typing import Optional from ..type import * from flexflow.core import * -from .serve import ( - LLM, - SSM, - GenerationConfig, - GenerationResult, - LoraLinearConfig, - PEFTModelID, - Request, - RequestType, -) +from .serve import LLM, SSM def __check_positive_int(configs_dict: dict, key: str): diff --git a/python/flexflow/serve/models/base.py b/python/flexflow/serve/models/base.py index fe39fd30bf..b38faedc3e 100644 --- a/python/flexflow/serve/models/base.py +++ b/python/flexflow/serve/models/base.py @@ -32,14 +32,11 @@ def __init__( def build_model(self): assert False, "Not implemented yet" - def convert_hf_weight_name(name): + def convert_weight_name_hf2ff(name): assert False, "Not implemented yet" def convert_hf_model(model, dst_folder): assert False, "Not implemented yet" - - def convert_ff_weight_name(name): - assert False, "Not implemented yet" - + def load_weights_into_hf_model(model, src_folder): assert False, "Not implemented yet" diff --git a/python/flexflow/serve/models/falcon.py b/python/flexflow/serve/models/falcon.py index 6eb4eb4dcf..660b80709c 100644 --- a/python/flexflow/serve/models/falcon.py +++ b/python/flexflow/serve/models/falcon.py @@ -245,12 +245,18 @@ def build_model(self, max_tokens_per_batch): self.ffmodel = ffmodel - def convert_hf_weight_name(name): - return (name.replace("transformer.h.", "layers.") + def convert_weight_name_hf2ff(name): + return ( + name.replace("transformer.h.", "layers.") .replace("transformer.", "") .replace("self_attention.dense", "self_attention.o_proj") ) + def convert_weight_name_ff2hf(name): + return "transformer." 
+ name.replace( + "self_attention.o_proj", "self_attention.dense" + ).replace("layers.", "h.") + def convert_hf_model(model, dst_folder): os.makedirs(dst_folder, exist_ok=True) n_head = ( @@ -259,12 +265,18 @@ def convert_hf_model(model, dst_folder): else model.config.num_attention_heads ) for name, params in model.named_parameters(): - name = FlexFlowFalcon.convert_hf_weight_name(name) + name = FlexFlowFalcon.convert_weight_name_hf2ff(name) # Split Q,K,V attention weights if "self_attention.query_key_value" in name: - name_q = name.replace("self_attention.query_key_value", "self_attention.q_proj") - name_k = name.replace("self_attention.query_key_value", "self_attention.k_proj") - name_v = name.replace("self_attention.query_key_value", "self_attention.v_proj") + name_q = name.replace( + "self_attention.query_key_value", "self_attention.q_proj" + ) + name_k = name.replace( + "self_attention.query_key_value", "self_attention.k_proj" + ) + name_v = name.replace( + "self_attention.query_key_value", "self_attention.v_proj" + ) q, k, v = torch.split( params, [ @@ -283,47 +295,35 @@ def convert_hf_model(model, dst_folder): model.lm_head.weight.detach().cpu().numpy().tofile( os.path.join(dst_folder, "lm_head.weight") ) - - def convert_ff_weight_name(name): - - converted_name = name - converted_name = converted_name.replace("self_attention.o_proj", "self_attention.dense") - if name.startswith("ln") or name.startswith("word_embeddings"): - converted_name = "transformer." + converted_name - converted_name = re.sub(r"layers.(\d+).", r"transformer.h.\1.", converted_name) - converted_name = re.sub(r"_(bias|weight)$", r".\1", converted_name) - - return converted_name - def load_weights_into_hf_model(model, src_folder): """ Load weights from a specified folder and apply them to a Hugging Face model. - + Parameters: - model: The instance of the Hugging Face model to load the weights into. - src_folder: The path to the folder containing the weight files. - config: The configuration object for the model. 
""" - + print(f"loading weights from {model} into {src_folder}") - + hidden_size = model.config.hidden_size n_head = ( model.config.n_head if "n_head" in model.config.__dict__ else model.config.num_attention_heads ) - + print("Model hidden size:", hidden_size) print("Model num_attention_heads:", n_head) - + # num_attention_heads = n_head # hidden_size_per_head = hidden_size // n_head # intermediate_size = hidden_size * 4 - + qkv_weights = {} - + for file_name in os.listdir(src_folder): weight_path = os.path.join(src_folder, file_name) print("\nProcessing weight file:", weight_path) @@ -331,31 +331,43 @@ def load_weights_into_hf_model(model, src_folder): print("skipping rev_sha.txt") continue else: - original_name = FlexFlowFalcon.convert_ff_weight_name(file_name.replace('.bin', '')) + original_name = FlexFlowFalcon.convert_weight_name_ff2hf(file_name) print(f"Converted weight name from {file_name} to {original_name}") - + if not os.path.exists(weight_path): raise FileNotFoundError(f"No weight file found for {file_name}") - + weight_data = np.fromfile(weight_path, dtype=np.float16).astype(np.float32) - print(f"Data type after conversion: {weight_data.dtype}, Size: {weight_data.size}") - + print( + f"Data type after conversion: {weight_data.dtype}, Size: {weight_data.size}" + ) + # for q,k,v weights, store in dict - if ("q_proj" in original_name) or ("k_proj" in original_name) or ("v_proj" in original_name): - + if ( + ("q_proj" in original_name) + or ("k_proj" in original_name) + or ("v_proj" in original_name) + ): + layer_num_match = re.search(r"transformer.h.(\d+)", original_name) layer_num = int(layer_num_match.group(1)) if layer_num_match else None qkv_type = file_name.split(".")[-2] print(f"qkv type for this weight is {qkv_type}") - + if layer_num is not None: - qkv_key = f"transformer.h.{layer_num}.self_attention.query_key_value" + qkv_key = ( + f"transformer.h.{layer_num}.self_attention.query_key_value" + ) if qkv_key not in qkv_weights: - qkv_weights[qkv_key] = {'q_proj': None, 'k_proj': None, 'v_proj': None} - + qkv_weights[qkv_key] = { + "q_proj": None, + "k_proj": None, + "v_proj": None, + } + qkv_weights[qkv_key][qkv_type] = weight_data continue - + # Handle non-QKV weights normally param = model.state_dict()[original_name] expected_numel = param.numel() @@ -364,7 +376,7 @@ def load_weights_into_hf_model(model, src_folder): # raise ValueError(f"Warning: {original_name} not found!") print(f"Warning: {original_name} not found!") continue - + if weight_data.size != param.numel(): # print(f"shape mismatch for {original_name}, model expects {param.numel()} elements, got {weight_data.size}") expected_shape = param.shape @@ -374,25 +386,31 @@ def load_weights_into_hf_model(model, src_folder): weight_data_reshaped = weight_data.reshape(new_shape)[0] weight_tensor = torch.from_numpy(weight_data_reshaped) else: - raise ValueError(f"Shape mismatch and cannot convert for {original_name}") + raise ValueError( + f"Shape mismatch and cannot convert for {original_name}" + ) else: weight_tensor = torch.from_numpy(weight_data).reshape(param.shape) - + print(f"shape of the weight tensor is: {weight_tensor.shape}") with torch.no_grad(): model.state_dict()[original_name].copy_(weight_tensor) print(f"Assigned weight {original_name} successfully!\n") - + # Assign combined QKV weights for qkv_name, weights_dict in qkv_weights.items(): print("\n========= Processing combined QKV weights ==========") - print(f"qkv name is {qkv_name}, hidden size is {hidden_size}, number of attention heads is {n_head}") - 
print(f"the weights dimensions are: {weights_dict['q_proj'].shape}, {weights_dict['k_proj'].shape}, {weights_dict['v_proj'].shape}") + print( + f"qkv name is {qkv_name}, hidden size is {hidden_size}, number of attention heads is {n_head}" + ) + print( + f"the weights dimensions are: {weights_dict['q_proj'].shape}, {weights_dict['k_proj'].shape}, {weights_dict['v_proj'].shape}" + ) + + q_proj_weight = weights_dict["q_proj"] + k_proj_weight = weights_dict["k_proj"] + v_proj_weight = weights_dict["v_proj"] - q_proj_weight = weights_dict['q_proj'] - k_proj_weight = weights_dict['k_proj'] - v_proj_weight = weights_dict['v_proj'] - print("Original QKV weights dimensions:") print("Q:", q_proj_weight.shape) print("K:", k_proj_weight.shape) @@ -400,7 +418,7 @@ def load_weights_into_hf_model(model, src_folder): # Reshape the weights to match the expected shape q_proj_weight_reshaped = q_proj_weight.reshape(-1, hidden_size) - k_proj_weight_reshaped = k_proj_weight.reshape(-1, hidden_size // n_head) + k_proj_weight_reshaped = k_proj_weight.reshape(-1, hidden_size // n_head) v_proj_weight_reshaped = v_proj_weight.reshape(-1, hidden_size // n_head) # q_proj_weight_reshaped = q_proj_weight.reshape(k_proj_weight_reshaped.shape[0], -1) @@ -409,11 +427,19 @@ def load_weights_into_hf_model(model, src_folder): print("K:", k_proj_weight_reshaped.shape) print("V:", v_proj_weight_reshaped.shape) - combined_qkv = np.concatenate([q_proj_weight_reshaped, k_proj_weight_reshaped, v_proj_weight_reshaped], axis=1) + combined_qkv = np.concatenate( + [ + q_proj_weight_reshaped, + k_proj_weight_reshaped, + v_proj_weight_reshaped, + ], + axis=1, + ) qkv_weight_name = qkv_name + ".weight" param_shape = model.state_dict()[qkv_weight_name].shape - print(f"param shape expected to be {param_shape}, qkv weights combined with weights size {combined_qkv.shape}") + print( + f"param shape expected to be {param_shape}, qkv weights combined with weights size {combined_qkv.shape}" + ) model.state_dict()[qkv_weight_name].copy_(torch.from_numpy(combined_qkv)) print(f"Assigned combined QKV weights to {qkv_weight_name}.") - \ No newline at end of file diff --git a/python/flexflow/serve/models/llama.py b/python/flexflow/serve/models/llama.py index adf3fc31b2..8c561a7463 100644 --- a/python/flexflow/serve/models/llama.py +++ b/python/flexflow/serve/models/llama.py @@ -255,45 +255,21 @@ def build_model(self, max_tokens_per_batch): self.ffmodel = ffmodel - def convert_hf_weight_name(name): + def convert_weight_name_hf2ff(name): return name.replace("model.", "") + def convert_weight_name_ff2hf(name): + if name == "lm_head.weight": + return name + else: + return "model." 
+ name + def convert_hf_model(model, dst_folder): os.makedirs(dst_folder, exist_ok=True) for name, params in model.named_parameters(): - name = FlexFlowLLAMA.convert_hf_weight_name(name) + name = FlexFlowLLAMA.convert_weight_name_hf2ff(name) params.detach().cpu().numpy().tofile(f"{dst_folder}/{name}") - - def convert_ff_weight_name(name): - converted_name = ( - name - .replace("w1", "gate_proj") - .replace("w2", "down_proj") - .replace("w3", "up_proj") - .replace("wq", "q_proj") - .replace("wk", "k_proj") - .replace("wv", "v_proj") - .replace("wo", "o_proj") - .replace("feed_forward_", "mlp.") - .replace("post_self_attn", "post_attention") - .replace("attention_norm", "input_layernorm") - .replace("tok_embeddings", "embed_tokens") - .replace("output", "lm_head") - - ) - - converted_name = re.sub(r"layers_(\d+)_", r"layers.\1.", converted_name) - converted_name = re.sub(r"_(bias|weight)$", r".\1", converted_name) - # converted_name = re.sub(r"attention_(?!norm)", "self_attn.", converted_name) - - converted_name = converted_name.replace("ffn_norm", "post_attention_layernorm") - - if "lm_head" not in converted_name: - converted_name = "model." + converted_name - - return converted_name - def load_weights_into_hf_model(model, src_folder): """ Load weights from a specified folder and apply them to a Hugging Face model. @@ -307,30 +283,36 @@ def load_weights_into_hf_model(model, src_folder): if weight_path.endswith("rev_sha.txt"): print("skipping rev_sha.txt") continue - else: - original_name = FlexFlowLLAMA.convert_ff_weight_name(file_name.replace('.bin', '')) - print(f"Converting weight name: {file_name} to {original_name}") - + original_name = FlexFlowLLAMA.convert_weight_name_ff2hf(file_name) + print(f"Converting weight name: {file_name} to {original_name}") + if not os.path.exists(weight_path): raise FileNotFoundError(f"No weight file found for {file_name}") - - weight_data = np.fromfile(weight_path, dtype=np.float16).astype(np.float32) + + ff_dtype = np.float32 if "full-precision" in weight_path else np.float16 + weight_data = np.fromfile( + weight_path, dtype=ff_dtype + ) # .astype(np.float32) if original_name not in model.state_dict(): raise KeyError(f"Parameter {original_name} not found in model.") - + param = model.state_dict()[original_name] expected_numel = param.numel() if weight_data.size != expected_numel: - print(f"Adjusting shape for {original_name} from {weight_data.size} to {expected_numel}.") + print( + f"Adjusting shape for {original_name} from {weight_data.size} to {expected_numel}." + ) if weight_data.size % expected_numel == 0: factor = weight_data.size // expected_numel new_shape = (factor,) + tuple(param.shape) - weight_data_reshaped = weight_data.reshape(new_shape)[0] + weight_data_reshaped = weight_data.reshape(new_shape)[0] weight_tensor = torch.from_numpy(weight_data_reshaped) else: - raise ValueError(f"Cannot adjust shape for {original_name} due to incompatible size.") + raise ValueError( + f"Cannot adjust shape for {original_name} due to incompatible size." 
+ ) else: weight_tensor = torch.from_numpy(weight_data).reshape(param.shape) - + with torch.no_grad(): - param.copy_(weight_tensor) \ No newline at end of file + param.copy_(weight_tensor) diff --git a/python/flexflow/serve/models/mpt.py b/python/flexflow/serve/models/mpt.py index 4abc5731ab..a17ac42d0d 100644 --- a/python/flexflow/serve/models/mpt.py +++ b/python/flexflow/serve/models/mpt.py @@ -255,17 +255,22 @@ def build_model(self, max_tokens_per_batch): self.ffmodel = ffmodel - def convert_hf_weight_name(name): + def convert_weight_name_hf2ff(name): return ( name.replace("transformer.blocks.", "layers.") .replace("transformer.", "") .replace("attn.out_proj", "attn.o_proj") ) + def convert_weight_name_ff2hf(name): + return "transformer." + name.replace("attn.o_proj", "attn.out_proj").replace( + "layers.", "blocks." + ) + def convert_hf_model(model, dst_folder): os.makedirs(dst_folder, exist_ok=True) for name, params in model.named_parameters(): - name = FlexFlowMPT.convert_hf_weight_name(name) + name = FlexFlowMPT.convert_weight_name_hf2ff(name) if "Wqkv" in name: name_q = name.replace("attn.Wqkv", "attn.q_proj") name_k = name.replace("attn.Wqkv", "attn.k_proj") @@ -290,37 +295,20 @@ def convert_hf_model(model, dst_folder): os.path.join(dst_folder, "lm_head.weight"), ) - - def convert_ff_weight_name(name): - # Reverses the conversion logic for MPT model weights - converted_name = name - if "norm_f" in converted_name or "wte" in converted_name: - converted_name = converted_name.replace("_", ".").replace("norm.f", "norm_f") - - converted_name = converted_name.replace("attn.o_proj", "attn.out_proj") - converted_name = converted_name.replace("ffn_", "ffn.") - converted_name = re.sub(r"layers.(\d+).", r"transformer.blocks.\1.", converted_name) - converted_name = re.sub(r"_(bias|weight)$", r".\1", converted_name) - - if ("wte" in converted_name) or ("norm_f" in converted_name): - converted_name = "transformer." + converted_name - - return converted_name - def load_weights_into_hf_model(model, src_folder): """ Load weights from a specified folder and apply them to a Hugging Face MPT model. - + Parameters: - model: The instance of the Hugging Face model to load the weights into. - src_folder: The path to the folder containing the weight files. 
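    A minimal illustrative sketch (commentary added here, not part of this patch) of
    the Q/K/V recombination performed below: the three flat .bin files are
    concatenated and reshaped back into the single attn.Wqkv parameter. The
    d_model value is an assumption matching mosaicml/mpt-7b.

        >>> import numpy as np
        >>> d = 4096                                    # d_model, illustrative
        >>> wq = wk = wv = np.zeros(d * d, np.float32)  # flat arrays, as read by np.fromfile
        >>> np.concatenate([wq, wk, wv], axis=0).reshape(3 * d, d).shape
        (12288, 4096)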
""" - + d_model = model.config.d_model print("dimension of the model is: ", d_model) - + qkv_weights = {} - + for file_name in os.listdir(src_folder): weight_path = os.path.join(src_folder, file_name) if weight_path.endswith("rev_sha.txt"): @@ -330,54 +318,60 @@ def load_weights_into_hf_model(model, src_folder): print("skipping lm_head.weight") continue else: - original_name = FlexFlowMPT.convert_ff_weight_name(file_name.replace('.bin', '')) + original_name = FlexFlowMPT.convert_weight_name_ff2hf(file_name) print("\nconverting weights name of: ", file_name, "to ", original_name) - + if not os.path.exists(weight_path): raise FileNotFoundError(f"No weight file found for {file_name}") - + weight_data = np.fromfile(weight_path, dtype=np.float32) - print(f"Data type after conversion: {weight_data.dtype}, Size: {weight_data.size}") - + print( + f"Data type after conversion: {weight_data.dtype}, Size: {weight_data.size}" + ) + # Special handling for combined QKV weights - if ("q_proj" in file_name) or ("k_proj" in file_name) or ("v_proj" in file_name): + if ( + ("q_proj" in file_name) + or ("k_proj" in file_name) + or ("v_proj" in file_name) + ): layer_num_match = re.search(r"layers\.(\d+)", original_name) layer_num = int(layer_num_match.group(1)) if layer_num_match else None qkv_type = original_name.split("_")[-2] - + if layer_num is not None: qkv_key = f"layers.{layer_num}.attn_Wqkv" # initialize qkv layer in dict if qkv_key not in qkv_weights: - qkv_weights[qkv_key] = {'wq': None, 'wk': None, 'wv': None} + qkv_weights[qkv_key] = {"wq": None, "wk": None, "wv": None} print(f"Initialized QKV layer {layer_num}") # assign weights into dict qkv_weights[qkv_key][qkv_type] = weight_data - + continue - + # for weights that are not q,k,v, get the param names and assign weights accordingly param = model.state_dict().get(original_name, None) if weight_data.size != param.numel(): - raise ValueError(f"Shape mismatch for {original_name}, model expects {param.numel()} elements, got {weight_data.size}") - + raise ValueError( + f"Shape mismatch for {original_name}, model expects {param.numel()} elements, got {weight_data.size}" + ) + weight_tensor = torch.from_numpy(weight_data).reshape(param.shape) with torch.no_grad(): model.state_dict()[original_name].copy_(weight_tensor) - for qkv_key, weights_dict in qkv_weights.items(): - wq, wk, wv = weights_dict['wq'], weights_dict['wk'], weights_dict['wv'] + wq, wk, wv = weights_dict["wq"], weights_dict["wk"], weights_dict["wv"] if None in (wq, wk, wv): raise ValueError(f"Missing weights for {qkv_key}") combined_qkv = np.concatenate([wq, wk, wv], axis=0) - qkv_name = qkv_key.replace("layers.", "transformer.blocks.")+".weight" - + qkv_name = qkv_key.replace("layers.", "transformer.blocks.") + ".weight" + param_shape = model.state_dict()[qkv_name].shape combined_qkv_reshaped = combined_qkv.reshape(param_shape) model.state_dict()[qkv_name].copy_(torch.from_numpy(combined_qkv_reshaped)) print(f"Assigned combined QKV weights to {qkv_key}.") - \ No newline at end of file diff --git a/python/flexflow/serve/models/opt.py b/python/flexflow/serve/models/opt.py index e346f39361..5aaf34ce03 100644 --- a/python/flexflow/serve/models/opt.py +++ b/python/flexflow/serve/models/opt.py @@ -286,7 +286,7 @@ def build_model(self, max_tokens_per_batch): self.ffmodel = ffmodel - def convert_hf_weight_name(name): + def convert_weight_name_hf2ff(name): return ( name.replace("decoder.", "") .replace("model.", "") @@ -297,90 +297,79 @@ def convert_hf_weight_name(name): ) # important to use the 
leading "_" to avoid matching the last LayerNorm ) + def convert_weight_name_ff2hf(name): + return ( + ("model.decoder." + name) + .replace(".add_bias_residual_layer_norm", ".final_layer_norm") + .replace("add_bias_residual_layer_norm.attn_bias", "self_attn.o_proj.bias") + .replace("self_attn.o_proj", "self_attn.out_proj") + ) + def convert_hf_model(model, dst_folder): os.makedirs(dst_folder, exist_ok=True) for name, params in model.named_parameters(): - name = FlexFlowOPT.convert_hf_weight_name(name) + name = FlexFlowOPT.convert_weight_name_hf2ff(name) params.detach().cpu().numpy().tofile(f"{dst_folder}/{name}") # copy embedding weights shutil.copy( os.path.join(dst_folder, "embed_tokens.weight"), os.path.join(dst_folder, "lm_head.weight"), ) - - def convert_ff_weight_name(name): - # Reverse the previous conversion rules - converted_name = ( - name - .replace("add_bias_residual_layer_norm.attn_bias", "attention_wo_bias") - .replace(".add_bias_residual_layer_norm", ".final_layer_norm") - .replace("wq", "q_proj") - .replace("wk", "k_proj") - .replace("wv", "v_proj") - .replace("wo", "out_proj") - .replace("self_attn.o_proj", "self_attn.out_proj") - .replace("attention", "self_attn") - ) - - converted_name = re.sub(r"layers_(\d+)_", r"layers.\1.", converted_name) - converted_name = re.sub(r"_(bias|weight)$", r".\1", converted_name) - converted_name = re.sub(r"self_attn_(?!layer_norm)", "self_attn.", converted_name) - converted_name = converted_name.replace("embed_tokens_weight_lm_head", "embed_tokens.weight") - - # Prepend "model.decoder." to the weight name - if not converted_name.startswith("model.decoder.") and "lm_head" not in converted_name: - converted_name = "model.decoder." + converted_name - - return converted_name - def load_weights_into_hf_model(model, src_folder): """ Load weights from a specified folder and apply them to a Hugging Face model. - - This function iterates through the weight files in the specified folder, - converts the FlexFlow weight names to Hugging Face format, and loads the - weights into the Hugging Face model. It handles special cases like shape + + This function iterates through the weight files in the specified folder, + converts the FlexFlow weight names to Hugging Face format, and loads the + weights into the Hugging Face model. It handles special cases like shape mismatches by adjusting the weights accordingly. - + Parameters: - model: The instance of the Hugging Face model to load the weights into. - src_folder: The path to the folder containing the weight files. 
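    A hedged usage sketch (commentary, not part of this patch) of how this loader is
    driven standalone; LLM.upload_hf_model() in serve.py calls it the same way. The
    model id and cache path below are placeholders.

        >>> import os
        >>> from transformers import AutoConfig, AutoModelForCausalLM
        >>> config = AutoConfig.from_pretrained("facebook/opt-125m")
        >>> hf_model = AutoModelForCausalLM.from_config(config)
        >>> FlexFlowOPT.load_weights_into_hf_model(
        ...     hf_model,
        ...     os.path.expanduser("~/.cache/flexflow/weights/facebook/opt-125m/half-precision"),
        ... )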
""" - + for file_name in os.listdir(src_folder): weight_path = os.path.join(src_folder, file_name) print("Converting weight name:", weight_path) - + if weight_path.endswith("rev_sha.txt"): print("Skipping rev_sha.txt") continue - original_name = FlexFlowOPT.convert_ff_weight_name(file_name.replace('.bin', '')) - print("Original name of the weights is:", original_name) + original_name = FlexFlowOPT.convert_weight_name_ff2hf(file_name) + print(f"Converting weight name: {file_name} to {original_name}") if not os.path.exists(weight_path): raise FileNotFoundError(f"No weight file found for {file_name}") - - weight_data = np.fromfile(weight_path, dtype=np.float16).astype(np.float32) + + ff_dtype = np.float32 if "full-precision" in weight_path else np.float16 + weight_data = np.fromfile( + weight_path, dtype=ff_dtype + ) # .astype(np.float32) if original_name not in model.state_dict(): raise KeyError(f"Parameter {original_name} not found in model.") param = model.state_dict()[original_name] - + # Calculate the reshape size automatically based on expected parameter size expected_numel = param.numel() if weight_data.size != expected_numel: - print(f"Adjusting shape for {original_name} from {weight_data.size} to {expected_numel}") + print( + f"Adjusting shape for {original_name} from {weight_data.size} to {expected_numel}" + ) # Check if weight_data can be evenly divided by expected_numel if weight_data.size % expected_numel == 0: # Determine the reshape size factor = weight_data.size // expected_numel - new_shape = (factor, ) + tuple(param.shape) + new_shape = (factor,) + tuple(param.shape) weight_data_reshaped = weight_data.reshape(new_shape) weight_tensor = torch.from_numpy(weight_data_reshaped[0]) else: - raise ValueError(f"Cannot adjust shape for {original_name} due to incompatible size.") + raise ValueError( + f"Cannot adjust shape for {original_name} due to incompatible size." + ) else: weight_tensor = torch.from_numpy(weight_data).reshape(param.shape) - + with torch.no_grad(): - model.state_dict()[original_name].copy_(weight_tensor) \ No newline at end of file + param.copy_(weight_tensor) diff --git a/python/flexflow/serve/models/starcoder.py b/python/flexflow/serve/models/starcoder.py index 41f534ef1f..d52e03aecf 100644 --- a/python/flexflow/serve/models/starcoder.py +++ b/python/flexflow/serve/models/starcoder.py @@ -226,7 +226,7 @@ def build_model(self, max_tokens_per_batch): def convert_hf_model(model, dst_folder): os.makedirs(dst_folder, exist_ok=True) for name, params in model.named_parameters(): - name = name.replace("transformer.h", "layers").replace("transformer", "") + name = name.replace("transformer.h", "layers").replace("transformer.", "") if "attn.c_attn.weight" in name: name_q = name.replace("attn.c_attn", "attn.c_attn.q_proj") name_k = name.replace("attn.c_attn", "attn.c_attn.k_proj") @@ -270,25 +270,10 @@ def convert_hf_model(model, dst_folder): model.lm_head.weight.detach().cpu().numpy().tofile( os.path.join(dst_folder, "lm_head.weight") ) - - - def convert_ff_weight_name(name): - """ - Convert weight names from FlexFlow format back to Hugging Face format. 
- """ - converted_name = name - converted_name = converted_name.replace("attn.c_attn.o_proj", "attn.c_proj") - - converted_name = converted_name.replace("mlp_", "mlp.").replace("_ln_f", ".ln_f").replace("_wpe", ".wpe").replace("_wte", ".wte") - if ("ln_f" in converted_name) or ("wpe" in converted_name) or ("wte" in converted_name): - converted_name = "transformer"+converted_name - converted_name = re.sub(r"layers.(\d+).", r"transformer.h.\1.", converted_name) - converted_name = re.sub(r"_(bias|weight)$", r".\1", converted_name) - - - return converted_name - - + + def convert_weight_name_ff2hf(name): + return "transformer." + name.replace("layers.", "h.") + def load_weights_into_hf_model(model, src_folder): """ Load weights from a specified folder and apply them to a Hugging Face model. @@ -297,20 +282,17 @@ def load_weights_into_hf_model(model, src_folder): - model: The instance of the Hugging Face model to load the weights into. - src_folder: The path to the folder containing the weight files. """ - + hidden_size = model.config.hidden_size n_head = ( model.config.n_head if "n_head" in model.config.__dict__ else model.config.num_attention_heads ) - + print("Model hidden size:", hidden_size) print("Model num_attention_heads:", n_head) - - num_attention_heads = n_head - hidden_size_per_head = hidden_size // n_head - + qkv_weights = {} for file_name in os.listdir(src_folder): @@ -320,57 +302,75 @@ def load_weights_into_hf_model(model, src_folder): print("skipping rev_sha.txt") continue else: - original_name = FlexFlowSTARCODER.convert_ff_weight_name(file_name.replace('.bin', '')) + original_name = FlexFlowSTARCODER.convert_weight_name_ff2hf(file_name) print(f"Converted weight name: {file_name} to {original_name}") - + if not os.path.exists(weight_path): raise FileNotFoundError(f"No weight file found for {file_name}") weight_data = np.fromfile(weight_path, dtype=np.float32) - print(f"Data type after conversion: {weight_data.dtype}, Size: {weight_data.size}") - + print( + f"Data type after conversion: {weight_data.dtype}, Size: {weight_data.size}" + ) + # Special handling for combined QKV weights - if ("q_proj" in original_name) or ("k_proj" in original_name) or ("v_proj" in original_name): + if ( + ("q_proj" in original_name) + or ("k_proj" in original_name) + or ("v_proj" in original_name) + ): weight_bias = ".weight" if ".weight" in original_name else ".bias" layer_num_match = re.search(r"layers.(\d+)", file_name) layer_num = int(layer_num_match.group(1)) if layer_num_match else None print(f"layer_num is {layer_num}") qkv_type = file_name.split("_")[-2] qkv_name = f"transformer.h.{layer_num}.attn.c_attn" + weight_bias - + if layer_num is not None: # initialize qkv layer in dict if qkv_name not in qkv_weights: - qkv_weights[qkv_name] = {'attn.q': None, 'attn.k': None, 'attn.v': None} + qkv_weights[qkv_name] = { + "attn.q": None, + "attn.k": None, + "attn.v": None, + } print(f"Initialized QKV layer {layer_num}") # assign weights into dict qkv_weights[qkv_name][qkv_type] = weight_data - print(f"attached qkv weight {qkv_name} for type {qkv_type}, weight data dimension is {weight_data.shape}") - + print( + f"attached qkv weight {qkv_name} for type {qkv_type}, weight data dimension is {weight_data.shape}" + ) + continue - + # Handling for other parameters # for weights that are not q,k,v, get the param names and assign weights accordingly param = model.state_dict().get(original_name, None) print(f"Param name: {original_name}") if weight_data.size != param.numel(): - raise ValueError(f"Shape 
mismatch for {original_name}, model expects {param.numel()} elements, got {weight_data.size}") - + raise ValueError( + f"Shape mismatch for {original_name}, model expects {param.numel()} elements, got {weight_data.size}" + ) + weight_tensor = torch.from_numpy(weight_data).reshape(param.shape) print(f"shape of the weight tensor is: {weight_tensor.shape}") with torch.no_grad(): model.state_dict()[original_name].copy_(weight_tensor) print(f"Assigned weight {original_name} successfully!\n") - - + for qkv_name, weights_dict in qkv_weights.items(): print(f"qkv name is {qkv_name}, with weight {weights_dict}") - combined_qkv = np.concatenate([qkv_weights[qkv_name]['attn.q'], qkv_weights[qkv_name]['attn.k'], qkv_weights[qkv_name]['attn.v']], axis=0) + combined_qkv = np.concatenate( + [ + qkv_weights[qkv_name]["attn.q"], + qkv_weights[qkv_name]["attn.k"], + qkv_weights[qkv_name]["attn.v"], + ], + axis=0, + ) param_shape = model.state_dict()[qkv_name].shape combined_qkv_reshaped = combined_qkv.reshape(param_shape) print(f"reshaped qkv weights shape is: {combined_qkv_reshaped.shape}") model.state_dict()[qkv_name].copy_(torch.from_numpy(combined_qkv_reshaped)) print(f"Assigned combined QKV weights to {qkv_name}.") - - \ No newline at end of file diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py index 06da1ff6ff..c7b03aeba7 100644 --- a/python/flexflow/serve/serve.py +++ b/python/flexflow/serve/serve.py @@ -31,48 +31,10 @@ from peft import PeftModel, PeftConfig from huggingface_hub import HfApi, HfFolder, Repository import torch, shutil, hashlib, json, gc, os -from huggingface_hub import HfApi, HfFolder, Repository -import torch, shutil, hashlib, json, gc, os from typing import Union, List import tempfile - -class GenerationConfig: - """A class to store the sampling configs.""" - - def __init__( - self, - do_sample: bool = False, - temperature: float = 0.9, - topp: float = 0.8, - topk: int = 1, - ): - """Initialize the sampling configs - - :param do_sample: Whether to perform sampling, or use greedy decoding, defaults to False - :type do_sample: bool, optional - :param temperature: The temperature setting, defaults to 0.9 - :type temperature: float, optional - :param topp: The top probabilities (top-p) setting, defaults to 0.8 - :type topp: float, optional - :param topk: The top-k setting, defaults to 1 - :type topk: int, optional - """ - self.do_sample = do_sample - self.temperature = temperature - self.topp = topp - self.topk = topk - - -class GenerationResult: - """A class to store the output of a generation request.""" - - def __init__(self, text: str = None, tokens: list = None): - self.output_text = text - self.output_tokens = tokens - - class _SupportedModels: def __init__( self, @@ -180,7 +142,6 @@ def get_ff_peft_id(self, peft_model_id: str) -> PEFTModelID: ) return peft_dict["ff_peft_model_id"] - def download_hf_config(self): """Save the HuggingFace model configs to a json file. 
Useful mainly to run the C++ inference code.""" config_dir = os.path.join( @@ -300,7 +261,7 @@ def convert_peft_model(hf_peft_model, peft_type, weights_path): name = name.replace("base_model.model.model.", "").replace( ".default", "" ) - name = self.model_class.convert_hf_weight_name(name) + name = self.model_class.convert_weight_name_hf2ff(name) params.detach().cpu().numpy().tofile(f"{weights_path}/{name}") def download_peft_weights(): @@ -385,25 +346,6 @@ def download_hf_tokenizer_if_needed(self): else: print(f"Loading '{self.model_name}' tokenizer from the cache...") - def __load_hf_weights(self): - print("Loading hf weights...") - - self.download_hf_weights_if_needed() - - # Create file data loader, load weights into tensors - model_configs = self.config_class(self.hf_config) - - self.fileloader = FileDataLoader( - self.weights_path, - model_configs.num_attention_heads, - model_configs.num_key_value_heads, - model_configs.hidden_size, - model_configs.hidden_size // model_configs.num_attention_heads, - self.ffconfig.tensor_parallelism_degree, - ) - - self.fileloader.load_weights(self.model.ffmodel, self.data_type) - def upload_hf_model(self, new_model_id: str, private: bool = False): """ Uploads the model to the Hugging Face Hub, with reverse conversion of weights. @@ -436,13 +378,15 @@ def upload_hf_model(self, new_model_id: str, private: bool = False): shutil.copy(os.path.join(self.tokenizer_path, file_name), temp_dir) # Delete rev_sha.txt from the temporary directory if it exists - rev_sha_path = os.path.join(temp_dir, 'rev_sha.txt') + rev_sha_path = os.path.join(temp_dir, "rev_sha.txt") if os.path.exists(rev_sha_path): os.remove(rev_sha_path) # Ensure Hugging Face CLI is logged in if not HfFolder.get_token(): - print("Hugging Face token not found. Please login using `huggingface-cli login`.") + print( + "Hugging Face token not found. Please login using `huggingface-cli login`." + ) return # Upload the model @@ -455,9 +399,7 @@ def upload_hf_model(self, new_model_id: str, private: bool = False): shutil.rmtree(temp_dir) print("Upload process completed.") - - - + def upload_peft_model(self, new_model_id: str, private: bool = False): """ Uploads the peft model to the Hugging Face Hub, with reverse conversion of weights. @@ -490,13 +432,15 @@ def upload_peft_model(self, new_model_id: str, private: bool = False): shutil.copy(os.path.join(self.tokenizer_path, file_name), temp_dir) # Delete rev_sha.txt from the temporary directory if it exists - rev_sha_path = os.path.join(temp_dir, 'rev_sha.txt') + rev_sha_path = os.path.join(temp_dir, "rev_sha.txt") if os.path.exists(rev_sha_path): os.remove(rev_sha_path) # Ensure Hugging Face CLI is logged in if not HfFolder.get_token(): - print("Hugging Face token not found. Please login using `huggingface-cli login`.") + print( + "Hugging Face token not found. Please login using `huggingface-cli login`." 
+ ) return # Upload the model @@ -509,7 +453,6 @@ def upload_peft_model(self, new_model_id: str, private: bool = False): shutil.rmtree(temp_dir) print("Upload process completed.") - def compile( self, @@ -604,8 +547,7 @@ def compile( self.rm.set_max_spec_tree_token_num( model_configs.max_spec_tree_token_num - if "max_spec_tree_token_num" - in model_configs.__dict__ + if "max_spec_tree_token_num" in model_configs.__dict__ else 20 ) @@ -766,203 +708,3 @@ def compile( model_specific_pipeline_parallelism_degree, ssms, ) - -class PEFT: - """This class creates a PEFT (parameter-efficient transformer) object to be used in concert with a LLM or SSM""" - - def __init__( - self, - peft_model_id: str, - data_type: DataType = DataType.DT_HALF, - cache_path: str = "", - refresh_cache: bool = False, - ): - self.hf_config = PeftConfig.from_pretrained(peft_model_id) - self.peft_model_id = peft_model_id - self.peft_type = self.hf_config.peft_type - if self.peft_type != "LORA": - raise RuntimeError( - f"PEFT type {self.peft_type} not yet supported in FlexFlow" - ) - self.data_type = data_type - assert self.data_type == DataType.DT_HALF or self.data_type == DataType.DT_FLOAT - self.cache_path = cache_path if len(cache_path) > 0 else "~/.cache/flexflow" - self.refresh_cache = refresh_cache - # Base model related - if "base_model_name_or_path" not in self.hf_config.to_dict(): - raise ValueError( - f"PEFT model {peft_model_id} does not have an associated based model" - ) - self.base_model = LLM( - self.hf_config.base_model_name_or_path, data_type, cache_path, refresh_cache - ) - - def download_hf_config(self): - """Save the HuggingFace model configs to a json file. Useful mainly to run the C++ inference code.""" - self.config_dir = os.path.join( - os.path.expanduser(self.cache_path), "configs", self.peft_model_id.lower() - ) - self.config_path = os.path.join(self.config_dir, "config.json") - os.makedirs(self.config_dir, exist_ok=True) - print(f"Creating directory {self.config_dir} (if it doesn't exist)...") - print(f"Saving {self.peft_model_id} configs to file {self.config_path}...") - with open(self.config_path, "w") as json_file: - class SetEncoder(json.JSONEncoder): - def default(self, obj): - if isinstance(obj, set): - return list(obj) - return super().default(obj) - json.dump(self.hf_config.to_dict(), json_file, indent=2, cls=SetEncoder) - - def __get_revision_hashes(self, peft_model_id: str): - ff_revision = None - ff_revision_file = os.path.join(self.weights_path, "rev_sha.txt") - if os.path.exists(ff_revision_file): - ff_revision = "".join(open(ff_revision_file).read().split()) - - if os.path.exists(peft_model_id) and os.path.isdir(peft_model_id): - # Local model - files = os.listdir(peft_model_id) - state = files + [ - os.path.getmtime(os.path.join(peft_model_id, f)) for f in files - ] - latest_revision = hashlib.md5(str(state).encode("utf-8")).hexdigest() - else: - # Remote HuggingFace model - hf_api = HfApi() - latest_revision = hf_api.model_info(self.peft_model_id).sha - return ff_revision, ff_revision_file, latest_revision - - def convert_peft_model(self, hf_peft_model, weights_path): - for name, params in hf_peft_model.named_parameters(): - if self.peft_type.lower() in name: - name = name.replace("base_model.model.model.", "").replace( - ".default", "" - ) - name = self.base_model.model_class.convert_hf_weight_name(name) - params.detach().cpu().numpy().tofile(f"{weights_path}/{name}") - - def download_hf_weights_if_needed(self): - """Check in the folder specified by the cache_path whether the 
PEFT's model weights are available and up to date. - If not, or if the refresh_cache parameter is set to True, download new weights. - """ - # Use local cache, or download new version - self.weights_path = os.path.join( - os.path.expanduser(self.cache_path), - "weights", - self.peft_model_id.lower(), - "full-precision" - if self.data_type == DataType.DT_FLOAT - else "half-precision", - ) - if self.refresh_cache: - print( - f"Refreshing weights in cache for model {self.peft_model_id} at path {self.weights_path} ..." - ) - if os.path.exists(self.weights_path): - shutil.rmtree(self.weights_path) - os.makedirs(self.weights_path, exist_ok=True) - print(f"Creating directory {self.weights_path} (if it doesn't exist)...") - - ff_revision, ff_revision_file, latest_revision = self.__get_revision_hashes( - self.peft_model_id - ) - - # Download if needed - if ff_revision != latest_revision: - if not os.path.exists(self.peft_model_id) or os.path.isdir( - self.peft_model_id - ): - # Local model - print( - f"'{self.peft_model_id}' model weights not found in cache or outdated. Downloading from huggingface.co ..." - ) - else: - # Remote model - print( - f"'{self.peft_model_id}' local model weights were updated! Converting new weights now..." - ) - # Download base model from HuggingFace, or load it from the local folder - self.base_model.download_hf_weights_if_needed() - self.base_model.download_hf_tokenizer_if_needed() - self.base_model.download_hf_config() - hf_base_model = AutoModelForCausalLM.from_pretrained( - self.hf_config.base_model_name_or_path, - return_dict=True, - trust_remote_code=True, - torch_dtype=torch.float32 - if self.data_type == DataType.DT_FLOAT - else torch.float16, - # device_map="auto", - ) - hf_peft_model = PeftModel.from_pretrained(hf_base_model, self.peft_model_id) - # Print log message to notify user download of model has finished - if not os.path.exists(self.peft_model_id) or os.path.isdir( - self.peft_model_id - ): - print("Done downloading HF weights. Converting them now...") - # Convert the model to FlexFlow format - self.convert_peft_model(hf_peft_model, self.weights_path) - # Save new revision hash to file - with open(ff_revision_file, "w+") as f: - f.write(latest_revision) - print("Done converting the weights...") - # Deallocate hf model - del hf_peft_model - del hf_base_model - gc.collect() - torch.cuda.empty_cache() - else: - print(f"Loading '{self.peft_model_id}' model weights from the cache...") - - def upload_hf_model(self, new_model_id: str, private: bool = False): - """ - Uploads the model to the Hugging Face Hub, with reverse conversion of weights. - - :param new_model_id: The new model ID for the Hugging Face Hub. - :param private: Whether to upload the model as a private model. 
- """ - print(f"Preparing model for upload to Hugging Face Hub: {new_model_id}") - print("Tokenizer path is: ", self.tokenizer_path) - - # Initialize a new Hugging Face model instance - hf_model = AutoModelForCausalLM.from_config(self.hf_config) - weights_path = self.weights_path - print(f"Model class is: {self.model_class}") - - # Load FlexFlow weights into the Hugging Face model instance - try: - self.model_class.load_weights_into_hf_model(hf_model, weights_path) - except Exception as e: - print(f"Error loading weights into model: {e}") - return - - # Save the model with converted weights to a temporary directory - temp_dir = tempfile.mkdtemp() - hf_model.save_pretrained(temp_dir) - - # Copy the tokenizer files to the temporary directory - tokenizer_files = [f for f in os.listdir(self.tokenizer_path)] - for file_name in tokenizer_files: - shutil.copy(os.path.join(self.tokenizer_path, file_name), temp_dir) - - # Delete rev_sha.txt from the temporary directory if it exists - rev_sha_path = os.path.join(temp_dir, 'rev_sha.txt') - if os.path.exists(rev_sha_path): - os.remove(rev_sha_path) - - # Ensure Hugging Face CLI is logged in - if not HfFolder.get_token(): - print("Hugging Face token not found. Please login using `huggingface-cli login`.") - return - - # Upload the model - api = HfApi() - print(f"Uploading processed model to Hugging Face Hub: {new_model_id}") - api.create_repo(repo_id=new_model_id, private=private, exist_ok=True) - api.upload_folder(folder_path=temp_dir, repo_id=new_model_id) - - # Cleanup temporary directory - shutil.rmtree(temp_dir) - - print("Upload process completed.") \ No newline at end of file From 613eb6d6be14a8af23d481cf0bf1c99f8a0857fc Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 29 May 2024 08:12:07 +0000 Subject: [PATCH 51/55] remove submodule --- FlexFlow | 1 - 1 file changed, 1 deletion(-) delete mode 160000 FlexFlow diff --git a/FlexFlow b/FlexFlow deleted file mode 160000 index d54e4b6a74..0000000000 --- a/FlexFlow +++ /dev/null @@ -1 +0,0 @@ -Subproject commit d54e4b6a747f3940a19989a56095a71540e4c0d8 From f73d556262f212510fa93fe4e38ab7fa1e81a9a7 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 5 Jun 2024 12:46:14 -0400 Subject: [PATCH 52/55] add test --- inference/utils/download_upload_hf.py | 51 ----------------------- inference/utils/upload_hf_model.py | 55 ++++++++++++++++++++++++ python/flexflow/serve/serve.py | 54 +++++++++++------------- tests/upload_test.sh | 60 +++++++++++++++++++++++++++ 4 files changed, 140 insertions(+), 80 deletions(-) delete mode 100644 inference/utils/download_upload_hf.py create mode 100644 inference/utils/upload_hf_model.py create mode 100644 tests/upload_test.sh diff --git a/inference/utils/download_upload_hf.py b/inference/utils/download_upload_hf.py deleted file mode 100644 index 1588ef064d..0000000000 --- a/inference/utils/download_upload_hf.py +++ /dev/null @@ -1,51 +0,0 @@ - -# this script is for testing downloading a model from huggingface and uploading it back to huggingface -# after the model is downloaded it will be transformed into flexflow format -# before uploading it back to huggingface, we need to convert it back to huggingface format -# which is done by calling llm.upload_hf_model() - -#!/usr/bin/env python -import argparse -from huggingface_hub import HfApi, HfFolder -import flexflow.serve as ff -import warnings - -warnings.filterwarnings("ignore") - -def parse_args(): - parser = argparse.ArgumentParser(description="Download a model with FlexFlow, process it, and upload it to the 
Hugging Face Hub.") - parser.add_argument("model_name", type=str, help="Original Hugging Face model ID to download and process (e.g., 'facebook/opt-125m').") - parser.add_argument("--new-model-id", type=str, required=True, help="New Hugging Face Hub model ID for upload (e.g., 'your_username/new-model-name').") - parser.add_argument("--cache-folder", type=str, default="~/.cache/flexflow", help="Folder to use to store and process the model(s) assets in FlexFlow format.") - parser.add_argument("--private", action="store_true", help="Whether to upload the processed model as a private model on Hugging Face Hub.") - parser.add_argument("--refresh-cache", action="store_true", help="Use this flag to force the refresh of the model(s) weights/tokenizer cache.") - parser.add_argument("--full-precision", action="store_true", help="Download the full precision version of the weights.") - return parser.parse_args() - - -def main(): - model_name = "tiiuae/falcon-7b" - new_model_id = "aprilyyt/falcon-upload-test-new" - cache_folder = "~/.cache/flexflow" - private = True - refresh_cache = False - full_precision = True - - data_type = ff.DataType.DT_FLOAT if full_precision else ff.DataType.DT_HALF - print(f"Downloading and processing model: {model_name}") - llm = ff.LLM( - model_name=model_name, - data_type=data_type, - cache_path=cache_folder, - refresh_cache=refresh_cache, - ) - llm.download_hf_weights_if_needed() - llm.download_hf_tokenizer_if_needed() - llm.download_hf_config() - - print(f"Uploading processed model to Hugging Face Hub: {new_model_id}") - llm.upload_hf_model(new_model_id, private=private) - print("Upload completed successfully.") - -if __name__ == "__main__": - main() \ No newline at end of file diff --git a/inference/utils/upload_hf_model.py b/inference/utils/upload_hf_model.py new file mode 100644 index 0000000000..59e4573461 --- /dev/null +++ b/inference/utils/upload_hf_model.py @@ -0,0 +1,55 @@ + +# this script is for testing downloading a model from huggingface and uploading it back to huggingface +# after the model is downloaded it will be transformed into flexflow format +# before uploading it back to huggingface, we need to convert it back to huggingface format +# which is done by calling llm.upload_hf_model() + +#!/usr/bin/env python +import argparse, os +import flexflow.serve as ff +import warnings + +warnings.filterwarnings("ignore") + +def parse_args(): + parser = argparse.ArgumentParser( + description="Download a model with FlexFlow, process it, and upload it to the Hugging Face Hub." + ) + parser.add_argument( + "model_name", + type=str, + help="Original Hugging Face model ID to download and process (e.g., 'facebook/opt-125m')." + ) + parser.add_argument( + "--new-model-id", + type=str, + required=True, + help="New Hugging Face Hub model ID for upload (e.g., 'your_username/new-model-name')." 
+ ) + parser.add_argument( + "--cache-folder", + type=str, + help="Folder to use to store the model(s) assets in FlexFlow format", + default=os.environ.get("FF_CACHE_PATH", ""), + ) + parser.add_argument("--private", action="store_true", help="Whether to upload the processed model as a private model on Hugging Face Hub.") + parser.add_argument("--full-precision", action="store_true", help="Download the full precision version of the weights.") + return parser.parse_args() + + +def main(): + args = parse_args() + data_type = ff.DataType.DT_FLOAT if args.full_precision else ff.DataType.DT_HALF + print(f"Downloading and processing model: {args.model_name}") + llm = ff.LLM( + model_name=args.model_name, + data_type=data_type, + cache_path=args.cache_folder, + refresh_cache=False, + ) + print(f"Uploading processed model to Hugging Face Hub: {args.new_model_id}") + llm.upload_hf_model(args.new_model_id, private=args.private) + print("Upload completed successfully.") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py index c7b03aeba7..08807a91c2 100644 --- a/python/flexflow/serve/serve.py +++ b/python/flexflow/serve/serve.py @@ -100,6 +100,12 @@ def __init__( self.data_type = data_type assert self.data_type == DataType.DT_HALF or self.data_type == DataType.DT_FLOAT self.cache_path = cache_path if len(cache_path) > 0 else "~/.cache/flexflow" + self.weights_path = self.__get_weights_path(self.model_name) + self.tokenizer_path = os.path.join( + os.path.expanduser(self.cache_path), + "tokenizers", + self.model_name.lower(), + ) self.refresh_cache = refresh_cache self.output_file = output_file self.rm = None @@ -194,6 +200,18 @@ def __get_revision_hashes(self, model_name: str, folder: str): latest_revision = hf_api.model_info(self.model_name).sha return ff_revision, ff_revision_file, latest_revision + def __get_weights_path(self, model_name): + return os.path.join( + os.path.expanduser(self.cache_path), + "weights", + model_name.lower(), + ( + "full-precision" + if self.data_type == DataType.DT_FLOAT + else "half-precision" + ), + ) + def download_hf_weights_if_needed(self): """Check in the folder specified by the cache_path whether the LLM's model weights are available and up to date. If not, or if the refresh_cache parameter is set to True, download new weights. @@ -201,20 +219,8 @@ def download_hf_weights_if_needed(self): If any PEFT adapter is registered, perform the same operation for PEFT. """ - def get_weights_path(model_name): - return os.path.join( - os.path.expanduser(self.cache_path), - "weights", - model_name.lower(), - ( - "full-precision" - if self.data_type == DataType.DT_FLOAT - else "half-precision" - ), - ) - def refresh_cache_if_needed(model_name): - weights_path = get_weights_path(model_name) + weights_path = self.__get_weights_path(model_name) if self.refresh_cache: print( f"Refreshing weights in cache for model {model_name} at path {weights_path} ..." 
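# Hedged usage sketch (commentary, not part of the patch): the download/upload round
# trip that the refactored cache paths above support, mirroring
# inference/utils/upload_hf_model.py. The model id and destination repo are placeholders.
#
#   import flexflow.serve as ff
#   llm = ff.LLM("facebook/opt-125m", data_type=ff.DataType.DT_HALF)
#   llm.download_hf_weights_if_needed()    # fills <cache>/weights/facebook/opt-125m/half-precision
#   llm.download_hf_tokenizer_if_needed()  # fills <cache>/tokenizers/facebook/opt-125m
#   llm.download_hf_config()
#   llm.upload_hf_model("your_username/opt-125m-roundtrip", private=True)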
@@ -269,7 +275,7 @@ def download_peft_weights(): peft_config = peft_dict["peft_config"] peft_type = peft_dict["peft_type"] - weights_path = get_weights_path(peft_model_id) + weights_path = self.__get_weights_path(peft_model_id) refresh_cache_if_needed(peft_model_id) ff_revision, ff_revision_file, latest_revision = ( self.__get_revision_hashes(peft_model_id, weights_path) @@ -295,7 +301,6 @@ def download_peft_weights(): gc.collect() torch.cuda.empty_cache() - self.weights_path = get_weights_path(self.model_name) download_llm_weights() download_peft_weights() @@ -306,11 +311,6 @@ def download_hf_tokenizer_if_needed(self): print("Loading tokenizer...") # Use local cache, or download new version - self.tokenizer_path = os.path.join( - os.path.expanduser(self.cache_path), - "tokenizers", - self.model_name.lower(), - ) if self.refresh_cache: print( f"Refreshing cached tokenizer for model {self.model_name} at path {self.tokenizer_path} ..." @@ -353,17 +353,20 @@ def upload_hf_model(self, new_model_id: str, private: bool = False): :param new_model_id: The new model ID for the Hugging Face Hub. :param private: Whether to upload the model as a private model. """ + # Ensure Hugging Face CLI is logged in + if not HfFolder.get_token(): + raise RuntimeError("Hugging Face token not found. Please login using `huggingface-cli login`.") + print(f"Preparing model for upload to Hugging Face Hub: {new_model_id}") print("Tokenizer path is: ", self.tokenizer_path) # Initialize a new Hugging Face model instance hf_model = AutoModelForCausalLM.from_config(self.hf_config) - weights_path = self.weights_path print(f"Model class is: {self.model_class}") # Load FlexFlow weights into the Hugging Face model instance try: - self.model_class.load_weights_into_hf_model(hf_model, weights_path) + self.model_class.load_weights_into_hf_model(hf_model, self.weights_path) except Exception as e: print(f"Error loading weights into model: {e}") return @@ -382,13 +385,6 @@ def upload_hf_model(self, new_model_id: str, private: bool = False): if os.path.exists(rev_sha_path): os.remove(rev_sha_path) - # Ensure Hugging Face CLI is logged in - if not HfFolder.get_token(): - print( - "Hugging Face token not found. Please login using `huggingface-cli login`." - ) - return - # Upload the model api = HfApi() print(f"Uploading processed model to Hugging Face Hub: {new_model_id}") diff --git a/tests/upload_test.sh b/tests/upload_test.sh new file mode 100644 index 0000000000..c6b4e3d0f6 --- /dev/null +++ b/tests/upload_test.sh @@ -0,0 +1,60 @@ +#! /usr/bin/env bash +set -x +set -e + +# Cd into directory holding this script +cd "${BASH_SOURCE[0]%/*}" + +# Token to access private huggingface models (e.g. 
LLAMA-2) +HUGGINGFACE_TOKEN=${HUGGINGFACE_TOKEN:-none} +if [[ "$HUGGINGFACE_TOKEN" != "none" ]]; then + huggingface-cli login --token "$HUGGINGFACE_TOKEN" +fi + +# Create test prompt file +mkdir -p ../inference/prompt +echo '["San Francisco, officially the City and County of San Francisco, is a "]' > ../inference/prompt/test_upload.json + +# Create output folder +mkdir -p ../inference/output +mkdir -p ../inference/configs + +# Enable backtrace in case we run into a segfault or assertion failure +export LEGION_BACKTRACE=1 + +# Create config files +cat > ../inference/configs/llama_small.json < ../inference/configs/llama_small_upload.json < Date: Wed, 12 Jun 2024 08:38:50 +0000 Subject: [PATCH 53/55] add code to save peft weights to file --- include/flexflow/flexflow_c.h | 6 + include/flexflow/model.h | 2 +- include/flexflow/ops/lora_linear.h | 12 ++ include/flexflow/request_manager.h | 7 + inference/MODEL_WEIGHTS.md | 28 ---- inference/peft/peft.cc | 3 + inference/utils/download_peft_model.py | 2 +- python/flexflow/core/flexflow_cffi.py | 6 + src/c/flexflow_c.cc | 23 +++- src/ops/lora_linear.cc | 170 +++++++++++++++++++++++++ src/runtime/inference_manager.cc | 56 ++++++++ src/runtime/model.cc | 16 +++ src/runtime/request_manager.cc | 8 ++ 13 files changed, 308 insertions(+), 31 deletions(-) delete mode 100644 inference/MODEL_WEIGHTS.md diff --git a/include/flexflow/flexflow_c.h b/include/flexflow/flexflow_c.h index 97a382ee8b..c2c6ec66ce 100644 --- a/include/flexflow/flexflow_c.h +++ b/include/flexflow/flexflow_c.h @@ -1011,6 +1011,12 @@ void flexflow_request_manager_start_background_server( void flexflow_request_manager_terminate_background_server( flexflow_request_manager_t handle_); +void flexflow_request_manager_save_peft_weights( + flexflow_request_manager_t handle_, + flexflow_model_t model_handle_, + flexflow_peft_model_id_t peft_model_id_, + char const *destination_folder); + // ----------------------------------------------------------------------- // InferenceManager // ----------------------------------------------------------------------- diff --git a/include/flexflow/model.h b/include/flexflow/model.h index 099e2209e4..5f8c3a2de6 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -129,7 +129,7 @@ enum TaskIDs { LINEAR_BWD2_TASK_ID, LINEAR_UPD_TASK_ID, LORA_LINEAR_INIT_TASK_ID, - LORA_LINEAR_REG_TASK_ID, + LORA_LINEAR_SAVE_WEIGHTS_TASK_ID, LORA_LINEAR_INF_TASK_ID, LORA_LINEAR_PEFT_BWD_TASK_ID, FLAT_INIT_TASK_ID, diff --git a/include/flexflow/ops/lora_linear.h b/include/flexflow/ops/lora_linear.h index 9e83c3f90e..48d130a230 100644 --- a/include/flexflow/ops/lora_linear.h +++ b/include/flexflow/ops/lora_linear.h @@ -41,6 +41,13 @@ class LoraLinear : public Op { MachineView const *mv = nullptr) override; void forward(FFModel const &) override; void backward(FFModel const &) override; + void save_peft_weights(FFModel const &ff, + PEFTModelID const &model_id, + int rank, + std::string const &destination_folder, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv = nullptr); Legion::FutureMap inference(FFModel const &, BatchConfigFuture const &, std::vector const &, @@ -69,6 +76,11 @@ class LoraLinear : public Op { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static void + save_peft_weights_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); static void forward_task(Legion::Task const *task, std::vector const ®ions, Legion::Context 
ctx, diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index fe0e4b2f9d..06d5cfc43d 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -36,6 +36,9 @@ class InferenceManager { static InferenceManager *get_inference_manager(); void compile_model_and_allocate_buffer(FFModel *model); void init_operators_inference(FFModel *model); + void save_peft_weights(FFModel *model, + PEFTModelID const &model_id, + std::string const &destination_folder); Legion::FutureMap inference(FFModel *model, int index, BatchConfig const &bc); Legion::FutureMap inference(FFModel *model, int index, BatchConfigFuture const &bc); @@ -158,6 +161,10 @@ class RequestManager { FFModel *get_ssm_model(int model_id); + void save_peft_weights(FFModel *model, + PEFTModelID const &model_id, + std::string const &destination_folder); + void serve_incr_decoding(FFModel *model); void serve_spec_infer(FFModel *model); GenerationResult get_generation_result(RequestGuid const &guid); diff --git a/inference/MODEL_WEIGHTS.md b/inference/MODEL_WEIGHTS.md deleted file mode 100644 index d78fb37be9..0000000000 --- a/inference/MODEL_WEIGHTS.md +++ /dev/null @@ -1,28 +0,0 @@ -To convert the weights of a HuggingFace LLM to SpecInfer's weight format, we first load the model and modify the tensor names to match SpecInfer's convention, and then convert these tensors to numpy arrays to store them in binary files. - -```python -from transformers import AutoModelForCausalLM -model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf") - -for name, params in model.named_parameters(): - for name, params in model.named_parameters(): - name = ( - name.replace(".", "_") - .replace("self_attn", "attention") - .replace("q_proj", "wq") - .replace("k_proj", "wk") - .replace("v_proj", "wv") - .replace("o_proj", "wo") - .replace("mlp", "feed_forward") - .replace("gate_proj", "w1") - .replace("down_proj", "w2") - .replace("up_proj", "w3") - .replace("input_layernorm", "attention_norm") - .replace("post_attention_layernorm", "ffn_norm") - .replace("embed_tokens", "tok_embeddings") - .replace("lm_head", "output") - .replace("model_", "") - ) - params.detach().cpu().numpy().tofile('weights/llama_7B_weights/' + name) -``` - diff --git a/inference/peft/peft.cc b/inference/peft/peft.cc index f800b7f17c..26643c5d5f 100644 --- a/inference/peft/peft.cc +++ b/inference/peft/peft.cc @@ -352,6 +352,9 @@ void FlexFlow::top_level_task(Task const *task, future.get_void_result(); } + // rm->save_peft_weights(&model, *peft_model_id, + // std::string("/root/.cache/flexflow/finetuned_weights")); + if (peft_model_id != nullptr) { free(peft_model_id); } diff --git a/inference/utils/download_peft_model.py b/inference/utils/download_peft_model.py index 596612d8d7..dca714818f 100644 --- a/inference/utils/download_peft_model.py +++ b/inference/utils/download_peft_model.py @@ -6,7 +6,7 @@ def parse_args(): parser = argparse.ArgumentParser() parser.add_argument( - "--base_model_name", type=str, help="Name of the model to download" + "--base_model_name", type=str, required=True, help="Name of the model to download" ) parser.add_argument( "peft_model_ids", type=str, nargs="+", help="Name of the PEFT model(s) to download" diff --git a/python/flexflow/core/flexflow_cffi.py b/python/flexflow/core/flexflow_cffi.py index aa414f74d7..f415d71515 100644 --- a/python/flexflow/core/flexflow_cffi.py +++ b/python/flexflow/core/flexflow_cffi.py @@ -1624,6 +1624,12 @@ def start_server(self, model): def 
stop_server(self): return ffc().flexflow_request_manager_terminate_background_server(self.handle) + + def save_peft_weights(self, model, peft_model_id, destination_folder): + c_destination_folder = get_c_name(destination_folder) + return ffc().flexflow_request_manager_save_peft_weights( + self.handle, model.handle, peft_model_id.handle, c_destination_folder + ) # ----------------------------------------------------------------------- diff --git a/src/c/flexflow_c.cc b/src/c/flexflow_c.cc index 43fcd55a02..be9c7cc5b3 100644 --- a/src/c/flexflow_c.cc +++ b/src/c/flexflow_c.cc @@ -2722,6 +2722,27 @@ void flexflow_request_manager_terminate_background_server( handle->terminate_background_server(); } +void flexflow_request_manager_save_peft_weights( + flexflow_request_manager_t handle_, + flexflow_model_t model_handle_, + flexflow_peft_model_id_t peft_model_id_, + char const *destination_folder) { + RequestManager *handle = FFCObjectWrapper::unwrap(handle_); + FFModel *model_handle = FFCObjectWrapper::unwrap(model_handle_); + PEFTModelID *peft_model_id = FFCObjectWrapper::unwrap(peft_model_id_); + assert(peft_model_id != nullptr && "PEFT model ID cannot be nullptr"); + assert(destination_folder != nullptr && + "Cannot convert nullptr char * to std::string"); + std::string const destination_folder_str(destination_folder); + DEBUG_PRINT("[RequestManager] save peft weights %p %p %p %d %s", + handle, + model_handle, + peft_model_id, + destination_folder); + handle->save_peft_weights( + model_handle, *peft_model_id, destination_folder_str); +} + // ----------------------------------------------------------------------- // InferenceManager // ----------------------------------------------------------------------- @@ -2846,7 +2867,7 @@ flexflow_peft_model_id_t flexflow_peft_model_id_create_id(size_t id) { } flexflow_peft_model_id_t flexflow_peft_model_id_no_id() { - PEFTModelID *handle = const_cast(&PEFTModelID::NO_ID); + PEFTModelID *handle = const_cast(&PEFTModelID::NO_ID); DEBUG_PRINT("[PEFTModelID] new %p", handle); return FFCObjectWrapper::wrap(handle); } diff --git a/src/ops/lora_linear.cc b/src/ops/lora_linear.cc index 95c60d2531..758879b9c1 100644 --- a/src/ops/lora_linear.cc +++ b/src/ops/lora_linear.cc @@ -274,6 +274,176 @@ void LoraLinear::init_inference( set_opmeta_from_futuremap_inference(ff, fm, output_tensor); } +struct LoraLinearSaveWeightsInfo { + LoraLinear const *lora; + PEFTModelID model_id; + int rank; + std::string destination_folder; +}; + +void LoraLinear::save_peft_weights( + FFModel const &ff, + PEFTModelID const &model_id, + int rank, + std::string const &destination_folder, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + assert(check_output_input_weight_same_parallel_is()); + assert(batch_inputs.size() == 2); + assert(batch_outputs.size() == 1); + // Assert that the output and the second input are mapped to the same + // region/part + assert(batch_outputs[0]->region == batch_inputs[1]->region); + assert(batch_outputs[0]->part == batch_inputs[1]->part); + // assert(check_output_input_weight_same_machine_view()); + // output is considered as an input to allow in-place optimization + ParallelTensor output_tensor = batch_outputs[0]; + parallel_is = output_tensor->parallel_is; + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + MachineView const *view = &output_tensor->machine_view; + size_t machine_view_hash = view->hash(); + set_argumentmap_for_inference(ff, argmap, output_tensor); 
+  LoraLinearSaveWeightsInfo info;
+  info.lora = this;
+  info.model_id = model_id;
+  info.rank = rank;
+  info.destination_folder = destination_folder;
+  IndexLauncher launcher(LORA_LINEAR_SAVE_WEIGHTS_TASK_ID,
+                         parallel_is,
+                         TaskArgument(&info, sizeof(LoraLinearSaveWeightsInfo)),
+                         argmap,
+                         Predicate::TRUE_PRED,
+                         false /*must*/,
+                         0 /*mapper_id*/,
+                         machine_view_hash);
+  FutureMap fm = runtime->execute_index_space(ctx, launcher);
+  fm.wait_all_results();
+}
+
+template <typename DT>
+void save_peft_to_file(DT const *weight_ptr,
+                       size_t size,
+                       std::string filepath) {
+  std::ofstream out(filepath, std::ios::binary);
+  // Check if the file was opened successfully
+  if (!out || !out.is_open() || !out.good()) {
+    printf("Could not open file: %s\n", filepath.c_str());
+  }
+  assert(out && out.is_open() && out.good() &&
+         "can't write to lora weight file path");
+  std::vector<DT> host_array(size);
+  copy_tensor_dev_to_host(weight_ptr, host_array.data(), size);
+
+  size_t target_data_size = sizeof(DT) * size;
+  out.write((char *)host_array.data(), target_data_size);
+
+  size_t out_written_size = out.tellp();
+  if (out_written_size != target_data_size) {
+    printf("save weight data error: %lu, %lu, %lu\n",
+           out_written_size,
+           target_data_size,
+           sizeof(DT));
+    assert(false);
+  }
+  out.close();
+}
+
+void LoraLinear::save_peft_weights_task(
+    Task const *task,
+    std::vector<PhysicalRegion> const &regions,
+    Context ctx,
+    Runtime *runtime) {
+  LoraLinearSaveWeightsInfo const *info =
+      static_cast<LoraLinearSaveWeightsInfo const *>(task->args);
+  LoraLinearMeta *m = *((LoraLinearMeta **)task->local_args);
+  LoraLinear const *lora = info->lora;
+
+  // get shard id
+  int shard_id = task->index_point.point_data[0];
+
+  // get dimensions and sizes
+  int rank = info->rank;
+  int num_dims = lora->inputs[0]->num_dims;
+  int in_dim = lora->inputs[0]->dims[0].size / lora->inputs[0]->dims[0].degree;
+  int out_dim = lora->inputs[1]->dims[0].size / lora->inputs[1]->dims[0].degree;
+  int w0_num_elements = rank * in_dim;
+  int w1_num_elements = rank * out_dim;
+
+  // get data type
+  DataType dt = m->input_type[0];
+  assert(dt == m->input_type[1]);
+  assert(dt == m->output_type[0]);
+  assert(dt == lora->inputs[0]->data_type);
+  assert(dt == lora->inputs[1]->data_type);
+  assert(dt == lora->outputs[0]->data_type);
+
+  // get output filepaths
+  assert(info->destination_folder.length() > 0 &&
+         "Destination folder is not set");
+  struct stat st = {0};
+  assert(stat(info->destination_folder.c_str(), &st) == 0 &&
+         (st.st_mode & S_IFDIR) && "Destination folder does not exist");
+  assert(lora->name != nullptr &&
+         "Layer name is not set, cannot determine weights location");
+  std::string lora_layername = std::string(lora->name);
+  std::string searchString = "lora";
+  size_t found = lora_layername.find(searchString);
+  if (found == std::string::npos) {
+    std::cout << "LoraLinear layer name not in the right format (does not "
+                 "contain word 'lora')"
+              << std::endl;
+    assert(false);
+  }
+  std::string lora_layername_substr =
+      lora_layername.substr(0, found + searchString.length());
+  std::string w0_filepath =
+      join_path({info->destination_folder,
+                 lora_layername_substr + "_A.weight" + ".shard_" +
+                     std::to_string(shard_id)});
+  std::string w1_filepath = join_path(
+      {info->destination_folder, lora_layername_substr + "_B.weight"});
+
+  // check handle to peft weights
+  assert(m->model_weights.find(info->model_id) != m->model_weights.end());
+
+  // save weights to file
+  std::cout << "Saving LORA weight "
+            << lora_layername_substr + "_A.weight" + ".shard_" +
+                   std::to_string(shard_id)
+            << ", size: " << w0_num_elements << ", shard: " << shard_id
+            << std::endl;
+  if (dt == DT_FLOAT) {
+    save_peft_to_file((float *)m->model_weights[info->model_id].w0_ptr,
+                      w0_num_elements,
+                      w0_filepath);
+  } else if (dt == DT_HALF) {
+    save_peft_to_file((half *)m->model_weights[info->model_id].w0_ptr,
+                      w0_num_elements,
+                      w0_filepath);
+  } else {
+    assert(false && "Data type not supported");
+  }
+  if (shard_id == 0) {
+    std::cout << "Saving LORA weight " << lora_layername_substr + "_B.weight"
+              << ", size: " << w1_num_elements << ", shard: " << shard_id
+              << std::endl;
+    if (dt == DT_FLOAT) {
+      save_peft_to_file((float *)m->model_weights[info->model_id].w1_ptr,
+                        w1_num_elements,
+                        w1_filepath);
+    } else if (dt == DT_HALF) {
+      save_peft_to_file((half *)m->model_weights[info->model_id].w1_ptr,
+                        w1_num_elements,
+                        w1_filepath);
+    } else {
+      assert(false &&
"Data type not supported"); + } + } +} + template void load_peft_from_file( DT *ptr, size_t size, bool sharded, int shard_id, std::string filepath) { diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc index cc967b0cfe..bc71866372 100644 --- a/src/runtime/inference_manager.cc +++ b/src/runtime/inference_manager.cc @@ -17,13 +17,16 @@ #include "flexflow/graph.h" #include "flexflow/model.h" #include "flexflow/ops/fused.h" +#include "flexflow/ops/lora_linear.h" #include "flexflow/ops/noop.h" #include "flexflow/parallel_ops/parallel_op.h" #include "flexflow/request_manager.h" +#include namespace FlexFlow { using namespace Legion; +namespace fs = std::filesystem; LegionRuntime::Logger::Category log_inf_mgr("InferenceManager"); LegionRuntime::Logger::Category log_offload("Offloading"); @@ -378,6 +381,59 @@ void InferenceManager::init_operators_inference(FFModel *model) { } } +void InferenceManager::save_peft_weights( + FFModel *model, + PEFTModelID const &model_id, + std::string const &destination_folder) { + // check that peft model id exists and get rank + assert(model->peft_configs.find(model_id) != model->peft_configs.end() && + "PEFT model id is invalid"); + // get rank + int rank = model->peft_configs[model_id].rank; + assert(rank > 0 && "Rank must be greater than 0"); + // Delete the folder if it exists, create it + try { + if (fs::exists(destination_folder) && + fs::is_directory(destination_folder)) { + fs::remove_all(destination_folder); + } + } catch (fs::filesystem_error const &e) { + std::cout << "Error deleting folder: " << e.what() << std::endl; + } + try { + // Create the folder + fs::create_directory(destination_folder); + } catch (fs::filesystem_error const &e) { + std::cout << "Error creating folder: " << e.what() << std::endl; + } + for (size_t o = 0; o < model->operators.size(); o++) { + Op *op = model->operators[o]; + if (op->op_type != OP_LORA) { + continue; + } + std::vector inputs(op->numInputs); + std::vector outputs(op->numOutputs); + for (int i = 0; i < op->numInputs; i++) { + assert(op->inputs[i] != nullptr); + assert(op->inputs[i]->parallel_is != IndexSpace::NO_SPACE); + assert(tensor_buffer[op->inputs[i]].size() > 0); + inputs[i] = tensor_buffer[op->inputs[i]][0]; + assert(inputs[i]->parallel_is != IndexSpace::NO_SPACE); + } + assert(op->numOutputs > 0); + for (int i = 0; i < op->numOutputs; i++) { + assert(op->outputs[i] != nullptr); + assert(op->outputs[i]->parallel_is != IndexSpace::NO_SPACE); + assert(tensor_buffer[op->outputs[i]].size() > 0); + outputs[i] = tensor_buffer[op->outputs[i]][0]; + assert(outputs[i]->parallel_is != IndexSpace::NO_SPACE); + } + LoraLinear *lora = static_cast(model->operators[o]); + lora->save_peft_weights( + *model, model_id, rank, destination_folder, inputs, outputs); + } +} + FutureMap InferenceManager::inference(FFModel *model, int index, BatchConfig const &bc) { diff --git a/src/runtime/model.cc b/src/runtime/model.cc index b28d3d7701..258f3c3467 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -6734,6 +6734,22 @@ void register_flexflow_internal_tasks(Runtime *runtime, runtime->register_task_variant(registrar); } } + { + TaskVariantRegistrar registrar(LORA_LINEAR_SAVE_WEIGHTS_TASK_ID, + "LoraLinear Save PEFT Weights"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "LoraLinear Save PEFT Weights Task"); + } else { + if (enable_control_replication) { + 
registrar.global_registration = false; + } + runtime->register_task_variant( + registrar); + } + } // NoOp { diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index e3c6e7c6f3..37ba0dd3b5 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -2783,6 +2783,14 @@ bool is_peft_operator_type(OperatorType type) { } } +void RequestManager::save_peft_weights(FFModel *model, + PEFTModelID const &model_id, + std::string const &destination_folder) { + // Save the weights of the model + InferenceManager *im = InferenceManager::get_inference_manager(); + im->save_peft_weights(model, model_id, destination_folder); +} + /*static*/ void RequestManager::serve_incr_decoding(FFModel *llm) { From 9eb58c3cd74b73c0166034908861753034fd3d4d Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 12 Jun 2024 16:54:57 +0000 Subject: [PATCH 54/55] fix print --- src/c/flexflow_c.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/c/flexflow_c.cc b/src/c/flexflow_c.cc index be9c7cc5b3..d43eee7a1a 100644 --- a/src/c/flexflow_c.cc +++ b/src/c/flexflow_c.cc @@ -2734,7 +2734,7 @@ void flexflow_request_manager_save_peft_weights( assert(destination_folder != nullptr && "Cannot convert nullptr char * to std::string"); std::string const destination_folder_str(destination_folder); - DEBUG_PRINT("[RequestManager] save peft weights %p %p %p %d %s", + DEBUG_PRINT("[RequestManager] save peft weights %p %p %p %s", handle, model_handle, peft_model_id, From f00af8beb6a64b6ded0cc041ef7a83a30fdbf625 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 12 Jun 2024 17:34:07 +0000 Subject: [PATCH 55/55] mv fix --- inference/peft/peft.cc | 5 +++-- src/ops/lora_linear.cc | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/inference/peft/peft.cc b/inference/peft/peft.cc index 26643c5d5f..2963879550 100644 --- a/inference/peft/peft.cc +++ b/inference/peft/peft.cc @@ -352,8 +352,9 @@ void FlexFlow::top_level_task(Task const *task, future.get_void_result(); } - // rm->save_peft_weights(&model, *peft_model_id, - // std::string("/root/.cache/flexflow/finetuned_weights")); + rm->save_peft_weights(&model, + *peft_model_id, + std::string("/root/.cache/flexflow/finetuned_weights")); if (peft_model_id != nullptr) { free(peft_model_id); diff --git a/src/ops/lora_linear.cc b/src/ops/lora_linear.cc index 758879b9c1..0e9ab76bd4 100644 --- a/src/ops/lora_linear.cc +++ b/src/ops/lora_linear.cc @@ -303,7 +303,7 @@ void LoraLinear::save_peft_weights( ArgumentMap argmap; Context ctx = ff.config.lg_ctx; Runtime *runtime = ff.config.lg_hlr; - MachineView const *view = &output_tensor->machine_view; + MachineView const *view = mv ? mv : &output_tensor->machine_view; size_t machine_view_hash = view->hash(); set_argumentmap_for_inference(ff, argmap, output_tensor); LoraLinearSaveWeightsInfo info;
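
A minimal usage sketch of the new Python binding added in this series (RequestManager.save_peft_weights in flexflow_cffi.py), mirroring the C++ call introduced in inference/peft/peft.cc. This is an editorial illustration, not part of the patches: the LLM/PEFT setup is elided, and the variable names, init arguments, and output folder marked below are assumptions rather than confirmed API.

import flexflow.serve as ff
from flexflow.core import RequestManager

# Standard flexflow.serve initialization (values are placeholders).
ff.init(num_gpus=1, memory_per_gpu=14000, zero_copy_memory_per_node=10000)

# ... create the base LLM, register the LoRA adapter, compile, start the server,
# and run finetuning as usual; `ffmodel` and `peft_model_id` below stand for the
# FFModel handle and the PEFTModelID produced by that flow (assumed names) ...

rm = RequestManager()  # singleton wrapper defined in flexflow.core.flexflow_cffi
rm.save_peft_weights(
    ffmodel,                                    # FFModel whose LoRA layers hold the finetuned weights
    peft_model_id,                              # PEFTModelID returned when the adapter was registered
    "/root/.cache/flexflow/finetuned_weights",  # destination folder, same path used in patch 55
)
# The launched task writes one <layer>_lora_A.weight.shard_<i> file per tensor-parallel
# shard and a single <layer>_lora_B.weight file (from shard 0) into the destination folder.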