Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

HuggingFace Upload Feature #1300

Closed
wants to merge 61 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
61 commits
Select commit. Hold shift + click to select a range.
b205d4d
upload hf model
april-yyt Feb 13, 2024
88f9311
upload peft model
april-yyt Feb 13, 2024
5459afa
refactor uploading hf
april-yyt Feb 16, 2024
4b760ac
refactor uploading peft
april-yyt Feb 16, 2024
cdf24eb
modify upload logic and add reconvert functions for opt models
april-yyt Feb 19, 2024
6101bbf
fix opt weight name converting issues
april-yyt Feb 20, 2024
90f5d67
implement revert functions for falcon models
april-yyt Feb 20, 2024
4791907
upload method for peft class and falcon fixes
april-yyt Feb 21, 2024
48ef455
reconvert functions for llama models
april-yyt Feb 23, 2024
72e1556
finish weight convert for falcon models
april-yyt Feb 24, 2024
fffe25a
simplify upload script
april-yyt Feb 26, 2024
69c53c4
fix falcon typo
april-yyt Feb 26, 2024
e51004f
mpt models, minor errs to be fixed
april-yyt Feb 26, 2024
f32000a
starcoder models, minor errs to be fixed
april-yyt Feb 26, 2024
b6dd20b
fixed issues with mpt and starcoder models
april-yyt Feb 27, 2024
2b5c79b
modify hf uploading script
april-yyt Feb 27, 2024
403516a
modify hf uploading script
april-yyt Feb 27, 2024
af1d535
modify hf uploading main
april-yyt Feb 27, 2024
85e15d4
Merge branch 'peft' into hf-upload
april-yyt Feb 27, 2024
bea5afb
add assertion for base model
april-yyt Mar 1, 2024
0901320
upload hf model
april-yyt Feb 13, 2024
0349838
upload peft model
april-yyt Feb 13, 2024
d7a23bb
refactor uploading hf
april-yyt Feb 16, 2024
08cf15e
refactor uploading peft
april-yyt Feb 16, 2024
6b476a6
modify upload logic and add reconvert functions for opt models
april-yyt Feb 19, 2024
0a1029a
fix opt weight name converting issues
april-yyt Feb 20, 2024
5df977d
implement revert functions for falcon models
april-yyt Feb 20, 2024
dbbf1cd
upload method for peft class and falcon fixes
april-yyt Feb 21, 2024
1157e1e
reconvert functions for llama models
april-yyt Feb 23, 2024
e4ba212
finish weight convert for falcon models
april-yyt Feb 24, 2024
edaaeca
simplify upload script
april-yyt Feb 26, 2024
4aea5e8
fix falcon typo
april-yyt Feb 26, 2024
a67d824
mpt models, minor errs to be fixed
april-yyt Feb 26, 2024
6382448
starcoder models, minor errs to be fixed
april-yyt Feb 26, 2024
614de32
fixed issues with mpt and starcoder models
april-yyt Feb 27, 2024
551e119
modify hf uploading script
april-yyt Feb 27, 2024
2da5aa1
modify hf uploading script
april-yyt Feb 27, 2024
ef471ef
modify hf uploading main
april-yyt Feb 27, 2024
edb2238
add assertion for base model
april-yyt Mar 1, 2024
0074b8f
resolve conflicts after rebase
Mar 30, 2024
44f43f5
rebase on peft
april-yyt Feb 19, 2024
f051efa
some fixes
april-yyt Apr 2, 2024
c7e2429
fix issues for opt model conversion
april-yyt Apr 3, 2024
e3be6b2
fix issues for llama models
april-yyt Apr 4, 2024
ee41f3a
models/starcoder.py
april-yyt Apr 4, 2024
8efb92b
fix issues for mpt models
april-yyt Apr 5, 2024
41e0bee
some fixes
april-yyt Apr 5, 2024
3354630
some fixes for falcon, qkv weights issues remains
april-yyt Apr 10, 2024
89b6e56
peft-upload-example
april-yyt Apr 10, 2024
a2ab5ba
remove redundant code & metrics file
april-yyt Apr 10, 2024
2792e25
add back utils script
april-yyt May 6, 2024
11f4c2f
Merge branch 'peft' into hf-upload
goliaro May 8, 2024
c2ad1c5
Merge branch 'peft' into hf-upload
goliaro May 29, 2024
a1b5db8
cleanup
goliaro May 29, 2024
8711734
Merge branch 'peft' into hf-upload
goliaro May 29, 2024
613eb6d
remove submodule
goliaro May 29, 2024
f73d556
add test
goliaro Jun 5, 2024
a899501
add code to save peft weights to file
goliaro Jun 12, 2024
9eb58c3
fix print
goliaro Jun 12, 2024
f00af8b
mv fix
goliaro Jun 12, 2024
64d1e2e
Merge branch 'peft' into hf-upload
goliaro Jul 16, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions include/flexflow/flexflow_c.h
Original file line number Diff line number Diff line change
Expand Up @@ -1014,6 +1014,12 @@ void flexflow_request_manager_start_background_server(
void flexflow_request_manager_terminate_background_server(
flexflow_request_manager_t handle_);

void flexflow_request_manager_save_peft_weights(
flexflow_request_manager_t handle_,
flexflow_model_t model_handle_,
flexflow_peft_model_id_t peft_model_id_,
char const *destination_folder);

// -----------------------------------------------------------------------
// InferenceManager
// -----------------------------------------------------------------------
Expand Down
2 changes: 1 addition & 1 deletion include/flexflow/model.h
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ enum TaskIDs {
LINEAR_BWD2_TASK_ID,
LINEAR_UPD_TASK_ID,
LORA_LINEAR_INIT_TASK_ID,
LORA_LINEAR_REG_TASK_ID,
LORA_LINEAR_SAVE_WEIGHTS_TASK_ID,
LORA_LINEAR_INF_TASK_ID,
LORA_LINEAR_PEFT_BWD_TASK_ID,
FLAT_INIT_TASK_ID,
Expand Down
12 changes: 12 additions & 0 deletions include/flexflow/ops/lora_linear.h
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,13 @@ class LoraLinear : public Op {
MachineView const *mv = nullptr) override;
void forward(FFModel const &) override;
void backward(FFModel const &) override;
void save_peft_weights(FFModel const &ff,
PEFTModelID const &model_id,
int rank,
std::string const &destination_folder,
std::vector<ParallelTensor> const &batch_inputs,
std::vector<ParallelTensor> const &batch_outputs,
MachineView const *mv = nullptr);
Legion::FutureMap inference(FFModel const &,
BatchConfigFuture const &,
std::vector<ParallelTensor> const &,
Expand Down Expand Up @@ -69,6 +76,11 @@ class LoraLinear : public Op {
std::vector<Legion::PhysicalRegion> const &regions,
Legion::Context ctx,
Legion::Runtime *runtime);
static void
save_peft_weights_task(Legion::Task const *task,
std::vector<Legion::PhysicalRegion> const &regions,
Legion::Context ctx,
Legion::Runtime *runtime);
static void forward_task(Legion::Task const *task,
std::vector<Legion::PhysicalRegion> const &regions,
Legion::Context ctx,
Expand Down
7 changes: 7 additions & 0 deletions include/flexflow/request_manager.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,9 @@ class InferenceManager {
static InferenceManager *get_inference_manager();
void compile_model_and_allocate_buffer(FFModel *model);
void init_operators_inference(FFModel *model);
void save_peft_weights(FFModel *model,
PEFTModelID const &model_id,
std::string const &destination_folder);
Legion::FutureMap inference(FFModel *model, int index, BatchConfig const &bc);
Legion::FutureMap
inference(FFModel *model, int index, BatchConfigFuture const &bc);
Expand Down Expand Up @@ -161,6 +164,10 @@ class RequestManager {

FFModel *get_ssm_model(int model_id);

void save_peft_weights(FFModel *model,
PEFTModelID const &model_id,
std::string const &destination_folder);

void serve_incr_decoding(FFModel *model);
void serve_spec_infer(FFModel *model);
GenerationResult get_generation_result(RequestGuid const &guid);
Expand Down
4 changes: 4 additions & 0 deletions inference/peft/peft.cc
Original file line number Diff line number Diff line change
Expand Up @@ -372,6 +372,10 @@ void FlexFlow::top_level_task(Task const *task,
future.get_void_result();
}

rm->save_peft_weights(&model,
*peft_model_id,
std::string("/root/.cache/flexflow/finetuned_weights"));

if (peft_model_id != nullptr) {
free(peft_model_id);
}
Expand Down
37 changes: 31 additions & 6 deletions inference/python/ff_peft.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,22 @@ def get_configs():
type=str,
default="",
)
args = parser.parse_args()
parser.add_argument(
"--publish-peft-with-id",
help="The Hugging Face model ID to upload the trained model with",
type=str,
default=""
)

args = parser.parse_args()
publish_peft_with_id = args.publish_peft_with_id
if len(publish_peft_with_id) == 0:
print(
"Please pass a --publish-peft-with-id if you want to upload the trained model"
)
else:
print(f"The trained model will be uploaded with id: {publish_peft_with_id}")

# Load configs from JSON file (if specified)
if len(args.config_file) > 0:
if not os.path.isfile(args.config_file):
Expand Down Expand Up @@ -67,18 +81,19 @@ def get_configs():
"inference_peft_model_id": "goliaro/llama-160m-lora",
"finetuning_peft_model_id": "goliaro/llama-160m-lora",
# optional parameters
"cache_path": "",
"cache_path": "~/.cache/flexflow",
"refresh_cache": False,
"full_precision": True,
"prompt": "",
"finetuning_dataset": os.path.join(
os.path.dirname(os.path.abspath(__file__)),
"../prompt/peft_dataset.json",
),
"output_file": "",
"output_file": ""
}
# Merge dictionaries
ff_init_configs.update(model_configs)
ff_init_configs["publish_peft_with_id"] = publish_peft_with_id
return ff_init_configs


Expand All @@ -98,7 +113,7 @@ def main():
data_type=ff_data_type,
cache_path=configs.cache_path,
refresh_cache=configs.refresh_cache,
output_file=configs.output_file,
output_file=configs.output_file
)
# Add inference and/or finetuning lora
lora_inference_config = None
Expand Down Expand Up @@ -146,6 +161,8 @@ def main():
)

llm.start_server()

print(f"LLM model class is: {llm.model_class}")

requests = []
# Serving
Expand Down Expand Up @@ -173,9 +190,17 @@ def main():
requests.append(finetuning_request)

llm.generate(requests)

llm.stop_server()


# upload the model back to huggingface after finetuning
# the model format would be converted from flexflow format back to huggingface format
if len(configs.publish_peft_with_id) > 0:
print(
f"Done training! Uploading the model to HF hub with id: {configs.publish_peft_with_id}..."
)
llm.upload_peft_model(configs.publish_peft_with_id, private=True)


if __name__ == "__main__":
print("flexflow PEFT example")
Expand Down
Loading
Loading