diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py index 0b8a78e94e..42cee2aa80 100644 --- a/python/flexflow/serve/serve.py +++ b/python/flexflow/serve/serve.py @@ -31,11 +31,48 @@ from peft import PeftModel, PeftConfig from huggingface_hub import HfApi, HfFolder, Repository import torch, shutil, hashlib, json, gc, os +from huggingface_hub import HfApi, HfFolder, Repository +import torch, shutil, hashlib, json, gc, os from typing import Union, List import tempfile +class GenerationConfig: + """A class to store the sampling configs.""" + + def __init__( + self, + do_sample: bool = False, + temperature: float = 0.9, + topp: float = 0.8, + topk: int = 1, + ): + """Initialize the sampling configs + + :param do_sample: Whether to perform sampling, or use greedy decoding, defaults to False + :type do_sample: bool, optional + :param temperature: The temperature setting, defaults to 0.9 + :type temperature: float, optional + :param topp: The top probabilities (top-p) setting, defaults to 0.8 + :type topp: float, optional + :param topk: The top-k setting, defaults to 1 + :type topk: int, optional + """ + self.do_sample = do_sample + self.temperature = temperature + self.topp = topp + self.topk = topk + + +class GenerationResult: + """A class to store the output of a generation request.""" + + def __init__(self, text: str = None, tokens: list = None): + self.output_text = text + self.output_tokens = tokens + + class _SupportedModels: def __init__( self, @@ -816,6 +853,47 @@ def download_hf_weights_if_needed(self): else: print(f"Loading '{self.peft_model_id}' model weights from the cache...") + def upload_hf_model(self, new_model_id: str, model_path:str, private: bool = False): + """ + Uploads the PEFT model to the Hugging Face Hub, with reverse conversion of weights. + + :param new_model_id: The new model ID for the Hugging Face Hub. + :param model_path: The path where the FlexFlow weights are stored. + :param private: Whether to upload the model as a private model. + """ + print(f"Preparing model for upload to Hugging Face Hub: {new_model_id}") + + # Initialize a new Hugging Face model instance + hf_model = AutoModelForCausalLM.from_config(self.hf_config) + weights_path = self.weights_path + + # Load FlexFlow weights into the Hugging Face model instance + try: + self.model_class.load_weights_into_hf_model(hf_model, weights_path) + except Exception as e: + print(f"Error loading weights into model: {e}") + return + + # Save the model with converted weights to a temporary directory + temp_dir = tempfile.mkdtemp() + hf_model.save_pretrained(temp_dir) + + # Ensure Hugging Face CLI is logged in + if not HfFolder.get_token(): + print("Hugging Face token not found. Please login using `huggingface-cli login`.") + return + + # Upload the model + api = HfApi() + print(f"Uploading processed model to Hugging Face Hub: {new_model_id}") + api.create_repo(repo_id=new_model_id, private=private, exist_ok=True) + api.upload_folder(folder_path=temp_dir, repo_id=new_model_id) + + # Cleanup temporary directory + shutil.rmtree(temp_dir) + + print("Upload completed successfully.") + def process_and_upload_hf_model(self, model_id: str, private: bool = False): """ Processes the PEFT model and uploads it to the Hugging Face Hub.