add max new tokens parameter
goliaro committed Oct 1, 2024
1 parent 2906e57 commit d62d9be
Showing 36 changed files with 176 additions and 129 deletions.
4 changes: 2 additions & 2 deletions include/flexflow/batch_config.h
@@ -87,7 +87,7 @@ class BatchConfig {
first_token_depth_in_request = 0;
first_token_offset_in_batch = 0;
num_tokens_in_batch = 0;
max_sequence_length = 0;
max_length = 0;
request_guid = 0;
prompt_phase = false;
batch_config_request_id = -1;
@@ -98,7 +98,7 @@ class BatchConfig {
int first_token_depth_in_request;
int first_token_offset_in_batch;
int num_tokens_in_batch;
int max_sequence_length;
int max_length;

// request id in batch config:
int batch_config_request_id = -1;
3 changes: 2 additions & 1 deletion include/flexflow/flexflow_c.h
@@ -627,7 +627,8 @@ void flexflow_model_generate(flexflow_model_t handle_,
enum RequestType *request_types,
char const **input_texts,
char **output_texts,
int *max_seq_lengths,
int *max_lengths,
int *max_new_tokens_,
flexflow_peft_model_id_t *peft_model_ids,
char const **dataset_filepaths,
int *training_steps,
3 changes: 2 additions & 1 deletion include/flexflow/request_manager.h
@@ -67,7 +67,8 @@ struct Request {
};
BatchConfig::RequestGuid guid;
PEFTModelID peft_model_id = PEFTModelID::NO_ID;
int max_sequence_length = 128;
int max_length = -1;
int max_new_tokens = 128;
int initial_len;
int ssm_cache_size = 0;
int llm_cache_size = 0;
2 changes: 1 addition & 1 deletion inference/incr_decoding/incr_decoding.cc
@@ -271,7 +271,7 @@ void FlexFlow::top_level_task(Task const *task,
printf("Prompt[%d]: %s\n", total_num_requests, text.c_str());
Request inference_req;
inference_req.prompt = text;
inference_req.max_sequence_length = 128;
inference_req.max_length = 128;
requests.push_back(inference_req);
total_num_requests++;
}
2 changes: 1 addition & 1 deletion inference/peft/peft.cc
@@ -340,7 +340,7 @@ void FlexFlow::top_level_task(Task const *task,
printf("Inference prompt[%d]: %s\n", total_num_requests, text.c_str());
Request inference_req;
inference_req.prompt = text;
inference_req.max_sequence_length = 128;
inference_req.max_length = 128;
inference_req.peft_model_id =
(peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID;
requests.push_back(inference_req);
6 changes: 3 additions & 3 deletions inference/peft/peft_bwd_benchmark.cc
@@ -308,7 +308,7 @@ void FlexFlow::top_level_task(Task const *task,
for (int i = 0; i < 100; i++) {
Request inference_req;
inference_req.benchmarking_tokens = 128;
inference_req.max_sequence_length = 256;
inference_req.max_length = 256;
inference_req.warmup = true;
inference_req.peft_model_id =
(peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID;
@@ -317,7 +317,7 @@ void FlexFlow::top_level_task(Task const *task,
Request fine_tuning_req;
fine_tuning_req.req_type = RequestType::REQ_FINETUNING;
fine_tuning_req.benchmarking_tokens = 1024;
fine_tuning_req.max_sequence_length = 1024;
fine_tuning_req.max_length = 1024;
fine_tuning_req.warmup = true;
fine_tuning_req.peft_model_id =
(peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID;
@@ -361,7 +361,7 @@ void FlexFlow::top_level_task(Task const *task,
Request fine_tuning_req;
fine_tuning_req.req_type = RequestType::REQ_FINETUNING;
fine_tuning_req.benchmarking_tokens = lengths[i];
fine_tuning_req.max_sequence_length = lengths[i];
fine_tuning_req.max_length = lengths[i];
fine_tuning_req.peft_model_id =
(peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID;
fine_tuning_req.max_training_steps = 1;
2 changes: 1 addition & 1 deletion inference/peft/peft_fwd_benchmark.cc
@@ -333,7 +333,7 @@ void FlexFlow::top_level_task(Task const *task,
// sequence_length);
Request inference_req;
inference_req.benchmarking_tokens = prompt.first;
inference_req.max_sequence_length = prompt.second + prompt.first;
inference_req.max_length = prompt.second + prompt.first;
inference_req.peft_model_id =
(peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID;
requests.push_back(inference_req);
8 changes: 4 additions & 4 deletions inference/peft/req_rate_benchmark.cc
@@ -369,7 +369,7 @@ void FlexFlow::top_level_task(Task const *task,
for (int i = 0; i < 100; i++) {
Request inference_req;
inference_req.benchmarking_tokens = 128;
inference_req.max_sequence_length = 256;
inference_req.max_length = 256;
inference_req.warmup = true;
inference_req.peft_model_id =
(peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID;
@@ -379,7 +379,7 @@ void FlexFlow::top_level_task(Task const *task,
Request fine_tuning_req;
fine_tuning_req.req_type = RequestType::REQ_FINETUNING;
fine_tuning_req.benchmarking_tokens = 1024;
fine_tuning_req.max_sequence_length = 1024;
fine_tuning_req.max_length = 1024;
fine_tuning_req.warmup = true;
fine_tuning_req.peft_model_id =
(peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID;
@@ -443,7 +443,7 @@ void FlexFlow::top_level_task(Task const *task,
Request fine_tuning_req;
fine_tuning_req.req_type = RequestType::REQ_FINETUNING;
fine_tuning_req.benchmarking_tokens = 1024;
fine_tuning_req.max_sequence_length = 1024;
fine_tuning_req.max_length = 1024;
fine_tuning_req.peft_model_id =
(peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID;
fine_tuning_req.max_training_steps = 1000000000;
@@ -473,7 +473,7 @@ void FlexFlow::top_level_task(Task const *task,
// sequence_length);
Request inference_req;
inference_req.benchmarking_tokens = prompt.first;
inference_req.max_sequence_length = prompt.second + prompt.first;
inference_req.max_length = prompt.second + prompt.first;
inference_req.peft_model_id =
(peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID;
requests.push_back(inference_req);
24 changes: 9 additions & 15 deletions inference/python/entrypoint/fastapi_incr.py
@@ -60,32 +60,28 @@ def get_configs():
# Define sample configs
ff_init_configs = {
# required parameters
"num_gpus": 4,
"memory_per_gpu": 20000,
"num_gpus": 2,
"memory_per_gpu": 14000,
"zero_copy_memory_per_node": 40000,
# optional parameters
"num_cpus": 4,
"legion_utility_processors": 4,
"data_parallelism_degree": 1,
"tensor_parallelism_degree": 4,
"pipeline_parallelism_degree": 1,
"tensor_parallelism_degree": 1,
"pipeline_parallelism_degree": 2,
"offload": False,
"offload_reserve_space_size": 8 * 1024, # 8GB
"offload_reserve_space_size": 1024**2,
"use_4bit_quantization": False,
"use_8bit_quantization": False,
"enable_peft": False,
"peft_activation_reserve_space_size": 1024, # 1GB
"peft_weight_reserve_space_size": 1024, # 1GB
"profiling": False,
"benchmarking": False,
"inference_debugging": False,
"fusion": True,
}
llm_configs = {
# required parameters
"llm_model": "meta-llama/Meta-Llama-3.1-8B",
"llm_model": "tiiuae/falcon-7b",
# optional parameters
"cache_path": os.environ.get("FF_CACHE_PATH", ""),
"cache_path": "",
"refresh_cache": False,
"full_precision": False,
"prompt": "",
@@ -106,9 +102,7 @@ async def startup_event():
configs = SimpleNamespace(**configs_dict)
ff.init(configs_dict)

ff_data_type = (
ff.DataType.DT_FLOAT if configs.full_precision else ff.DataType.DT_HALF
)
ff_data_type = ff.DataType.DT_FLOAT if configs.full_precision else ff.DataType.DT_HALF
llm = ff.LLM(
configs.llm_model,
data_type=ff_data_type,
@@ -123,7 +117,7 @@ async def startup_event():
llm.compile(
generation_config,
max_requests_per_batch=1,
max_seq_length=2048,
max_seq_length=256,
max_tokens_per_batch=64,
)
llm.start_server()
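The configuration values above feed the `ff.init(configs_dict)` call shown in `startup_event()`. A minimal sketch of that pattern with the new values follows; the `GenerationConfig` arguments are assumptions and not part of this commit, while the dict keys, data type, and compile keywords come from the diff.

```python
# Sketch only: initialize FlexFlow from a config dict like the one above and
# compile falcon-7b with the reduced budgets introduced in this commit.
import flexflow.serve as ff

configs_dict = {
    "num_gpus": 2,
    "memory_per_gpu": 14000,
    "zero_copy_memory_per_node": 40000,
    "tensor_parallelism_degree": 1,
    "pipeline_parallelism_degree": 2,
}
ff.init(configs_dict)  # same pattern as startup_event() above

llm = ff.LLM("tiiuae/falcon-7b", data_type=ff.DataType.DT_HALF)
generation_config = ff.GenerationConfig(do_sample=False)  # assumed defaults
llm.compile(
    generation_config,
    max_requests_per_batch=1,
    max_seq_length=256,  # new, smaller sequence budget from this file's diff
    max_tokens_per_batch=64,
)
llm.start_server()
```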
2 changes: 1 addition & 1 deletion inference/python/streamlit/fastapi_incr.py
@@ -138,7 +138,7 @@ async def startup_event():
)
llm.compile(
generation_config,
max_requests_per_batch=1,
max_requests_per_batch=16,
max_seq_length=2048,
max_tokens_per_batch=64,
)
2 changes: 1 addition & 1 deletion inference/spec_infer/spec_infer.cc
@@ -421,7 +421,7 @@ void FlexFlow::top_level_task(Task const *task,
// Add inference request
Request inference_req;
inference_req.prompt = text;
inference_req.max_sequence_length = 128;
inference_req.max_length = 128;
requests.push_back(inference_req);
total_num_requests++;
}
59 changes: 32 additions & 27 deletions python/flexflow/core/flexflow_cffi.py
@@ -38,9 +38,10 @@
)
from flexflow.config import *
from .flexflowlib import ffi, flexflow_library
from typing import Union, List
from typing import Union, List, Optional
from dataclasses import dataclass
from peft import LoraConfig
import json
import json, math


def ffc():
@@ -2049,25 +2050,16 @@ def no_id_handle():
# Request
# -----------------------------------------------------------------------


@dataclass
class Request:
"""A class to record the metadata of an inference or finetuning request."""

def __init__(
self,
req_type: RequestType,
prompt: str = None,
max_sequence_length: int = 2048,
peft_model_id: PEFTModelID = None,
dataset_filepath: str = None,
max_training_steps: int = 1,
):
self.req_type = req_type
self.prompt = prompt
self.max_sequence_length = max_sequence_length
self.peft_model_id = peft_model_id
self.dataset_filepath = dataset_filepath
self.max_training_steps = max_training_steps
req_type: RequestType
prompt: Optional[str] = None
max_length: int = -1
max_new_tokens: int = 128
peft_model_id: Optional[PEFTModelID] = None
dataset_filepath: Optional[str] = None
max_training_steps: int = 1


# -----------------------------------------------------------------------
@@ -4665,19 +4657,23 @@ def get_output_tensor(self, ffmodel, data_type):
assert ret_val == True
return np_array

def generate_inf_only(self, prompt_list: List[str], max_sequence_length: int = 2048):
def generate_inf_only(self, prompt_list: List[str], max_length: int = -1, max_new_tokens: int = 128):
if max_length != -1 and max_new_tokens != -1:
warnings.warn(f"Both `max_new_tokens` (={self.max_new_tokens}) and `max_length`(={self.max_length}) seem to have been set. `max_new_tokens` will take precedence.")
assert isinstance(prompt_list, list)
c_input_texts = [get_c_name(prompt) for prompt in prompt_list]
max_num_chars = 5 * (max_sequence_length + 100)
estimated_max_tokens = math.ceil(max_new_tokens + max([len(prompt.split()) for prompt in prompt_list])*1.5) if max_new_tokens != -1 else max_length
max_num_chars = 5 * (estimated_max_tokens + 100)
c_output_texts = [ffi.new("char[]", max_num_chars) for prompt in prompt_list]
c_output_length_and_tokens = [
ffi.new("int[]", max_sequence_length + 100) for prompt in prompt_list
ffi.new("int[]", estimated_max_tokens + 100) for prompt in prompt_list
]
c_request_types = [
enum_to_int(RequestType, RequestType.REQ_INFERENCE)
for prompt in prompt_list
]
max_sequence_lengths = [max_sequence_length for prompt in prompt_list]
max_lengths = [max_length for prompt in prompt_list]
max_new_tokens_ = [max_new_tokens for prompt in prompt_list]
peft_model_ids = [PEFTModelID.no_id_handle() for prompt in prompt_list]
dataset_filepaths = [ffi.NULL for prompt in prompt_list]
training_steps = [0 for prompt in prompt_list]
@@ -4689,7 +4685,8 @@ def generate_inf_only(self, prompt_list: List[str], max_sequence_length: int = 2
c_request_types,
c_input_texts,
c_output_texts,
max_sequence_lengths,
max_lengths,
max_new_tokens_,
peft_model_ids,
dataset_filepaths,
training_steps,
@@ -4726,9 +4723,16 @@ def generate(self, requests_list: List[Request]):
c_request_types = [
enum_to_int(RequestType, request.req_type) for request in requests_list
]
max_sequence_lengths = [
request.max_sequence_length for request in requests_list
max_lengths = [
request.max_length for request in requests_list
]
max_new_tokens_ = [
request.max_new_tokens for request in requests_list
]
for i in range(len(requests_list)):
if max_lengths[i] != -1 and max_new_tokens_[i] != -1:
warnings.warn(f"Both `max_new_tokens` (={max_new_tokens_[i]}) and `max_length`(={max_lengths[i]}) seem to have been set. `max_new_tokens` will take precedence.")

peft_model_ids = [
(
request.peft_model_id
@@ -4752,7 +4756,8 @@ def generate(self, requests_list: List[Request]):
c_request_types,
c_input_texts,
c_output_texts,
max_sequence_lengths,
max_lengths,
max_new_tokens_,
peft_model_ids,
dataset_filepaths,
training_steps,
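To illustrate the reworked `Request` dataclass and the `max_new_tokens`/`max_length` precedence rule in `generate()` above, here is a hedged usage sketch. It assumes `Request` and `RequestType` are importable from `flexflow.serve` and that `llm` is an already compiled and started `ff.LLM` (see the setup sketch earlier); prompts and token budgets are placeholders.

```python
# Sketch only: exercise the new max_length / max_new_tokens fields of Request.
# Assumes `llm` is an ff.LLM that has already been compiled and started.
import flexflow.serve as ff

reqs = [
    # Cap only the generated continuation; max_length stays at -1 (no absolute cap).
    ff.Request(
        req_type=ff.RequestType.REQ_INFERENCE,
        prompt="Summarize the FlexFlow serving stack in two sentences.",
        max_new_tokens=64,
    ),
    # Cap prompt + generation instead; disable max_new_tokens so the
    # "both set" warning emitted by generate() above is not triggered.
    ff.Request(
        req_type=ff.RequestType.REQ_INFERENCE,
        prompt="List three scheduling strategies for LLM inference.",
        max_length=256,
        max_new_tokens=-1,
    ),
]
results = llm.generate(reqs)  # routed through FFModel.generate() shown above
```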
11 changes: 8 additions & 3 deletions python/flexflow/serve/serve.py
@@ -498,20 +498,25 @@ def compile(
def generate(
self,
requests_or_prompts: Union[str, List[str], Request, List[Request]],
max_length: int = 2048,
max_length: int = -1,
max_new_tokens: int = 128,
):
"""Generate tokens based on the input prompt(s)
:param requests_or_prompts: The generation prompt(s) in the form of a string, a list of strings, a Request, or list of Requests
:type requests_or_prompts: Union[str, List[str], Request, List[Request]]
:param max_length: The maximum length in tokens of the prompt + generated sequence, defaults to -1 (no maximum length)
:type max_length: int, optional
:param max_new_tokens: The maximum number of new tokens (excluding the prompt) to generate, defaults to 128
:type max_new_tokens: int, optional
:return: the generation results
:rtype: GenerationResult
"""
if type(requests_or_prompts) == str:
if len(requests_or_prompts) == 0:
return None
return self.model.ffmodel.generate_inf_only(
[requests_or_prompts], max_length
[requests_or_prompts], max_length, max_new_tokens
)
elif type(requests_or_prompts) == Request:
return self.model.ffmodel.generate(requests_or_prompts)
Expand All @@ -520,7 +525,7 @@ def generate(
return []
if type(requests_or_prompts[0]) == str:
return self.model.ffmodel.generate_inf_only(
requests_or_prompts, max_length
requests_or_prompts, max_length, max_new_tokens
)
else:
print(requests_or_prompts)
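A short end-to-end sketch of the updated high-level `LLM.generate()` signature follows. The setup mirrors the compile calls elsewhere in this commit; the model name, prompts, and `GenerationConfig` arguments are placeholders and assumptions rather than part of the diff.

```python
# Sketch only: the new max_new_tokens keyword on LLM.generate()
# (defaults shown above: max_length=-1, max_new_tokens=128).
import flexflow.serve as ff

ff.init(num_gpus=2, memory_per_gpu=14000, zero_copy_memory_per_node=40000,
        tensor_parallelism_degree=1, pipeline_parallelism_degree=2)
llm = ff.LLM("tiiuae/falcon-7b")
llm.compile(ff.GenerationConfig(do_sample=False),
            max_requests_per_batch=1, max_seq_length=256, max_tokens_per_batch=64)
llm.start_server()

# Cap only the generated continuation at 32 tokens; the absolute cap stays at -1.
out = llm.generate("Explain incremental decoding in one paragraph.", max_new_tokens=32)

# To bound prompt + generation instead, set max_length and disable max_new_tokens;
# setting both triggers the warning above and max_new_tokens takes precedence.
out = llm.generate("Explain speculative inference briefly.", max_length=200, max_new_tokens=-1)

llm.stop_server()
```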