add max new tokens parameter
goliaro committed Oct 1, 2024
1 parent 2906e57 commit d62d9be
Showing 36 changed files with 176 additions and 129 deletions.
4 changes: 2 additions & 2 deletions include/flexflow/batch_config.h
@@ -87,7 +87,7 @@ class BatchConfig {
first_token_depth_in_request = 0;
first_token_offset_in_batch = 0;
num_tokens_in_batch = 0;
max_sequence_length = 0;
max_length = 0;
request_guid = 0;
prompt_phase = false;
batch_config_request_id = -1;
@@ -98,7 +98,7 @@ class BatchConfig {
int first_token_depth_in_request;
int first_token_offset_in_batch;
int num_tokens_in_batch;
int max_sequence_length;
int max_length;

// request id in batch config:
int batch_config_request_id = -1;
3 changes: 2 additions & 1 deletion include/flexflow/flexflow_c.h
@@ -627,7 +627,8 @@ void flexflow_model_generate(flexflow_model_t handle_,
enum RequestType *request_types,
char const **input_texts,
char **output_texts,
int *max_seq_lengths,
int *max_lengths,
int *max_new_tokens_,
flexflow_peft_model_id_t *peft_model_ids,
char const **dataset_filepaths,
int *training_steps,
3 changes: 2 additions & 1 deletion include/flexflow/request_manager.h
@@ -67,7 +67,8 @@ struct Request {
};
BatchConfig::RequestGuid guid;
PEFTModelID peft_model_id = PEFTModelID::NO_ID;
int max_sequence_length = 128;
int max_length = -1;
int max_new_tokens = 128;
int initial_len;
int ssm_cache_size = 0;
int llm_cache_size = 0;
2 changes: 1 addition & 1 deletion inference/incr_decoding/incr_decoding.cc
@@ -271,7 +271,7 @@ void FlexFlow::top_level_task(Task const *task,
printf("Prompt[%d]: %s\n", total_num_requests, text.c_str());
Request inference_req;
inference_req.prompt = text;
inference_req.max_sequence_length = 128;
inference_req.max_length = 128;
requests.push_back(inference_req);
total_num_requests++;
}
2 changes: 1 addition & 1 deletion inference/peft/peft.cc
@@ -340,7 +340,7 @@ void FlexFlow::top_level_task(Task const *task,
printf("Inference prompt[%d]: %s\n", total_num_requests, text.c_str());
Request inference_req;
inference_req.prompt = text;
inference_req.max_sequence_length = 128;
inference_req.max_length = 128;
inference_req.peft_model_id =
(peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID;
requests.push_back(inference_req);
6 changes: 3 additions & 3 deletions inference/peft/peft_bwd_benchmark.cc
@@ -308,7 +308,7 @@ void FlexFlow::top_level_task(Task const *task,
for (int i = 0; i < 100; i++) {
Request inference_req;
inference_req.benchmarking_tokens = 128;
inference_req.max_sequence_length = 256;
inference_req.max_length = 256;
inference_req.warmup = true;
inference_req.peft_model_id =
(peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID;
@@ -317,7 +317,7 @@ void FlexFlow::top_level_task(Task const *task,
Request fine_tuning_req;
fine_tuning_req.req_type = RequestType::REQ_FINETUNING;
fine_tuning_req.benchmarking_tokens = 1024;
fine_tuning_req.max_sequence_length = 1024;
fine_tuning_req.max_length = 1024;
fine_tuning_req.warmup = true;
fine_tuning_req.peft_model_id =
(peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID;
@@ -361,7 +361,7 @@ void FlexFlow::top_level_task(Task const *task,
Request fine_tuning_req;
fine_tuning_req.req_type = RequestType::REQ_FINETUNING;
fine_tuning_req.benchmarking_tokens = lengths[i];
fine_tuning_req.max_sequence_length = lengths[i];
fine_tuning_req.max_length = lengths[i];
fine_tuning_req.peft_model_id =
(peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID;
fine_tuning_req.max_training_steps = 1;
2 changes: 1 addition & 1 deletion inference/peft/peft_fwd_benchmark.cc
@@ -333,7 +333,7 @@ void FlexFlow::top_level_task(Task const *task,
// sequence_length);
Request inference_req;
inference_req.benchmarking_tokens = prompt.first;
inference_req.max_sequence_length = prompt.second + prompt.first;
inference_req.max_length = prompt.second + prompt.first;
inference_req.peft_model_id =
(peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID;
requests.push_back(inference_req);
8 changes: 4 additions & 4 deletions inference/peft/req_rate_benchmark.cc
@@ -369,7 +369,7 @@ void FlexFlow::top_level_task(Task const *task,
for (int i = 0; i < 100; i++) {
Request inference_req;
inference_req.benchmarking_tokens = 128;
inference_req.max_sequence_length = 256;
inference_req.max_length = 256;
inference_req.warmup = true;
inference_req.peft_model_id =
(peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID;
@@ -379,7 +379,7 @@ void FlexFlow::top_level_task(Task const *task,
Request fine_tuning_req;
fine_tuning_req.req_type = RequestType::REQ_FINETUNING;
fine_tuning_req.benchmarking_tokens = 1024;
fine_tuning_req.max_sequence_length = 1024;
fine_tuning_req.max_length = 1024;
fine_tuning_req.warmup = true;
fine_tuning_req.peft_model_id =
(peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID;
@@ -443,7 +443,7 @@ void FlexFlow::top_level_task(Task const *task,
Request fine_tuning_req;
fine_tuning_req.req_type = RequestType::REQ_FINETUNING;
fine_tuning_req.benchmarking_tokens = 1024;
fine_tuning_req.max_sequence_length = 1024;
fine_tuning_req.max_length = 1024;
fine_tuning_req.peft_model_id =
(peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID;
fine_tuning_req.max_training_steps = 1000000000;
@@ -473,7 +473,7 @@ void FlexFlow::top_level_task(Task const *task,
// sequence_length);
Request inference_req;
inference_req.benchmarking_tokens = prompt.first;
inference_req.max_sequence_length = prompt.second + prompt.first;
inference_req.max_length = prompt.second + prompt.first;
inference_req.peft_model_id =
(peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID;
requests.push_back(inference_req);
24 changes: 9 additions & 15 deletions inference/python/entrypoint/fastapi_incr.py
@@ -60,32 +60,28 @@ def get_configs():
# Define sample configs
ff_init_configs = {
# required parameters
"num_gpus": 4,
"memory_per_gpu": 20000,
"num_gpus": 2,
"memory_per_gpu": 14000,
"zero_copy_memory_per_node": 40000,
# optional parameters
"num_cpus": 4,
"legion_utility_processors": 4,
"data_parallelism_degree": 1,
"tensor_parallelism_degree": 4,
"pipeline_parallelism_degree": 1,
"tensor_parallelism_degree": 1,
"pipeline_parallelism_degree": 2,
"offload": False,
"offload_reserve_space_size": 8 * 1024, # 8GB
"offload_reserve_space_size": 1024**2,
"use_4bit_quantization": False,
"use_8bit_quantization": False,
"enable_peft": False,
"peft_activation_reserve_space_size": 1024, # 1GB
"peft_weight_reserve_space_size": 1024, # 1GB
"profiling": False,
"benchmarking": False,
"inference_debugging": False,
"fusion": True,
}
llm_configs = {
# required parameters
"llm_model": "meta-llama/Meta-Llama-3.1-8B",
"llm_model": "tiiuae/falcon-7b",
# optional parameters
"cache_path": os.environ.get("FF_CACHE_PATH", ""),
"cache_path": "",
"refresh_cache": False,
"full_precision": False,
"prompt": "",
@@ -106,9 +102,7 @@ async def startup_event():
configs = SimpleNamespace(**configs_dict)
ff.init(configs_dict)

ff_data_type = (
ff.DataType.DT_FLOAT if configs.full_precision else ff.DataType.DT_HALF
)
ff_data_type = ff.DataType.DT_FLOAT if configs.full_precision else ff.DataType.DT_HALF
llm = ff.LLM(
configs.llm_model,
data_type=ff_data_type,
@@ -123,7 +117,7 @@ async def startup_event():
llm.compile(
generation_config,
max_requests_per_batch=1,
max_seq_length=2048,
max_seq_length=256,
max_tokens_per_batch=64,
)
llm.start_server()
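The configuration values above feed the `ff.init(configs_dict)` call shown in `startup_event()`. A minimal sketch of that pattern with the new values follows; the `GenerationConfig` arguments are assumptions and not part of this commit, while the dict keys, data type, and compile keywords come from the diff.

```python
# Sketch only: initialize FlexFlow from a config dict like the one above and
# compile falcon-7b with the reduced budgets introduced in this commit.
import flexflow.serve as ff

configs_dict = {
    "num_gpus": 2,
    "memory_per_gpu": 14000,
    "zero_copy_memory_per_node": 40000,
    "tensor_parallelism_degree": 1,
    "pipeline_parallelism_degree": 2,
}
ff.init(configs_dict)  # same pattern as startup_event() above

llm = ff.LLM("tiiuae/falcon-7b", data_type=ff.DataType.DT_HALF)
generation_config = ff.GenerationConfig(do_sample=False)  # assumed defaults
llm.compile(
    generation_config,
    max_requests_per_batch=1,
    max_seq_length=256,  # new, smaller sequence budget from this file's diff
    max_tokens_per_batch=64,
)
llm.start_server()
```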
2 changes: 1 addition & 1 deletion inference/python/streamlit/fastapi_incr.py
@@ -138,7 +138,7 @@ async def startup_event():
)
llm.compile(
generation_config,
max_requests_per_batch=1,
max_requests_per_batch=16,
max_seq_length=2048,
max_tokens_per_batch=64,
)
2 changes: 1 addition & 1 deletion inference/spec_infer/spec_infer.cc
@@ -421,7 +421,7 @@ void FlexFlow::top_level_task(Task const *task,
// Add inference request
Request inference_req;
inference_req.prompt = text;
inference_req.max_sequence_length = 128;
inference_req.max_length = 128;
requests.push_back(inference_req);
total_num_requests++;
}
59 changes: 32 additions & 27 deletions python/flexflow/core/flexflow_cffi.py
@@ -38,9 +38,10 @@
)
from flexflow.config import *
from .flexflowlib import ffi, flexflow_library
from typing import Union, List
from typing import Union, List, Optional
from dataclasses import dataclass
from peft import LoraConfig
import json
import json, math


def ffc():
@@ -2049,25 +2050,16 @@ def no_id_handle():
# Request
# -----------------------------------------------------------------------


@dataclass
class Request:
"""A class to record the metadata of an inference or finetuning request."""

def __init__(
self,
req_type: RequestType,
prompt: str = None,
max_sequence_length: int = 2048,
peft_model_id: PEFTModelID = None,
dataset_filepath: str = None,
max_training_steps: int = 1,
):
self.req_type = req_type
self.prompt = prompt
self.max_sequence_length = max_sequence_length
self.peft_model_id = peft_model_id
self.dataset_filepath = dataset_filepath
self.max_training_steps = max_training_steps
req_type: RequestType
prompt: Optional[str] = None
max_length: int = -1
max_new_tokens: int = 128
peft_model_id: Optional[PEFTModelID] = None
dataset_filepath: Optional[str] = None
max_training_steps: int = 1


# -----------------------------------------------------------------------
@@ -4665,19 +4657,23 @@ def get_output_tensor(self, ffmodel, data_type):
assert ret_val == True
return np_array

def generate_inf_only(self, prompt_list: List[str], max_sequence_length: int = 2048):
def generate_inf_only(self, prompt_list: List[str], max_length: int = -1, max_new_tokens: int = 128):
if max_length != -1 and max_new_tokens != -1:
warnings.warn(f"Both `max_new_tokens` (={self.max_new_tokens}) and `max_length`(={self.max_length}) seem to have been set. `max_new_tokens` will take precedence.")
assert isinstance(prompt_list, list)
c_input_texts = [get_c_name(prompt) for prompt in prompt_list]
max_num_chars = 5 * (max_sequence_length + 100)
estimated_max_tokens = math.ceil(max_new_tokens + max([len(prompt.split()) for prompt in prompt_list])*1.5) if max_new_tokens != -1 else max_length
max_num_chars = 5 * (estimated_max_tokens + 100)
c_output_texts = [ffi.new("char[]", max_num_chars) for prompt in prompt_list]
c_output_length_and_tokens = [
ffi.new("int[]", max_sequence_length + 100) for prompt in prompt_list
ffi.new("int[]", estimated_max_tokens + 100) for prompt in prompt_list
]
c_request_types = [
enum_to_int(RequestType, RequestType.REQ_INFERENCE)
for prompt in prompt_list
]
max_sequence_lengths = [max_sequence_length for prompt in prompt_list]
max_lengths = [max_length for prompt in prompt_list]
max_new_tokens_ = [max_new_tokens for prompt in prompt_list]
peft_model_ids = [PEFTModelID.no_id_handle() for prompt in prompt_list]
dataset_filepaths = [ffi.NULL for prompt in prompt_list]
training_steps = [0 for prompt in prompt_list]
@@ -4689,7 +4685,8 @@ def generate_inf_only(self, prompt_list: List[str], max_sequence_length: int = 2
c_request_types,
c_input_texts,
c_output_texts,
max_sequence_lengths,
max_lengths,
max_new_tokens_,
peft_model_ids,
dataset_filepaths,
training_steps,
@@ -4726,9 +4723,16 @@ def generate(self, requests_list: List[Request]):
c_request_types = [
enum_to_int(RequestType, request.req_type) for request in requests_list
]
max_sequence_lengths = [
request.max_sequence_length for request in requests_list
max_lengths = [
request.max_length for request in requests_list
]
max_new_tokens_ = [
request.max_new_tokens for request in requests_list
]
for i in range(len(requests_list)):
if max_lengths[i] != -1 and max_new_tokens_[i] != -1:
warnings.warn(f"Both `max_new_tokens` (={max_new_tokens_[i]}) and `max_length`(={max_lengths[i]}) seem to have been set. `max_new_tokens` will take precedence.")

peft_model_ids = [
(
request.peft_model_id
@@ -4752,7 +4756,8 @@ def generate(self, requests_list: List[Request]):
c_request_types,
c_input_texts,
c_output_texts,
max_sequence_lengths,
max_lengths,
max_new_tokens_,
peft_model_ids,
dataset_filepaths,
training_steps,
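To illustrate the reworked `Request` dataclass and the `max_new_tokens`/`max_length` precedence rule in `generate()` above, here is a hedged usage sketch. It assumes `Request` and `RequestType` are importable from `flexflow.serve` and that `llm` is an already compiled and started `ff.LLM` (see the setup sketch earlier); prompts and token budgets are placeholders.

```python
# Sketch only: exercise the new max_length / max_new_tokens fields of Request.
# Assumes `llm` is an ff.LLM that has already been compiled and started.
import flexflow.serve as ff

reqs = [
    # Cap only the generated continuation; max_length stays at -1 (no absolute cap).
    ff.Request(
        req_type=ff.RequestType.REQ_INFERENCE,
        prompt="Summarize the FlexFlow serving stack in two sentences.",
        max_new_tokens=64,
    ),
    # Cap prompt + generation instead; disable max_new_tokens so the
    # "both set" warning emitted by generate() above is not triggered.
    ff.Request(
        req_type=ff.RequestType.REQ_INFERENCE,
        prompt="List three scheduling strategies for LLM inference.",
        max_length=256,
        max_new_tokens=-1,
    ),
]
results = llm.generate(reqs)  # routed through FFModel.generate() shown above
```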
11 changes: 8 additions & 3 deletions python/flexflow/serve/serve.py
@@ -498,20 +498,25 @@ def compile(
def generate(
self,
requests_or_prompts: Union[str, List[str], Request, List[Request]],
max_length: int = 2048,
max_length: int = -1,
max_new_tokens: int = 128,
):
"""Generate tokens based on the input prompt(s)
:param requests_or_prompts: The generation prompt(s) in the form of a string, a list of strings, a Request, or list of Requests
:type requests_or_prompts: Union[str, List[str], Request, List[Request]]
:param max_length: The maximum length in tokens of the prompt + generated sequence, defaults to -1 (no maximum length)
:type max_length: int, optional
:param max_new_tokens: The maximum number of new tokens (excluding the prompt) to generate, defaults to 128
:type max_new_tokens: int, optional
:return: the generation results
:rtype: GenerationResult
"""
if type(requests_or_prompts) == str:
if len(requests_or_prompts) == 0:
return None
return self.model.ffmodel.generate_inf_only(
[requests_or_prompts], max_length
[requests_or_prompts], max_length, max_new_tokens
)
elif type(requests_or_prompts) == Request:
return self.model.ffmodel.generate(requests_or_prompts)
Expand All @@ -520,7 +525,7 @@ def generate(
return []
if type(requests_or_prompts[0]) == str:
return self.model.ffmodel.generate_inf_only(
requests_or_prompts, max_length
requests_or_prompts, max_length, max_new_tokens
)
else:
print(requests_or_prompts)
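A short end-to-end sketch of the updated high-level `LLM.generate()` signature follows. The setup mirrors the compile calls elsewhere in this commit; the model name, prompts, and `GenerationConfig` arguments are placeholders and assumptions rather than part of the diff.

```python
# Sketch only: the new max_new_tokens keyword on LLM.generate()
# (defaults shown above: max_length=-1, max_new_tokens=128).
import flexflow.serve as ff

ff.init(num_gpus=2, memory_per_gpu=14000, zero_copy_memory_per_node=40000,
        tensor_parallelism_degree=1, pipeline_parallelism_degree=2)
llm = ff.LLM("tiiuae/falcon-7b")
llm.compile(ff.GenerationConfig(do_sample=False),
            max_requests_per_batch=1, max_seq_length=256, max_tokens_per_batch=64)
llm.start_server()

# Cap only the generated continuation at 32 tokens; the absolute cap stays at -1.
out = llm.generate("Explain incremental decoding in one paragraph.", max_new_tokens=32)

# To bound prompt + generation instead, set max_length and disable max_new_tokens;
# setting both triggers the warning above and max_new_tokens takes precedence.
out = llm.generate("Explain speculative inference briefly.", max_length=200, max_new_tokens=-1)

llm.stop_server()
```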