Commit d58c28b

updates

goliaro committed Jul 28, 2023
1 parent 999d4ac
Showing 7 changed files with 249 additions and 255 deletions.
include/flexflow/ffconst.h (2 changes: 1 addition & 1 deletion)

@@ -180,7 +180,7 @@ enum OperatorType {
   OP_INVALID,
 };

-enum ModelType { UNKNOWN, LLAMA, OPT, FALCON };
+enum ModelType { UNKNOWN = 3001, LLAMA = 3002, OPT = 3003, FALCON = 3004 };

 enum PMParameter {
   PM_OP_TYPE, // AnyOp
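The explicit values matter because ModelType now crosses the C/Python boundary as a raw integer (see the flexflow_c.h and flexflow_cffi.py changes below). The following is a minimal Python-side sketch of how a mirrored enum and the enum_to_int/int_to_enum helpers imported from flexflow.type would line up with these values; the real definitions live in python/flexflow/type.py and may differ in detail.

# Illustrative sketch only; the values must stay in sync with enum ModelType
# in include/flexflow/ffconst.h so they round-trip cleanly across the C FFI.
from enum import Enum

class ModelType(Enum):
    UNKNOWN = 3001
    LLAMA = 3002
    OPT = 3003
    FALCON = 3004

def enum_to_int(enum_cls, enum_item):
    # Convert a Python enum member to the raw integer the C API expects.
    assert isinstance(enum_item, enum_cls)
    return enum_item.value

def int_to_enum(enum_cls, value):
    # Reverse mapping, used when the C API hands an integer back.
    return enum_cls(value)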
include/flexflow/flexflow_c.h (39 changes: 14 additions & 25 deletions)

@@ -814,44 +814,33 @@ void flexflow_beam_search_batch_config_destroy(
 // RequestManager
 // -----------------------------------------------------------------------

-flexflow_request_manager_t flexflow_request_manager_create(void);
+flexflow_request_manager_t flexflow_request_manager_get_request_manager(void);

-void flexflow_request_manager_destroy(flexflow_request_manager_t handle);
+// void flexflow_request_manager_destroy(flexflow_request_manager_t handle_);

-long unsigned int flexflow_request_manager_register_new_request(
-    flexflow_request_manager_t handle,
-    char const *prompt,
-    int max_sequence_length);
+void flexflow_request_manager_register_tokenizer(
+    flexflow_request_manager_t handle_,
+    enum ModelType model_type,
+    char const *tokenizer_filepath);
+
+void flexflow_request_manager_register_output_filepath(
+    flexflow_request_manager_t handle_, char const *output_filepath);

 // -----------------------------------------------------------------------
 // InferenceManager
 // -----------------------------------------------------------------------

 flexflow_inference_manager_t
-    flexflow_inference_manager_create(flexflow_config_t config_handle,
-                                      int max_num_tokens_per_batch);
+    flexflow_inference_manager_get_inference_manager(void);

-void flexflow_inference_manager_destroy(flexflow_inference_manager_t handle);
+// void flexflow_inference_manager_destroy(flexflow_inference_manager_t
+// handle_);

 void flexflow_inference_manager_compile_model_and_allocate_buffer(
-    flexflow_inference_manager_t handle, flexflow_model_t model_handle);
+    flexflow_inference_manager_t handle_, flexflow_model_t model_handle);

 void flexflow_inference_manager_init_operators_inference(
-    flexflow_inference_manager_t handle, flexflow_model_t model_handle);
-
-void flexflow_inference_manager_incr_decoding_loop(
-    flexflow_inference_manager_t handle,
-    flexflow_model_t model_handle,
-    flexflow_request_manager_t rm_handle,
-    int total_num_requests);
-
-void flexflow_inference_manager_spec_inference_loop(
-    flexflow_inference_manager_t handle,
-    flexflow_model_t model_handle,
-    flexflow_request_manager_t rm_handle,
-    int total_num_requests,
-    int num_ssms,
-    int *ssm_model_ids);
+    flexflow_inference_manager_t handle_, flexflow_model_t model_handle);

 #ifdef __cplusplus
 }
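For orientation, here is a rough sketch of how the new C entry points would be reached from Python via cffi. It assumes ffc and ffi are the cffi library and FFI objects that flexflow_cffi.py already obtains from flexflowlib, and the file paths are hypothetical. Note that the incr_decoding_loop and spec_inference_loop entry points are removed by this commit, so the generation loop is presumably driven elsewhere.

# Sketch only; ffc/ffi are assumed to be the cffi objects set up by flexflowlib.
MODEL_TYPE_LLAMA = 3002  # must match ModelType LLAMA in ffconst.h

rm = ffc.flexflow_request_manager_get_request_manager()  # process-wide singleton
ffc.flexflow_request_manager_register_tokenizer(
    rm, MODEL_TYPE_LLAMA, ffi.new("char[]", b"/path/to/tokenizer.model"))  # hypothetical path
ffc.flexflow_request_manager_register_output_filepath(
    rm, ffi.new("char[]", b"/path/to/output.txt"))  # hypothetical path

im = ffc.flexflow_inference_manager_get_inference_manager()  # also a singleton
ffc.flexflow_inference_manager_compile_model_and_allocate_buffer(im, model_handle)  # model_handle assumed
ffc.flexflow_inference_manager_init_operators_inference(im, model_handle)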
inference/incr_decoding/incr_decoding.cc (5 changes: 0 additions & 5 deletions)

@@ -151,11 +151,6 @@ void FlexFlow::top_level_task(Task const *task,
   RequestManager *rm = RequestManager::get_request_manager();
   rm->register_tokenizer(model_type, file_paths.tokenizer_file_path);
   rm->register_output_filepath(file_paths.output_file_path);
-  // InferenceManager im(ffconfig, BatchConfig::MAX_NUM_TOKENS);
-  // RequestManager rm(model_type,
-  //                   file_paths.tokenizer_file_path,
-  //                   /*verbose*/ verbose,
-  //                   file_paths.output_file_path);

   FFModel model(ffconfig, ffconfig.cpu_offload);
   if (model_type == ModelType::LLAMA) {
python/flexflow/core/flexflow_cffi.py (34 changes: 16 additions & 18 deletions)

@@ -22,7 +22,7 @@
 import warnings
 import numpy as np
 from .flexflow_logger import fflogger
-from flexflow.type import ActiMode, RegularizerMode, AggrMode, PoolType, DataType, LossType, CompMode, MetricsType, InferenceMode, OpType, ParameterSyncType, enum_to_int, int_to_enum
+from flexflow.type import ActiMode, RegularizerMode, AggrMode, PoolType, DataType, LossType, CompMode, MetricsType, InferenceMode, ModelType, OpType, ParameterSyncType, enum_to_int, int_to_enum
 _FF_BUILD_DOCS = bool(os.environ.get('READTHEDOCS') or os.environ.get("FF_BUILD_DOCS"))
 if not _FF_BUILD_DOCS:
   from .flexflowlib import ffi, flexflow_library

@@ -2869,34 +2869,32 @@ def __init__(self):
 # -----------------------------------------------------------------------

 class RequestManager(object):
-  __slots__ = ['handle', '_handle']
+  __slots__ = ['handle']
   def __init__(self):
-    self.handle = ffc.flexflow_request_manager_create()
-    self._handle = ffi.gc(self.handle, ffc.flexflow_request_manager_destroy)
+    self.handle = ffc.flexflow_request_manager_get_request_manager()
+    #self._handle = ffi.gc(self.handle, ffc.flexflow_request_manager_destroy)

-  def flexflow_request_manager_register_new_request(self, prompt, max_sequence_length):
-    return ffc.flexflow_request_manager_register_new_request(self.handle, prompt, max_sequence_length)
+  def register_tokenizer(self, model_type, tokenizer_filepath):
+    c_model_type = enum_to_int(ModelType, model_type)
+    c_tokenizer_filepath = get_c_name(tokenizer_filepath)
+    return ffc.flexflow_request_manager_register_tokenizer(self.handle, c_model_type, c_tokenizer_filepath)
+
+  def register_output_filepath(self, output_filepath):
+    c_output_filepath = get_c_name(output_filepath)
+    return ffc.flexflow_request_manager_register_output_filepath(self.handle, c_output_filepath)

 # -----------------------------------------------------------------------
 # InferenceManager
 # -----------------------------------------------------------------------

 class InferenceManager(object):
-  __slots__ = ['handle', '_handle', 'max_num_tokens_per_batch']
-  def __init__(self, ffconfig, max_num_tokens_per_batch):
-    self.max_num_tokens_per_batch = max_num_tokens_per_batch
-    self.handle = ffc.flexflow_inference_manager_create(ffconfig.handle, max_num_tokens_per_batch)
-    self._handle = ffi.gc(self.handle, ffc.flexflow_inference_manager_destroy)
+  __slots__ = ['handle']
+  def __init__(self):
+    self.handle = ffc.flexflow_inference_manager_get_inference_manager()
+    #self._handle = ffi.gc(self.handle, ffc.flexflow_inference_manager_destroy)

   def compile_model_and_allocate_buffer(self, model):
     ffc.flexflow_inference_manager_compile_model_and_allocate_buffer(self.handle, model.handle)

   def init_operators_inference(self, model):
     ffc.flexflow_inference_manager_init_operators_inference(self.handle, model.handle)
-
-  def incr_decoding_loop(self, model, request_manager, total_num_requests):
-    ffc.flexflow_inference_manager_incr_decoding_loop(self.handle, model.handle, request_manager.handle, total_num_requests)
-
-  def spec_inference_loop(self, model, request_manager, total_num_requests, ssm_model_ids):
-    c_ssm_model_ids = ffi.new("int[]", ssm_model_ids)
-    ffc.flexflow_inference_manager_spec_inference_loop(self.handle, model.handle, request_manager.handle, total_num_requests, len(ssm_model_ids), c_ssm_model_ids)
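Putting the two wrappers together, a driver script would now use them roughly as follows. This is a sketch: the import paths are assumed, the file paths are placeholders, and model stands for an already-constructed FlexFlow model object exposing a .handle. Because both classes now fetch process-wide singletons, constructing them repeatedly yields handles to the same underlying C++ objects, which is why the ffi.gc destructors are commented out.

# Sketch under the assumptions above; not a verbatim API reference.
from flexflow.core import RequestManager, InferenceManager  # import path assumed
from flexflow.type import ModelType

rm = RequestManager()  # wraps the C-side singleton
rm.register_tokenizer(ModelType.LLAMA, "/path/to/tokenizer.model")  # placeholder path
rm.register_output_filepath("/path/to/output.txt")  # placeholder path

im = InferenceManager()  # no FFConfig or max_tokens_per_batch arguments anymore
im.compile_model_and_allocate_buffer(model)  # `model` assumed to exist
im.init_operators_inference(model)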
python/flexflow/serve/serve.py (55 changes: 38 additions & 17 deletions)
@@ -17,23 +17,27 @@
 from transformers import AutoConfig
 import sys

+
 class SamplingConfig:
-    def __init__(self, do_sample = False, temperature=0.9, topp=0.8, topk=1):
+    def __init__(self, do_sample=False, temperature=0.9, topp=0.8, topk=1):
         self.do_sample = False
         self.temperature = 0.8
         self.topp = 0.6
         self.topk = 1

+
 class LLM:
     def __init__(self, model_name, data_type="half"):
         self.model_name = model_name
         self.supported_models = {
-            "LlamaForCausalLM": FlexFlowLLAMA,
-            "LLaMAForCausalLM": FlexFlowLLAMA,
-            "OPTForCausalLM": FlexFlowOPT,
-            "RWForCausalLM": FlexFlowFalcon # falcon
+            "LlamaForCausalLM": (ModelType.LLAMA, FlexFlowLLAMA),
+            "LLaMAForCausalLM": (ModelType.LLAMA, FlexFlowLLAMA),
+            "OPTForCausalLM": (ModelType.OPT, FlexFlowOPT),
+            "RWForCausalLM": (ModelType.FALCON, FlexFlowFalcon),
         }
-        self.model_type = self.__get_ff_model_type(model_name)
+        self.model_type, self.model_class = self.__get_ff_model_type(model_name)
+        print(self.model_type, self.model_class)
+        print(type(self.model_type), type(self.model_class))
         self.data_type = data_type
         self.ffconfig = FFConfig()

@@ -44,15 +48,17 @@ def __get_ff_model_type(self, model_name):
         if next(iter(architectures), None) is not None:
             ff_arch = self.supported_models.get(architectures[0])
         if ff_arch is None:
-            print("Huggingface model of type {architectures} is not yet supported by FlexFlow")
+            print(
+                "Huggingface model of type {architectures} is not yet supported by FlexFlow"
+            )
             sys.exit(1)
         return ff_arch

     def compile(
         self,
-        mode = InferenceMode.INC_DECODING_MODE,
-        sampling_config = SamplingConfig(),
-        use_full_precision = False,
+        mode=InferenceMode.INC_DECODING_MODE,
+        sampling_config=SamplingConfig(),
+        use_full_precision=False,
         max_batch_size=1,
         max_seq_length=256,
         max_tokens_per_batch=64,
@@ -67,17 +73,32 @@ def compile(
         self.pipeline_parallel_degree = pipeline_parallel_degree
         self.ssms = ssms
         self.sampling_config = SamplingConfig()
-        assert((mode == InferenceMode.INC_DECODING_MODE or mode == InferenceMode.BEAM_SEARCH_MODE) == (len(ssms) == 0))
+
+        assert (
+            mode == InferenceMode.INC_DECODING_MODE
+            or mode == InferenceMode.BEAM_SEARCH_MODE
+        ) == (len(ssms) == 0)
+
         # Create model
-        self.model = self.model_type(mode, sampling_config, self.ffconfig, max_batch_size, max_seq_length, max_tokens_per_batch, use_full_precision)
+        self.model = self.model_class(
+            mode,
+            sampling_config,
+            self.ffconfig,
+            max_batch_size,
+            max_seq_length,
+            max_tokens_per_batch,
+            use_full_precision,
+        )
+
+        # Create request manager
+        self.rm = RequestManager()
+        self.rm.register_tokenizer(self.model_type, "tokenizer_file_path")
+        self.rm.register_output_filepath("output_file_path")

         # Create inference manager
-        self.im = InferenceManager(self.ffconfig, max_tokens_per_batch)
+        #self.im = InferenceManager(self.ffconfig, max_tokens_per_batch)
+        self.im = InferenceManager()
         self.im.compile_model_and_allocate_buffer(self.model)
         self.im.init_operators_inference(self.model)

         # Create request manager
-        self.rm = RequestManager()
+        #self.rm = RequestManager()

         assert False and "Not implemented yet"

     def generate(self, prompt, sampling=None):
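At the top of the stack, the intended serve-API flow now looks roughly like the sketch below. The model name is a placeholder, the import paths are assumed, and compile() still ends in an assert False placeholder in this commit, so this shows the call shape rather than a working end-to-end example.

# Sketch of the intended usage; parameters mirror the visible compile() signature.
from flexflow.serve.serve import LLM, SamplingConfig  # import path assumed
from flexflow.type import InferenceMode

llm = LLM("decapoda-research/llama-7b-hf")  # placeholder HuggingFace model name
llm.compile(
    mode=InferenceMode.INC_DECODING_MODE,
    sampling_config=SamplingConfig(),
    use_full_precision=False,
    max_batch_size=1,
    max_seq_length=256,
    max_tokens_per_batch=64,
)
# llm.generate("...")  # generate() is not implemented in this commit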