Python interface for inference (part 2) #893

Merged · 35 commits · Aug 2, 2023

Commits
999d4ac  add argmax, add default args to test file (goliaro, Jul 27, 2023)
d58c28b  updates (goliaro, Jul 28, 2023)
3793ef5  comment out print (goliaro, Jul 28, 2023)
65229ff  updates (goliaro, Jul 28, 2023)
1ae93c0  added code to get configs and weights from hf (goliaro, Jul 29, 2023)
f9ce99c  added FileDataLoader to cffi (goliaro, Jul 29, 2023)
732cc5e  remove aggressive reformatting (goliaro, Jul 29, 2023)
f9a3f6b  update (goliaro, Jul 29, 2023)
7ac00b0  fix (goliaro, Jul 29, 2023)
e21de3b  add code to load weights from python (goliaro, Jul 30, 2023)
5c8f2d2  fix half precision weight loading from python (goliaro, Jul 30, 2023)
3b1a795  fixed loading weights (goliaro, Jul 30, 2023)
6537c00  fixed loading weights (goliaro, Jul 30, 2023)
225f049  checkpoint (goliaro, Jul 30, 2023)
3035099  generation from python now works (goliaro, Jul 30, 2023)
8bb5e33  make it easier to set flags needed to run native python (goliaro, Jul 30, 2023)
b6d6bfa  downloading tokenizers from hf (goliaro, Jul 30, 2023)
406b548  add support for opt (goliaro, Jul 31, 2023)
0d628a1  implement falcon (goliaro, Jul 31, 2023)
b0db33d  add support for multiple prompts and prompts from json file (goliaro, Jul 31, 2023)
4a16fb3  implement speculative inference (goliaro, Jul 31, 2023)
446bcdd  finished specinfer implementation (goliaro, Jul 31, 2023)
37c093d  Merge branch 'inference' into python_inference (goliaro, Jul 31, 2023)
8c9b0ea  updated arguments parsing (goliaro, Jul 31, 2023)
a63fa76  remove unnecessary args from compile func (goliaro, Jul 31, 2023)
840e212  . (goliaro, Jul 31, 2023)
9a6e5db  update interface examples (goliaro, Aug 1, 2023)
f3bbefa  fix ssm bug (goliaro, Aug 1, 2023)
30f5c0d  Merge branch 'inference' into python_inference (goliaro, Aug 1, 2023)
a07ff22  fix fusion-related bugs (goliaro, Aug 1, 2023)
59a3669  standardize argument parsing in python examples (goliaro, Aug 2, 2023)
135cbea  docstrings (goliaro, Aug 2, 2023)
452b117  update (goliaro, Aug 2, 2023)
2c8eb85  moved c++ inference tests (goliaro, Aug 2, 2023)
9238f11  fix (goliaro, Aug 2, 2023)
3 changes: 3 additions & 0 deletions CMakeLists.txt
@@ -321,11 +321,13 @@ list(APPEND FLEXFLOW_INCLUDE_DIRS
file(GLOB_RECURSE FLEXFLOW_HDR
LIST_DIRECTORIES False
${FLEXFLOW_ROOT}/include/*.h)
list(APPEND FLEXFLOW_HDR ${FLEXFLOW_ROOT}/inference/file_loader.h)

file(GLOB_RECURSE FLEXFLOW_SRC
LIST_DIRECTORIES False
${FLEXFLOW_ROOT}/src/*.cc)
list(REMOVE_ITEM FLEXFLOW_SRC "${FLEXFLOW_ROOT}/src/runtime/cpp_driver.cc")
list(APPEND FLEXFLOW_SRC ${FLEXFLOW_ROOT}/inference/file_loader.cc)

set(FLEXFLOW_CPP_DRV_SRC
${FLEXFLOW_ROOT}/src/runtime/cpp_driver.cc)
@@ -460,6 +462,7 @@ if (FF_USE_PYTHON)
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/deps/legion/bindings/python
)
# create flexflow_python interpreter. When building from pip, we install the FF_HOME/python/flexflow_python script instead.
# create set_python_envs.sh script to set up the environment variables to run flexflow_python
if (NOT FF_BUILD_FROM_PYPI)
add_custom_command(TARGET flexflow
PRE_BUILD
5 changes: 3 additions & 2 deletions FlexFlow.mk
@@ -59,7 +59,8 @@ GEN_SRC += $(shell find $(FF_HOME)/src/loss_functions/ -name '*.cc')\
$(shell find $(FF_HOME)/src/runtime/ -name '*.cc')\
$(shell find $(FF_HOME)/src/utils/dot/ -name '*.cc')\
$(shell find $(FF_HOME)/src/dataloader/ -name '*.cc')\
$(shell find $(FF_HOME)/src/c/ -name '*.cc')
$(shell find $(FF_HOME)/src/c/ -name '*.cc')\
$(shell find $(FF_HOME)/inference/ -name 'file_loader.cc')
GEN_SRC := $(filter-out $(FF_HOME)/src/runtime/cpp_driver.cc, $(GEN_SRC))

FF_CUDA_SRC += $(shell find $(FF_HOME)/src/loss_functions/ -name '*.cu')\
@@ -94,7 +95,7 @@ ifneq ($(strip $(FF_USE_PYTHON)), 1)
endif


INC_FLAGS += -I${FF_HOME}/include -I${FF_HOME}/deps/optional/include -I${FF_HOME}/deps/variant/include -I${FF_HOME}/deps/json/include -I${FF_HOME}/deps/tokenizers-cpp/include -I${FF_HOME}/deps/tokenizers-cpp/sentencepiece/src
INC_FLAGS += -I${FF_HOME}/include -I${FF_HOME}/inference -I${FF_HOME}/deps/optional/include -I${FF_HOME}/deps/variant/include -I${FF_HOME}/deps/json/include -I${FF_HOME}/deps/tokenizers-cpp/include -I${FF_HOME}/deps/tokenizers-cpp/sentencepiece/src
CC_FLAGS += -DMAX_TENSOR_DIM=$(MAX_DIM) -DLEGION_MAX_RETURN_SIZE=32768
NVCC_FLAGS += -DMAX_TENSOR_DIM=$(MAX_DIM) -DLEGION_MAX_RETURN_SIZE=32768
HIPCC_FLAGS += -DMAX_TENSOR_DIM=$(MAX_DIM) -DLEGION_MAX_RETURN_SIZE=32768
7 changes: 4 additions & 3 deletions INSTALL.md
@@ -85,10 +85,11 @@ export FF_HOME=/path/to/FlexFlow
### Run FlexFlow Python examples
The Python examples are in the [examples/python](https://github.com/flexflow/FlexFlow/tree/master/examples/python) folder. The native, Keras integration, and PyTorch integration examples are in the `native`, `keras`, and `pytorch` subfolders, respectively.

To run the Python examples, you have two options: you can use the `flexflow_python` interpreter, available in the `build` folder, or you can use the native Python interpreter. If you choose to use the native Python interpreter, you should either install FlexFlow, or, if you prefer to build without installing, export the following flags:
To run the Python examples, you have two options: you can use the `flexflow_python` interpreter, available in the `build` folder, or you can use the native Python interpreter. If you choose the native Python interpreter, either install FlexFlow or, if you prefer to build without installing, export the required environment variables by running the following command (edit the path if your build folder is not named `build`):

* `export PYTHONPATH="${FF_HOME}/python:${FF_HOME}/build/deps/legion/bindings/python:${PYTHONPATH}"`
* `export LD_LIBRARY_PATH="${FF_HOME}/build:${FF_HOME}/build/deps/legion/lib:${LD_LIBRARY_PATH}"`
```
source ./build/set_python_envs.sh
```

**We recommend that you run the** `mnist_mlp` **test under** `native` **using the following cmd to check if FlexFlow has been installed correctly:**

2 changes: 1 addition & 1 deletion include/flexflow/ffconst.h
@@ -180,7 +180,7 @@ enum OperatorType {
OP_INVALID,
};

enum ModelType { UNKNOWN, LLAMA, OPT, FALCON };
enum ModelType { UNKNOWN = 3001, LLAMA = 3002, OPT = 3003, FALCON = 3004 };

enum PMParameter {
PM_OP_TYPE, // AnyOp
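Assigning explicit values to `ModelType` pins down the integer contract across the C/Python boundary, so the bindings can pass model types as plain ints regardless of declaration order. A tiny illustrative guard, ours rather than part of this PR, that a binding layer could use to catch drift:

```cpp
#include "flexflow/ffconst.h"

// Hypothetical compile-time check mirroring the constants a cffi-based
// binding would hard-code on the Python side.
static_assert(LLAMA == 3002, "bindings assume LLAMA == 3002");
static_assert(OPT == 3003, "bindings assume OPT == 3003");
static_assert(FALCON == 3004, "bindings assume FALCON == 3004");
```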
114 changes: 89 additions & 25 deletions include/flexflow/flexflow_c.h
@@ -53,6 +53,8 @@ FF_NEW_OPAQUE_TYPE(flexflow_tree_verify_batch_config_t);
FF_NEW_OPAQUE_TYPE(flexflow_beam_search_batch_config_t);
FF_NEW_OPAQUE_TYPE(flexflow_inference_manager_t);
FF_NEW_OPAQUE_TYPE(flexflow_request_manager_t);
FF_NEW_OPAQUE_TYPE(flexflow_file_data_loader_t);
FF_NEW_OPAQUE_TYPE(flexflow_generation_result_t);

// -----------------------------------------------------------------------
// FFConfig
@@ -78,6 +80,21 @@ int flexflow_config_get_epochs(flexflow_config_t handle);

bool flexflow_config_get_enable_control_replication(flexflow_config_t handle);

int flexflow_config_get_data_parallelism_degree(flexflow_config_t handle_);

int flexflow_config_get_tensor_parallelism_degree(flexflow_config_t handle_);

int flexflow_config_get_pipeline_parallelism_degree(flexflow_config_t handle_);

void flexflow_config_set_data_parallelism_degree(flexflow_config_t handle_,
int value);

void flexflow_config_set_tensor_parallelism_degree(flexflow_config_t handle_,
int value);

void flexflow_config_set_pipeline_parallelism_degree(flexflow_config_t handle_,
int value);

int flexflow_config_get_python_data_loader_type(flexflow_config_t handle);

// -----------------------------------------------------------------------
@@ -390,8 +407,12 @@ flexflow_tensor_t flexflow_model_add_inc_multihead_attention(
bool bias,
bool add_bias_kv,
bool add_zero_attn,
enum DataType data_type,
flexflow_initializer_t kernel_initializer_,
bool apply_rotary_embedding,
bool scaling_query,
float scaling_factor,
bool qk_prod_scaling,
char const *name);

flexflow_tensor_t flexflow_model_add_spec_inc_multihead_attention(
@@ -405,8 +426,12 @@ flexflow_tensor_t flexflow_model_add_spec_inc_multihead_attention(
bool bias,
bool add_bias_kv,
bool add_zero_attn,
enum DataType data_type,
flexflow_initializer_t kernel_initializer_,
bool apply_rotary_embedding,
bool scaling_query,
float scaling_factor,
bool qk_prod_scaling,
char const *name);

flexflow_tensor_t flexflow_model_add_inc_multihead_self_attention_verify(
@@ -420,8 +445,27 @@ flexflow_tensor_t flexflow_model_add_inc_multihead_self_attention_verify(
bool bias,
bool add_bias_kv,
bool add_zero_attn,
enum DataType data_type,
flexflow_initializer_t kernel_initializer_,
bool apply_rotary_embedding,
bool scaling_query,
float scaling_factor,
bool qk_prod_scaling,
char const *name);

flexflow_tensor_t flexflow_model_add_inc_multiquery_self_attention(
flexflow_model_t handle_,
const flexflow_tensor_t input_,
int embed_dim,
int num_heads,
int kdim,
int vdim,
float dropout,
bool bias,
bool add_bias_kv,
bool add_zero_attn,
enum DataType data_type,
flexflow_initializer_t kernel_initializer_,
char const *name);

flexflow_tensor_t flexflow_model_add_rms_norm(flexflow_model_t handle_,
@@ -447,6 +491,11 @@ flexflow_tensor_t flexflow_model_add_sampling(flexflow_model_t handle_,
float top_p,
char const *name);

flexflow_tensor_t flexflow_model_add_argmax(flexflow_model_t handle_,
const flexflow_tensor_t input_,
bool beam_search,
char const *name);

void flexflow_model_set_sgd_optimizer(flexflow_model_t handle,
flexflow_sgd_optimizer_t optimizer);

@@ -468,6 +517,10 @@ flexflow_perf_metrics_t

void flexflow_model_set_transformer_layer_id(flexflow_model_t handle, int id);

flexflow_generation_result_t flexflow_model_generate(flexflow_model_t handle_,
char const *text,
int max_seq_length);

// -----------------------------------------------------------------------
// Tensor
// -----------------------------------------------------------------------
@@ -809,44 +862,55 @@ void flexflow_beam_search_batch_config_destroy(
// RequestManager
// -----------------------------------------------------------------------

flexflow_request_manager_t flexflow_request_manager_create(void);
flexflow_request_manager_t flexflow_request_manager_get_request_manager(void);

// void flexflow_request_manager_destroy(flexflow_request_manager_t handle_);

void flexflow_request_manager_destroy(flexflow_request_manager_t handle);
void flexflow_request_manager_register_tokenizer(
flexflow_request_manager_t handle_,
enum ModelType model_type,
char const *tokenizer_filepath);

long unsigned int flexflow_request_manager_register_new_request(
flexflow_request_manager_t handle,
char const *prompt,
int max_sequence_length);
void flexflow_request_manager_register_output_filepath(
flexflow_request_manager_t handle_, char const *output_filepath);

int flexflow_request_manager_register_ssm_model(
flexflow_request_manager_t handle_, flexflow_model_t model_handle_);

// -----------------------------------------------------------------------
// InferenceManager
// -----------------------------------------------------------------------

flexflow_inference_manager_t
flexflow_inference_manager_create(flexflow_config_t config_handle,
int max_num_tokens_per_batch);
flexflow_inference_manager_get_inference_manager(void);

void flexflow_inference_manager_destroy(flexflow_inference_manager_t handle);
// void flexflow_inference_manager_destroy(flexflow_inference_manager_t
// handle_);

void flexflow_inference_manager_compile_model_and_allocate_buffer(
flexflow_inference_manager_t handle, flexflow_model_t model_handle);
flexflow_inference_manager_t handle_, flexflow_model_t model_handle);

void flexflow_inference_manager_init_operators_inference(
flexflow_inference_manager_t handle, flexflow_model_t model_handle);

void flexflow_inference_manager_incr_decoding_loop(
flexflow_inference_manager_t handle,
flexflow_model_t model_handle,
flexflow_request_manager_t rm_handle,
int total_num_requests);

void flexflow_inference_manager_spec_inference_loop(
flexflow_inference_manager_t handle,
flexflow_model_t model_handle,
flexflow_request_manager_t rm_handle,
int total_num_requests,
int num_ssms,
int *ssm_model_ids);
flexflow_inference_manager_t handle_, flexflow_model_t model_handle);

// -----------------------------------------------------------------------
// FileDataLoader
// -----------------------------------------------------------------------

flexflow_file_data_loader_t
flexflow_file_data_loader_create(char const *weight_file_path,
int num_heads,
int hidden_dim,
int qkv_inner_dim);

void flexflow_file_data_loader_destroy(flexflow_file_data_loader_t handle_);

void flexflow_file_data_loader_load_weights(flexflow_file_data_loader_t handle_,
flexflow_model_t model_handle_,
int num_layers,
char const **layer_names,
flexflow_op_t *layers,
bool use_full_precision);

#ifdef __cplusplus
}
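Put together, the new entry points are enough to drive inference end to end from C or C++. The sketch below is assembled from the signatures above and is only illustrative: the weight path, head/hidden dimensions, output path, and prompt are made-up placeholders, and error handling is omitted.

```cpp
#include "flexflow/flexflow_c.h"

// Minimal hypothetical driver over the new C API.
void drive_inference(flexflow_model_t model, char const *tokenizer_path) {
  // The request manager is now a process-wide singleton rather than
  // user-constructed.
  flexflow_request_manager_t rm =
      flexflow_request_manager_get_request_manager();
  flexflow_request_manager_register_tokenizer(rm, LLAMA, tokenizer_path);
  flexflow_request_manager_register_output_filepath(rm, "/tmp/output.txt");

  flexflow_inference_manager_t im =
      flexflow_inference_manager_get_inference_manager();
  flexflow_inference_manager_compile_model_and_allocate_buffer(im, model);
  flexflow_inference_manager_init_operators_inference(im, model);

  // Dimensions below are placeholders; they must match the checkpoint.
  flexflow_file_data_loader_t loader = flexflow_file_data_loader_create(
      "/path/to/weights", /*num_heads=*/32, /*hidden_dim=*/4096,
      /*qkv_inner_dim=*/128);
  // ... flexflow_file_data_loader_load_weights(loader, model, ...) with the
  // model's layer names and handles ...

  flexflow_generation_result_t result = flexflow_model_generate(
      model, "Hello world", /*max_seq_length=*/128);
  (void)result;
  flexflow_file_data_loader_destroy(loader);
}
```

The Python interface added by this PR wraps these same calls through cffi.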
1 change: 0 additions & 1 deletion inference/file_loader.cc
@@ -709,7 +709,6 @@ void FileDataLoader::load_weights(
if (weight == NULL) {
continue;
}

switch (weight->data_type) {
case DT_HALF:
load_single_weight_tensor<half>(ff, weight, i, v.first);
43 changes: 0 additions & 43 deletions inference/flexflow_inference.py

This file was deleted.

5 changes: 0 additions & 5 deletions inference/incr_decoding/incr_decoding.cc
@@ -151,11 +151,6 @@ void FlexFlow::top_level_task(Task const *task,
RequestManager *rm = RequestManager::get_request_manager();
rm->register_tokenizer(model_type, file_paths.tokenizer_file_path);
rm->register_output_filepath(file_paths.output_file_path);
// InferenceManager im(ffconfig, BatchConfig::MAX_NUM_TOKENS);
// RequestManager rm(model_type,
// file_paths.tokenizer_file_path,
// /*verbose*/ verbose,
// file_paths.output_file_path);

FFModel model(ffconfig, ffconfig.cpu_offload);
if (model_type == ModelType::LLAMA) {
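The commented-out block removed above is the old constructor-based setup; under the new API both managers are singletons. A condensed sketch of the flow that remains in `top_level_task` (the final `generate` call is our abbreviation of the rest of the file, based on the C API above, not a verbatim excerpt):

```cpp
// Obtain the process-wide request manager and wire up tokenizer and output.
RequestManager *rm = RequestManager::get_request_manager();
rm->register_tokenizer(model_type, file_paths.tokenizer_file_path);
rm->register_output_filepath(file_paths.output_file_path);

// Build the model graph, then generate; incremental decoding runs inside.
FFModel model(ffconfig, ffconfig.cpu_offload);
// ... create the LLAMA / OPT / FALCON graph ...
// GenerationResult result = model.generate(prompt, /*max_seq_length=*/128);
```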
6 changes: 1 addition & 5 deletions inference/models/falcon.cc
@@ -62,11 +62,7 @@ void FALCON::create_falcon_model(FFModel &ff,
Layer *embedding = ff.layers.back();
weights_layers.emplace("tok_embeddings_weight", embedding);

int num_transformer_layers = falcon_config.n_layers;
int num_transformer_layers_per_stage =
(num_transformer_layers + num_pipeline_stages - 1) / num_pipeline_stages;

for (int i = 0; i < num_transformer_layers; i++) {
for (int i = 0; i < falcon_config.n_layers; i++) {
// set transformer layer id
ff.set_transformer_layer_id(i);
// step 1: attention
6 changes: 3 additions & 3 deletions inference/models/opt.cc
@@ -119,7 +119,7 @@ void OPT::create_opt_model(FFModel &ff,
NULL,
false,
/*scaling query*/ true,
/*sacling factor*/
/*scaling factor*/
pow((opt_config.hidden_size / opt_config.num_attention_heads),
-0.5),
/*qk_prod_scaling*/ false);
@@ -140,7 +140,7 @@ void OPT::create_opt_model(FFModel &ff,
NULL,
false,
/*scaling query*/ true,
/*sacling factor*/
/*scaling factor*/
pow((opt_config.hidden_size / opt_config.num_attention_heads),
-0.5),
/*qk_prod_scaling*/ false);
@@ -161,7 +161,7 @@ void OPT::create_opt_model(FFModel &ff,
NULL,
false,
/*scaling query*/ true,
/*sacling factor*/
/*scaling factor*/
pow((opt_config.hidden_size / opt_config.num_attention_heads),
-0.5),
/*qk_prod_scaling*/ false);
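On the corrected comment: OPT pre-scales the query by 1/sqrt(head_dim) (`scaling_query = true`) and disables `qk_prod_scaling` so the same factor is not applied a second time inside the QK^T product. A small sketch of the arithmetic (the helper function is ours, not the PR's):

```cpp
#include <cmath>

// Query pre-scaling factor used above: 1/sqrt(head_dim). With
// scaling_query=true the factor is folded into Q up front, so
// qk_prod_scaling=false avoids scaling QK^T twice.
float opt_scaling_factor(int hidden_size, int num_attention_heads) {
  int head_dim = hidden_size / num_attention_heads;      // e.g. 768 / 12 = 64
  return std::pow(static_cast<float>(head_dim), -0.5f);  // 1/sqrt(64) = 0.125
}
```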