
Commit

compiled
jiazhihao committed Jul 23, 2023
1 parent bff1b54 commit 781fe21
Showing 14 changed files with 252 additions and 123 deletions.
2 changes: 1 addition & 1 deletion config/config.linux
@@ -75,7 +75,7 @@ FF_USE_AVX2=${FF_USE_AVX2:-OFF}
FF_MAX_DIM=${FF_MAX_DIM:-5}

# set LEGION_MAX_RETURN_SIZE
LEGION_MAX_RETURN_SIZE=${LEGION_MAX_RETURN_SIZE:-131072}
LEGION_MAX_RETURN_SIZE=${LEGION_MAX_RETURN_SIZE:-262144}

# set ROCM path
ROCM_PATH=${ROCM_PATH:-"/opt/rocm"}
52 changes: 39 additions & 13 deletions include/flexflow/inference.h
@@ -17,6 +17,7 @@

#include "flexflow/batch_config.h"
#include "flexflow/model.h"
#include <future>
#include <mutex>
#include <tokenizers_cpp.h>

@@ -27,9 +28,32 @@ class BeamTree;
class RequestManager;
using tokenizers::Tokenizer;

struct SamplingConfig {
bool do_sample = false;
float temperature = 0.8;
float topp = 0.6;
SamplingConfig(bool _do_sample, float _temperature, float _topp) {
temperature = _temperature > 0 ? _temperature : temperature;
topp = _topp > 0 ? _topp : topp;
do_sample = _do_sample;
}
SamplingConfig() {}
};

struct GenerationResult {
using RequestGuid = BatchConfig::RequestGuid;
using TokenId = BatchConfig::TokenId;
RequestGuid guid;
std::string input_text;
std::string output_text;
std::vector<TokenId> input_tokens;
std::vector<TokenId> output_tokens;
};

class InferenceManager {
public:
InferenceManager(FFConfig const &config, int max_num_tokens_per_batch);
static InferenceManager *get_inference_manager();
void compile_model_and_allocate_buffer(FFModel *model);
void init_operators_inference(FFModel *model);
MachineView *get_machine_view(int mv_id);
@@ -47,6 +71,7 @@ class InferenceManager {
RequestManager &rm,
int total_num_requests,
std::vector<int> ssm_model_ids);

public:
FFConfig ff_config;
std::unordered_map<ParallelTensor, std::vector<ParallelTensor>> tensor_buffer;
@@ -62,6 +87,7 @@ struct Request {
std::vector<BatchConfig::TokenId> tokens;

std::vector<struct BeamTree> beam_trees;
std::promise<GenerationResult> *promise;
};

// store the result of beam search
@@ -75,18 +101,6 @@ struct BeamTree {
treeLayer treeLayers[BeamSearchBatchConfig::MAX_BEAM_DEPTH + 1];
};

struct SamplingConfig {
bool do_sample = false;
float temperature = 0.8;
float topp = 0.6;
SamplingConfig(bool _do_sample, float _temperature, float _topp) {
temperature = _temperature > 0 ? _temperature : temperature;
topp = _topp > 0 ? _topp : topp;
do_sample = _do_sample;
}
SamplingConfig() {}
};

// struct BeamTree_v2 {
// std::vector<BatchConfig::TokenId> tokens;
// std::vector<int> parent_ids;
@@ -102,12 +116,17 @@ class RequestManager {
bool verbose = false,
std::string output_filepath = "");
RequestManager();
static RequestManager *get_request_manager();
size_t get_num_processed_requests();

int register_new_model(FFModel *model);
void register_tokenizer(ModelType model_type, std::string const &path);
void register_output_filepath(std::string const &);

FFModel *get_model(int model_id);
void serve(FFModel *model);

static GenerationResult generate(std::string const &text, int max_seq_length);
RequestGuid register_new_request(std::string const &prompt,
int max_sequence_length);
RequestGuid register_new_request(std::vector<TokenId> const &prompt,
@@ -195,13 +214,20 @@ class RequestManager {
Legion::Context ctx,
Legion::Runtime *runtime);

static void llm_serving_background_task(
Legion::Task const *task,
std::vector<Legion::PhysicalRegion> const &regions,
Legion::Context ctx,
Legion::Runtime *runtime);

private:
std::unique_ptr<Tokenizer> tokenizer_;
bool verbose;
ModelType model_type;
std::string output_filepath;
std::queue<Request> pending_request_queue;
std::unordered_map<RequestGuid, Request> running_request_queue;
std::unordered_map<RequestGuid, Request> all_requests;
std::unordered_map<RequestGuid, GenerationResult> request_generation_results;
std::mutex request_queue_mutex;
RequestGuid next_available_guid;
const std::map<ModelType, int> model_bos_map = {{ModelType::LLAMA, 0},
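For context on the header changes above: the per-instance InferenceManager/RequestManager constructors are supplemented with process-wide accessors (get_inference_manager() / get_request_manager()) and a static RequestManager::generate() that returns a GenerationResult, presumably synchronously given the by-value return. A minimal caller-side sketch, based only on the declarations in this diff (the tokenizer path, output path, prompt, and 128-token limit are placeholders):

#include <cstdio>
#include "flexflow/inference.h"

void run_one_prompt(FlexFlow::ModelType model_type) {
  using namespace FlexFlow;
  // Obtain the process-wide request manager instead of constructing one.
  RequestManager *rm = RequestManager::get_request_manager();
  rm->register_tokenizer(model_type, "/path/to/tokenizer");   // placeholder path
  rm->register_output_filepath("/path/to/output.txt");        // placeholder path

  // Submit a prompt and wait for the decoded text.
  GenerationResult result =
      RequestManager::generate("Hello world", 128 /*max_sequence_length*/);
  printf("%s\n", result.output_text.c_str());
}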
1 change: 1 addition & 0 deletions include/flexflow/model.h
@@ -230,6 +230,7 @@ enum TaskIDs {
RM_PREPARE_NEXT_BATCH_BEAM_TASK_ID,
RM_PREPARE_NEXT_BATCH_INIT_TASK_ID,
RM_PREPARE_NEXT_BATCH_VERIFY_TASK_ID,
RM_LLM_SERVING_BACKGROUND_TASK_ID,
// Custom tasks
CUSTOM_GPU_TASK_ID_FIRST,
CUSTOM_GPU_TASK_ID_1,
71 changes: 14 additions & 57 deletions inference/incr_decoding/incr_decoding.cc
@@ -40,10 +40,7 @@ void parse_input_args(char **argv,
bool &verbose,
bool &do_sample,
float &temperature,
float &topp,
int &data_parallelism_degree,
int &tensor_parallelism_degree,
int &pipeline_parallelism_degree) {
float &topp) {
for (int i = 1; i < argc; i++) {
// llm model type
if (!strcmp(argv[i], "-llm-model")) {
@@ -88,21 +85,6 @@
paths.output_file_path = std::string(argv[++i]);
continue;
}
// data parallelism degree
if (!strcmp(argv[i], "-data-parallelism-degree")) {
data_parallelism_degree = std::stoi(argv[++i]);
continue;
}
// tensor parallelism degree
if (!strcmp(argv[i], "-tensor-parallelism-degree")) {
tensor_parallelism_degree = std::stoi(argv[++i]);
continue;
}
// pipeline parallelism degree
if (!strcmp(argv[i], "-pipeline-parallelism-degree")) {
pipeline_parallelism_degree = std::stoi(argv[++i]);
continue;
}
if (!strcmp(argv[i], "--use-full-precision")) {
use_full_precision = true;
continue;
@@ -143,8 +125,6 @@ void FlexFlow::top_level_task(Task const *task,
float temperature = 0.0f;
float topp = 0.0f;
size_t num_devices = ffconfig.workersPerNode * ffconfig.numNodes;
int data_parallelism_degree = 1, tensor_parallelism_degree = 1,
pipeline_parallelism_degree = 1;

InputArgs const &command_args = HighLevelRuntime::get_input_args();
char **argv = command_args.argv;
@@ -157,47 +137,41 @@ void FlexFlow::top_level_task(Task const *task,
verbose,
do_sample,
temperature,
topp,
data_parallelism_degree,
tensor_parallelism_degree,
pipeline_parallelism_degree);
ffconfig.data_parallelism_degree = data_parallelism_degree;
ffconfig.tensor_parallelism_degree = tensor_parallelism_degree;
ffconfig.pipeline_parallelism_degree = pipeline_parallelism_degree;
topp);

assert(data_parallelism_degree * tensor_parallelism_degree *
pipeline_parallelism_degree ==
assert(ffconfig.data_parallelism_degree * ffconfig.tensor_parallelism_degree *
ffconfig.pipeline_parallelism_degree ==
ffconfig.numNodes * ffconfig.workersPerNode);

assert(model_type != ModelType::UNKNOWN &&
"Invalid LLM model type passed (or no type was passed).");

SamplingConfig samplingConfig(do_sample, temperature, topp);
InferenceManager im(ffconfig, BatchConfig::MAX_NUM_TOKENS);
RequestManager rm(model_type,
file_paths.tokenizer_file_path,
/*verbose*/ verbose,
file_paths.output_file_path);
RequestManager *rm = RequestManager::get_request_manager();
rm->register_tokenizer(model_type, file_paths.tokenizer_file_path);
rm->register_output_filepath(file_paths.output_file_path);
// InferenceManager im(ffconfig, BatchConfig::MAX_NUM_TOKENS);
// RequestManager rm(model_type,
// file_paths.tokenizer_file_path,
// /*verbose*/ verbose,
// file_paths.output_file_path);

FFModel model(ffconfig, ffconfig.cpu_offload);
if (model_type == ModelType::LLAMA) {
LLAMA::create_llama_model(model,
im,
file_paths.llm_config_file_path,
file_paths.llm_weight_file_path,
INC_DECODING_MODE,
samplingConfig,
use_full_precision);
} else if (model_type == ModelType::OPT) {
OPT::create_opt_model(model,
im,
file_paths.llm_config_file_path,
file_paths.llm_weight_file_path,
INC_DECODING_MODE,
use_full_precision);
} else if (model_type == ModelType::FALCON) {
FALCON::create_falcon_model(model,
im,
file_paths.llm_config_file_path,
file_paths.llm_weight_file_path,
ffconfig.workersPerNode * ffconfig.numNodes,
@@ -220,27 +194,10 @@ void FlexFlow::top_level_task(Task const *task,
std::string text = prompt.get<std::string>();
printf("Prompt[%d]: %s\n", total_num_requests, text.c_str());
total_num_requests++;
rm.register_new_request(text, 128 /*max_sequence_length*/);
}
}

BatchConfig bc;
InferenceResult ir;
BatchConfigFuture bcf = Future::from_value<BatchConfig>(bc);
InferenceResultFuture irf = Future::from_value<InferenceResult>(ir);
while (rm.get_num_processed_requests() < total_num_requests) {
// bc = rm.prepare_next_batch(bc, ir);
bcf = rm.prepare_next_batch(bcf, irf);
if (rm.get_num_processed_requests() >= total_num_requests) {
break;
GenerationResult result =
RequestManager::generate(text, 128 /*max_sequence_length*/);
}
FutureMap fm = im.inference(&model, 0, bcf);
assert(fm.get_future_map_domain().get_volume() == 1);
// Future future = fm.get_future(0);
// ir = future.get_result<InferenceResult>();
irf = fm.get_future(0);
}
// im.incr_decoding_loop(&model, rm, total_num_requests);

// Execution fence
{
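The rewritten top_level_task above replaces the explicit BatchConfig/FutureMap loop with one RequestManager::generate() call per prompt; together with the new Request::promise field and the llm_serving_background_task declared in inference.h, this suggests a handoff in which generate() blocks on a future while a background task decodes and eventually fulfils the promise. The request_manager.cc side is not part of this excerpt, so the following is only a generic std::promise/std::future sketch of that pattern; everything except the GenerationResult name is hypothetical:

#include <future>
#include <iostream>
#include <string>
#include <thread>

struct GenerationResult {        // reduced to one field for the sketch
  std::string output_text;
};

int main() {
  std::promise<GenerationResult> promise;               // owned by the caller of generate()
  std::future<GenerationResult> future = promise.get_future();

  // Stand-in for the background serving task: it keeps a pointer to the
  // promise (cf. Request::promise) and fulfils it once decoding is done.
  std::thread background([p = &promise] {
    GenerationResult result;
    result.output_text = "decoded text would go here";
    p->set_value(std::move(result));                    // wakes the blocked caller
  });

  // The caller blocks here until the background task publishes the result.
  GenerationResult result = future.get();
  std::cout << result.output_text << std::endl;

  background.join();
  return 0;
}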
6 changes: 3 additions & 3 deletions inference/models/falcon.cc
@@ -20,7 +20,6 @@ namespace FlexFlow {
using namespace Legion;

void FALCON::create_falcon_model(FFModel &ff,
InferenceManager &im,
std::string const &model_config_file_path,
std::string const &weight_file_path,
int num_pipeline_stages,
@@ -141,7 +140,8 @@ void FALCON::create_falcon_model(FFModel &ff,

// Compile the model
std::cout << "------start compile ----------" << std::endl;
im.compile_model_and_allocate_buffer(&ff);
InferenceManager *im = InferenceManager::get_inference_manager();
im->compile_model_and_allocate_buffer(&ff);
FileDataLoader fileloader("",
weight_file_path,
falcon_config.n_heads,
@@ -151,7 +151,7 @@
std::cout << "------load weight finished----------" << std::endl;

// init operators
im.init_operators_inference(&ff);
im->init_operators_inference(&ff);
}

}; // namespace FlexFlow
1 change: 0 additions & 1 deletion inference/models/falcon.h
@@ -104,7 +104,6 @@ class FALCON {
};

static void create_falcon_model(FFModel &ff,
InferenceManager &im,
std::string const &model_config_file_path,
std::string const &weight_file_path,
int num_pipeline_stages,
6 changes: 3 additions & 3 deletions inference/models/llama.cc
@@ -20,7 +20,6 @@ namespace FlexFlow {
using namespace Legion;

void LLAMA::create_llama_model(FFModel &ff,
InferenceManager &im,
std::string const &model_config_file_path,
std::string const &weight_file_path,
InferenceMode mode,
@@ -188,9 +187,10 @@ void LLAMA::create_llama_model(FFModel &ff,
}
}

InferenceManager *im = InferenceManager::get_inference_manager();
// Compile the model
std::cout << "------start compile ----------" << std::endl;
im.compile_model_and_allocate_buffer(&ff);
im->compile_model_and_allocate_buffer(&ff);
FileDataLoader fileloader("",
weight_file_path,
llama_config.n_heads,
@@ -200,7 +200,7 @@
std::cout << "------load weight finished----------" << std::endl;

// init operators
im.init_operators_inference(&ff);
im->init_operators_inference(&ff);
}

}; // namespace FlexFlow
1 change: 0 additions & 1 deletion inference/models/llama.h
@@ -103,7 +103,6 @@ class LLAMA {
};

static void create_llama_model(FFModel &ff,
InferenceManager &im,
std::string const &model_config_file_path,
std::string const &weight_file_path,
InferenceMode mode,
6 changes: 3 additions & 3 deletions inference/models/opt.cc
@@ -20,7 +20,6 @@ namespace FlexFlow {
using namespace Legion;

void OPT::create_opt_model(FFModel &ff,
InferenceManager &im,
std::string const &model_config_file_path,
std::string const &weight_file_path,
InferenceMode mode,
@@ -222,7 +221,8 @@ void OPT::create_opt_model(FFModel &ff,

//------------------- compile the model --------------------------------
std::cout << "------start compile ----------" << std::endl;
im.compile_model_and_allocate_buffer(&ff);
InferenceManager *im = InferenceManager::get_inference_manager();
im->compile_model_and_allocate_buffer(&ff);
FileDataLoader fileloader("",
weight_file_path,
opt_config.num_attention_heads,
@@ -231,7 +231,7 @@
opt_config.num_attention_heads);
fileloader.load_weights(&ff, weights_layers, use_full_precision);
std::cout << "------finished loading weights----------" << std::endl;
im.init_operators_inference(&ff);
im->init_operators_inference(&ff);
}

}; // namespace FlexFlow
1 change: 0 additions & 1 deletion inference/models/opt.h
@@ -105,7 +105,6 @@ class OPT {
};

static void create_opt_model(FFModel &ff,
InferenceManager &im,
std::string const &model_config_file_path,
std::string const &weight_file_path,
InferenceMode mode,