make inc_decoding work
jiazhihao committed Jul 23, 2023
1 parent 781fe21 commit 8a58aed
Showing 22 changed files with 413 additions and 330 deletions.
1 change: 1 addition & 0 deletions examples/cpp/inference/dataloader.cu
@@ -15,6 +15,7 @@
 
 #include "dataloader.h"
 #include "flexflow/inference.h"
+#include "flexflow/request_manager.h"
 #include "flexflow/utils/cuda_helper.h"
 
 void DataLoader::load_input(Task const *task,
1 change: 1 addition & 0 deletions examples/cpp/inference/mixture_of_experts/moe.cc
@@ -15,6 +15,7 @@
 
 #include "moe.h"
 #include "flexflow/inference.h"
+#include "flexflow/request_manager.h"
 #include <cstdlib>
 #include <fstream>
 #include <iostream>
1 change: 1 addition & 0 deletions examples/cpp/inference/transformers/transformers.cc
@@ -15,6 +15,7 @@
 
 #include "transformers.h"
 #include "flexflow/inference.h"
+#include "flexflow/request_manager.h"
 #include <cstdlib>
 #include <fstream>
 #include <iostream>
2 changes: 1 addition & 1 deletion include/flexflow/batch_config.h
@@ -52,7 +52,7 @@ class BatchConfig {
 void print() const;
 virtual InferenceMode get_mode() const;
 static BatchConfig const *from_future(BatchConfigFuture const &future);
-static int const MAX_NUM_REQUESTS = 16;
+static int const MAX_NUM_REQUESTS = 1;
 static int const MAX_NUM_TOKENS = 64;
 static int const MAX_SEQ_LENGTH = 256;
 
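Dropping MAX_NUM_REQUESTS from 16 to 1 restricts a BatchConfig to a single active request per incremental-decoding step. As a rough, hypothetical illustration of why such compile-time limits exist (the real BatchConfig members are not shown in this diff), limits like these typically size fixed per-batch arrays so the whole config stays trivially copyable between tasks:

    // Hypothetical sketch only; member names are illustrative, not FlexFlow's.
    struct BatchConfigSketch {
      static int const MAX_NUM_REQUESTS = 1;  // one request per batch after this commit
      static int const MAX_NUM_TOKENS = 64;   // cap on tokens processed in a single step
      static int const MAX_SEQ_LENGTH = 256;  // cap on a request's total sequence length

      int num_tokens = 0;
      struct PerTokenInfo {
        int request_index;         // which request the token belongs to (< MAX_NUM_REQUESTS)
        int abs_depth_in_request;  // the token's position within that request
        int token_id;
      } tokens_info[MAX_NUM_TOKENS];  // fixed-size array, no heap allocation
    };
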
216 changes: 0 additions & 216 deletions include/flexflow/inference.h
@@ -14,20 +14,10 @@
*/

#pragma once

#include "flexflow/batch_config.h"
#include "flexflow/model.h"
#include <future>
#include <mutex>
#include <tokenizers_cpp.h>

namespace FlexFlow {

class FFModel;
class BeamTree;
class RequestManager;
using tokenizers::Tokenizer;

struct SamplingConfig {
bool do_sample = false;
float temperature = 0.8;
@@ -50,210 +40,4 @@ struct GenerationResult {
std::vector<TokenId> output_tokens;
};

class InferenceManager {
public:
InferenceManager(FFConfig const &config, int max_num_tokens_per_batch);
static InferenceManager *get_inference_manager();
void compile_model_and_allocate_buffer(FFModel *model);
void init_operators_inference(FFModel *model);
MachineView *get_machine_view(int mv_id);
Legion::FutureMap inference(FFModel *model, int index, BatchConfig const &bc);
Legion::FutureMap
inference(FFModel *model, int index, BatchConfigFuture const &bc);
void load_input_tokens_from_batch_config(BatchConfigFuture const &bc,
ParallelTensor const input);
void load_positions(BatchConfigFuture const &bc,
ParallelTensor position_input);
void incr_decoding_loop(FFModel *model,
RequestManager &rm,
int total_num_requests);
void spec_inference_loop(FFModel *model,
RequestManager &rm,
int total_num_requests,
std::vector<int> ssm_model_ids);

public:
FFConfig ff_config;
std::unordered_map<ParallelTensor, std::vector<ParallelTensor>> tensor_buffer;
int max_num_tokens_per_batch;
int num_devices;
std::vector<MachineView> machine_views;
};

struct Request {
BatchConfig::RequestGuid guid;
int max_sequence_length;
int initial_len;
std::vector<BatchConfig::TokenId> tokens;

std::vector<struct BeamTree> beam_trees;
std::promise<GenerationResult> *promise;
};

// store the result of beam search
struct BeamTree {
struct treeLayer {
BeamSearchBatchConfig::TokenId
tokens[BeamSearchBatchConfig::MAX_BEAM_WIDTH];
int parent_ids[BeamSearchBatchConfig::MAX_BEAM_WIDTH];
float probs[BeamSearchBatchConfig::MAX_BEAM_WIDTH];
};
treeLayer treeLayers[BeamSearchBatchConfig::MAX_BEAM_DEPTH + 1];
};

// struct BeamTree_v2 {
// std::vector<BatchConfig::TokenId> tokens;
// std::vector<int> parent_ids;
// std::vector<float> probs;
// };

class RequestManager {
public:
using RequestGuid = BatchConfig::RequestGuid;
using TokenId = BatchConfig::TokenId;
RequestManager(ModelType model_type,
std::string const &path,
bool verbose = false,
std::string output_filepath = "");
RequestManager();
static RequestManager *get_request_manager();
size_t get_num_processed_requests();

int register_new_model(FFModel *model);
void register_tokenizer(ModelType model_type, std::string const &path);
void register_output_filepath(std::string const &);

FFModel *get_model(int model_id);
void serve(FFModel *model);

static GenerationResult generate(std::string const &text, int max_seq_length);
RequestGuid register_new_request(std::string const &prompt,
int max_sequence_length);
RequestGuid register_new_request(std::vector<TokenId> const &prompt,
int max_sequence_length);
BatchConfig prepare_next_batch(BatchConfig const &bc,
InferenceResult const &result);
BatchConfigFuture prepare_next_batch(BatchConfigFuture const &bc,
InferenceResultFuture const &result);
BeamSearchBatchConfig
prepare_next_batch_beam(BeamSearchBatchConfig const &old_bc,
BeamInferenceResult const &result);
BeamSearchBatchConfigFuture
prepare_next_batch_beam(BeamSearchBatchConfigFuture const &old_bc,
BeamInferenceResultFuture const &result);
BeamSearchBatchConfig
prepare_next_batch_init(TreeVerifyBatchConfig const &old_bc,
InferenceResult const &result,
int model_id);
BeamSearchBatchConfigFuture
prepare_next_batch_init(TreeVerifyBatchConfigFuture const &old_bc,
InferenceResultFuture const &result,
int model_id);
TreeVerifyBatchConfig prepare_next_batch_verify(
std::vector<BeamSearchBatchConfig> const &old_batches);
TreeVerifyBatchConfigFuture prepare_next_batch_verify(
std::vector<BeamSearchBatchConfigFuture> const &old_batches);

void store_beam_metadata(BeamSearchBatchConfig const &old_bc,
BeamInferenceResult const &result);
void update_beam_metadata(BeamSearchBatchConfig &new_bc,
BeamTree &tree,
int request_index);

std::vector<std::pair<BatchConfig::TokenId, int>>
traverse_beam_tree(BeamSearchBatchConfig const &old_bc,
int request_index,
int token_start_offset);

// remove guid after put the cached tree in request
std::vector<std::pair<BatchConfig::TokenId, int>> merge_dfs_trees(
std::vector<std::vector<std::pair<BatchConfig::TokenId, int>>>
input_trees,
int root_depth,
RequestGuid guid);

std::vector<std::pair<BatchConfig::TokenId, int>> traverse_verify_tree(
size_t guid,
std::vector<std::pair<BatchConfig::TokenId, int>> const
&inputSerializedTree,
std::vector<std::pair<BatchConfig::TokenId, int>> const
&outputSerializedTree);

static void
load_tokens_task(Legion::Task const *task,
std::vector<Legion::PhysicalRegion> const &regions,
Legion::Context ctx,
Legion::Runtime *runtime);
static void
load_positions_task(Legion::Task const *task,
std::vector<Legion::PhysicalRegion> const &regions,
Legion::Context ctx,
Legion::Runtime *runtime);

static BatchConfig prepare_next_batch_task(
Legion::Task const *task,
std::vector<Legion::PhysicalRegion> const &regions,
Legion::Context ctx,
Legion::Runtime *runtime);

static BeamSearchBatchConfig prepare_next_batch_beam_task(
Legion::Task const *task,
std::vector<Legion::PhysicalRegion> const &regions,
Legion::Context ctx,
Legion::Runtime *runtime);

static BeamSearchBatchConfig prepare_next_batch_init_task(
Legion::Task const *task,
std::vector<Legion::PhysicalRegion> const &regions,
Legion::Context ctx,
Legion::Runtime *runtime);

static TreeVerifyBatchConfig prepare_next_batch_verify_task(
Legion::Task const *task,
std::vector<Legion::PhysicalRegion> const &regions,
Legion::Context ctx,
Legion::Runtime *runtime);

static void llm_serving_background_task(
Legion::Task const *task,
std::vector<Legion::PhysicalRegion> const &regions,
Legion::Context ctx,
Legion::Runtime *runtime);

private:
std::unique_ptr<Tokenizer> tokenizer_;
bool verbose;
ModelType model_type;
std::string output_filepath;
std::queue<Request> pending_request_queue;
std::unordered_map<RequestGuid, Request> all_requests;
std::unordered_map<RequestGuid, GenerationResult> request_generation_results;
std::mutex request_queue_mutex;
RequestGuid next_available_guid;
const std::map<ModelType, int> model_bos_map = {{ModelType::LLAMA, 0},
{ModelType::OPT, 2}};

// TODO: Move this two vector to request struct
std::unordered_map<RequestGuid,
std::vector<std::pair<BatchConfig::TokenId, int>>>
dfs_tree_inputs;
std::unordered_map<RequestGuid, std::vector<std::pair<int, int>>>
committed_tokens;

// Multi-model support
int num_ssms;
std::vector<FFModel *> models;

// Performance profiling
size_t num_processed_requests;

private:
struct ProfileInfo {
int decoding_steps;
double start_time, finish_time;
};
std::unordered_map<RequestGuid, ProfileInfo> profiling_requests;
double total_request_run_time;
};

} // namespace FlexFlow
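
Everything deleted above (InferenceManager, Request, BeamTree, and RequestManager) moves into the new flexflow/request_manager.h header that the files earlier in this commit now include. For orientation, a minimal sketch of how the incremental-decoding entry points declared in that API might be driven; the prompt text, the request count, and the construction of the FFModel itself are illustrative assumptions, not part of this commit:

    #include "flexflow/model.h"
    #include "flexflow/request_manager.h"

    using namespace FlexFlow;

    // Sketch: drive incremental decoding for one queued prompt. Assumes `llm`
    // was already constructed and had its weights loaded elsewhere.
    void run_incr_decoding(FFModel &llm) {
      RequestManager *rm = RequestManager::get_request_manager();
      InferenceManager *im = InferenceManager::get_inference_manager();

      // One-time setup: place operators and allocate inference buffers.
      im->compile_model_and_allocate_buffer(&llm);
      im->init_operators_inference(&llm);

      // Queue a prompt; the returned RequestGuid identifies it internally.
      rm->register_new_request("The meaning of life is", /*max_sequence_length=*/128);

      // Step the model until every queued request has finished decoding.
      im->incr_decoding_loop(&llm, *rm, /*total_num_requests=*/1);
    }
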
5 changes: 5 additions & 0 deletions include/flexflow/model.h
@@ -17,6 +17,7 @@
 #include "accessor.h"
 #include "config.h"
 #include "device.h"
+#include "flexflow/inference.h"
 #include "flexflow/memory_optimization.h"
 #include "flexflow/node.h"
 #include "flexflow/operator_params.h"
@@ -698,6 +699,10 @@ class FFModel {
 float scaling_factor = 1.0f,
 bool qk_prod_scaling = true,
 char const *name = NULL);
+// ========================================
+// Inference APIs
+// ========================================
+GenerationResult generate(std::string const &text, int max_seq_length);
 
 Tensor create_tensor_legion_ordering(int num_dim,
 int const dims[],
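The new generate() method is the user-facing entry point for one-shot inference on a prompt. A hedged usage sketch; the surrounding model setup and any GenerationResult fields beyond output_tokens are assumptions that this diff does not show:

    #include "flexflow/model.h"
    #include <iostream>

    // Sketch: assumes `llm` was already built and compiled for inference;
    // those steps are outside this commit.
    void print_completion(FlexFlow::FFModel &llm) {
      FlexFlow::GenerationResult result =
          llm.generate("Deep learning is", /*max_seq_length=*/64);

      // output_tokens is the only GenerationResult field visible in this diff.
      for (auto token_id : result.output_tokens) {
        std::cout << token_id << " ";
      }
      std::cout << std::endl;
    }
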
1 change: 1 addition & 0 deletions include/flexflow/ops/inc_multihead_self_attention.h
@@ -1,6 +1,7 @@
 #ifndef _FLEXFLOW_INC_MULTIHEAD_SELF_ATTENTION_H
 #define _FLEXFLOW_INC_MULTIHEAD_SELF_ATTENTION_H
 
+#include "flexflow/accessor.h"
 #include "flexflow/device.h"
 #include "flexflow/fftype.h"
 #include "flexflow/inference.h"
1 change: 1 addition & 0 deletions include/flexflow/ops/inc_multiquery_self_attention.h
@@ -1,6 +1,7 @@
 #ifndef _FLEXFLOW_INC_MULTIQUERY_ATTENTION_H
 #define _FLEXFLOW_INC_MULTIQUERY_ATTENTION_H
 
+#include "flexflow/accessor.h"
 #include "flexflow/device.h"
 #include "flexflow/fftype.h"
 #include "flexflow/inference.h"
1 change: 1 addition & 0 deletions include/flexflow/ops/spec_inc_multihead_self_attention.h
@@ -1,6 +1,7 @@
 #ifndef _FLEXFLOW_SPEC_INC_MULTIHEAD_SELF_ATTENTION_H
 #define _FLEXFLOW_SPEC_INC_MULTIHEAD_SELF_ATTENTION_H
 
+#include "flexflow/accessor.h"
 #include "flexflow/device.h"
 #include "flexflow/fftype.h"
 #include "flexflow/inference.h"
2 changes: 2 additions & 0 deletions include/flexflow/ops/tree_inc_multihead_self_attention.h
@@ -1,6 +1,7 @@
 #ifndef _FLEXFLOW_INC_MULTIHEAD_SELF_ATTENTION_VERIFY_H
 #define _FLEXFLOW_INC_MULTIHEAD_SELF_ATTENTION_VERIFY_H
 
+#include "flexflow/accessor.h"
 #include "flexflow/device.h"
 #include "flexflow/fftype.h"
 #include "flexflow/inference.h"
@@ -9,6 +10,7 @@
 #include "flexflow/op_meta.h"
 #include "flexflow/operator.h"
 #include "flexflow/ops/inc_multihead_self_attention.h"
+#include "flexflow/ops/tree_inc_multihead_self_attention_params.h"
 #include "math.h"
 #include <cfloat>
 #include <complex>
(Diffs for the remaining changed files were not loaded.)
