[SpecInfer] Update RequestManager #1096

Merged: 43 commits, merged Sep 25, 2023

Commits
6451a3b  Reorder pipeline. (Aug 18, 2023)
6613f90  Merge branch 'inference' into update_rm (zwang86, Aug 18, 2023)
1de3e21  refactor and small fixes. (Aug 22, 2023)
46344ed  Merge branch 'inference' into update_rm (zwang86, Aug 22, 2023)
a15d814  Merge branch 'inference' into update_rm (zwang86, Aug 23, 2023)
c37235f  Update (zwang86, Aug 25, 2023)
3c93dbf  Merge branch 'inference' into update_rm (zwang86, Aug 28, 2023)
d18926f  Refactor backup. (zwang86, Sep 5, 2023)
99bb696  pipeline update. (zwang86, Sep 5, 2023)
83ae640  Merge branch 'inference' into update_rm_backup (zwang86, Sep 5, 2023)
e6f2474  Format. (zwang86, Sep 5, 2023)
c758c9f  fix (xinhaoc, Sep 7, 2023)
0b6b146  . (xinhaoc, Sep 7, 2023)
709ce3c  Merge branch 'inference' into update_rm_backup (zwang86, Sep 7, 2023)
683c283  fix (xinhaoc, Sep 11, 2023)
d44c1a1  fix (xinhaoc, Sep 11, 2023)
35a33e5  fix. (zwang86, Sep 11, 2023)
0d7524a  Fix reloading new request with long prompts. (zwang86, Sep 11, 2023)
7c8227d  Fix edge cases. (zwang86, Sep 11, 2023)
230e0bc  Fix edge case (zwang86, Sep 11, 2023)
9ed2684  fix (zwang86, Sep 11, 2023)
87ef9cb  try a fix to CI (xinhaoc, Sep 12, 2023)
8898493  . (xinhaoc, Sep 12, 2023)
e328e2d  fix (xinhaoc, Sep 12, 2023)
960e938  Merge branch 'inference' into update_rm_backup (zwang86, Sep 12, 2023)
3a25189  Fix: clean up code and fix decoding_steps. (zwang86, Sep 13, 2023)
c66a205  Merge branch 'update_rm_backup' of https://github.com/flexflow/FlexFl… (zwang86, Sep 13, 2023)
c7f1b9e  try 1 try (xinhaoc, Sep 14, 2023)
55eb913  fix: allow parse 0 tokens for pending request. (zwang86, Sep 16, 2023)
b88c4de  format. (zwang86, Sep 16, 2023)
abcf94f  remove comment tests (xinhaoc, Sep 16, 2023)
2327316  Merge branch 'inference' into update_rm_backup (xinhaoc, Sep 18, 2023)
66ee367  remove print. (zwang86, Sep 19, 2023)
8e4fe9a  Merge branch 'update_rm_backup' of https://github.com/flexflow/FlexFl… (zwang86, Sep 19, 2023)
2769dcb  Merge branch 'inference' into update_rm_backup (jiazhihao, Sep 21, 2023)
bf382b4  Merge branch 'inference' into update_rm_backup (xinhaoc, Sep 24, 2023)
801c56c  fix decoding steps (xinhaoc, Sep 24, 2023)
1d18fce  . (xinhaoc, Sep 24, 2023)
aed8850  quick fix. (zwang86, Sep 25, 2023)
6638cd3  Merge branch 'inference' into update_rm_backup (zwang86, Sep 25, 2023)
a39fb5b  remove debugging prints. (zwang86, Sep 25, 2023)
84a6fba  fix store_beam_metadata. (zwang86, Sep 25, 2023)
59acaeb  hip (xinhaoc, Sep 25, 2023)

Files changed
include/flexflow/batch_config.h (3 additions, 2 deletions)
@@ -46,13 +46,14 @@ class BatchConfig {
void print() const;
virtual InferenceMode get_mode() const;
static BatchConfig const *from_future(BatchConfigFuture const &future);
- static int const MAX_NUM_REQUESTS = 1;
+ static int const MAX_NUM_REQUESTS = 4;
static int const MAX_NUM_TOKENS = 64;
static int const MAX_PROMPT_LENGTH = 62;
static int const MAX_SEQ_LENGTH = 256;

// These are set by update
int num_tokens;
+ bool loading_prompt = false;

struct PerRequestInfo {
int token_start_offset;
@@ -69,6 +70,7 @@
PerTokenInfo tokensInfo[MAX_NUM_TOKENS];

bool request_completed[MAX_NUM_REQUESTS];
+ bool request_running[MAX_NUM_TOKENS];
};

class TreeVerifyBatchConfig : public BatchConfig {
@@ -113,7 +115,6 @@ class BeamSearchBatchConfig : public BatchConfig {
inline static int const MAX_BEAM_DEPTH = 8;

int model_id;
- int max_init_length = 0;

struct BeamSearchPerRequestInfo {
int beam_size;
include/flexflow/model.h (3 additions, 2 deletions)
@@ -239,8 +239,8 @@ enum TaskIDs {
RM_LOAD_TOKENS_TASK_ID,
RM_LOAD_POSITION_TASK_ID,
RM_PREPARE_NEXT_BATCH_TASK_ID,
- RM_PREPARE_NEXT_BATCH_BEAM_TASK_ID,
RM_PREPARE_NEXT_BATCH_INIT_TASK_ID,
+ RM_PREPARE_NEXT_BATCH_BEAM_TASK_ID,
RM_PREPARE_NEXT_BATCH_VERIFY_TASK_ID,
// Custom tasks
CUSTOM_GPU_TASK_ID_FIRST,
@@ -787,7 +787,8 @@ class FFModel {
// ========================================
// Inference APIs
// ========================================
- GenerationResult generate(std::string const &text, int max_seq_length);
+ GenerationResult generate(std::vector<std::string> &prompts,
+                           int max_seq_length);

Tensor create_tensor_legion_ordering(int num_dim,
int const dims[],
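Note on the FFModel::generate change above: the API now takes a batch of prompts rather than a single string. Below is a minimal usage sketch, assuming an FFModel that has already been built and compiled elsewhere (for example, as in inference/incr_decoding/incr_decoding.cc); the run_batch helper and the prompt strings are illustrative and not part of this PR.

#include <string>
#include <vector>
#include "flexflow/model.h"

using namespace FlexFlow;

// Illustrative helper (not from this PR): drive one batched generation call.
void run_batch(FFModel &model) {
  std::vector<std::string> prompts;
  prompts.push_back("What is speculative inference?");
  prompts.push_back("Explain beam search in one sentence.");
  // One call now serves every prompt; the RequestManager can keep up to
  // BatchConfig::MAX_NUM_REQUESTS (raised to 4 in this PR) requests in flight.
  GenerationResult result = model.generate(prompts, 128 /*max_sequence_length*/);
  (void)result; // result.output_tokens holds the generated token ids
}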
include/flexflow/ops/kernels/softmax_kernels.h (2 additions, 0 deletions)
@@ -15,8 +15,10 @@ class SoftmaxMeta : public OpMeta {
Legion::Domain const &input_domain);
#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA)
cudnnTensorDescriptor_t inputTensor;
+ cudnnTensorDescriptor_t outputTensor;
#else
miopenTensorDescriptor_t inputTensor;
+ miopenTensorDescriptor_t outputTensor;
#endif
bool profiling;
int dim;
include/flexflow/request_manager.h (9 additions, 5 deletions)
@@ -52,13 +52,17 @@ class InferenceManager {

struct Request {
enum Status {
- PENDING = 101,
- RUNNING = 102,
- COMPLETED = 103,
+ PENDING = 101, // loading prompt
+ RUNNING = 102, // running inference
+ COMPLETED = 103, // finished and verified
+ FINISHING = 104, // finishing request, but not yet verified
};
BatchConfig::RequestGuid guid;
int max_sequence_length;
int initial_len;
+ int ssm_cache_size = 0;
+ int llm_cache_size = 0;

Status status = PENDING;
std::vector<BatchConfig::TokenId> tokens;

@@ -102,10 +106,10 @@
FFModel *get_model(int model_id);

GenerationResult generate_incr_decoding(FFModel *model,
- std::string const &text,
+ std::vector<std::string> &prompts,
int max_seq_length);
GenerationResult generate_spec_infer(FFModel *model,
- std::string const &text,
+ std::vector<std::string> &prompts,
int max_seq_length);
GenerationResult get_generation_result(RequestGuid const &guid);
RequestGuid register_new_request(std::string const &prompt,
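For context on the Request changes above: the enum now documents the request lifecycle and adds a FINISHING state. The helper below is illustrative only, not RequestManager code; it simply spells out the lifecycle stages as described by the enum comments, and assumes the FlexFlow namespace from include/flexflow/request_manager.h.

#include <string>
#include "flexflow/request_manager.h"

using namespace FlexFlow;

// Illustrative only: the stage each status corresponds to, per the comments
// in the header (PENDING -> RUNNING -> FINISHING -> COMPLETED).
std::string describe_status(Request::Status s) {
  switch (s) {
    case Request::PENDING:
      return "pending: prompt tokens are still being loaded";
    case Request::RUNNING:
      return "running: inference is in progress";
    case Request::FINISHING:
      return "finishing: request is wrapping up but not yet verified";
    case Request::COMPLETED:
      return "completed: finished and verified";
  }
  return "unknown";
}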
inference/incr_decoding/incr_decoding.cc (4 additions, 2 deletions)
@@ -242,13 +242,15 @@ void FlexFlow::top_level_task(Task const *task,
/*parser_callback_t */ nullptr,
/*allow_exceptions */ true,
/*ignore_comments */ true);
+ std::vector<std::string> prompts;
for (auto &prompt : prompt_json) {
std::string text = prompt.get<std::string>();
printf("Prompt[%d]: %s\n", total_num_requests, text.c_str());
total_num_requests++;
- GenerationResult result =
- model.generate(text, 128 /*max_sequence_length*/);
+ prompts.push_back(text);
}
+ GenerationResult result =
+ model.generate(prompts, 128 /*max_sequence_length*/);
}

// Execution fence
inference/spec_infer/spec_infer.cc (5 additions, 1 deletion)
@@ -384,12 +384,16 @@ void FlexFlow::top_level_task(Task const *task,
/*parser_callback_t */ nullptr,
/*allow_exceptions */ true,
/*ignore_comments */ true);

+ std::vector<std::string> prompts;
for (auto &prompt : prompt_json) {
std::string text = prompt.get<std::string>();
printf("Prompt[%d]: %s\n", total_num_requests, text.c_str());
total_num_requests++;
- tree_model.generate(text, 128 /*max_sequence_length*/);
+ prompts.push_back(text);
+ // tree_model.generate(text, 128 /*max_sequence_length*/);
}
+ tree_model.generate(prompts, 128 /*max_sequence_length*/);
}

// Execution fence
src/c/flexflow_c.cc (3 additions, 1 deletion)
@@ -1529,8 +1529,10 @@ flexflow_generation_result_t
int max_seq_length,
int *output_length_and_tokens) {
FFModel *handle = FFCObjectWrapper::unwrap(handle_);
+ std::vector<std::string> prompts;
std::string const text_str(input_text);
- GenerationResult result = handle->generate(text_str, max_seq_length);
+ prompts.push_back(input_text);
+ GenerationResult result = handle->generate(prompts, max_seq_length);
DEBUG_PRINT("[Model] generate %p %s %i", handle, text_str, max_seq_length);
assert(result.output_tokens.size() <= max_seq_length);
output_length_and_tokens[0] = result.output_tokens.size();
src/mapper/mapper.cc (1 addition, 1 deletion)
@@ -284,8 +284,8 @@ void FFMapper::select_task_options(const MapperContext ctx,
return;
}
if ((task.task_id == RM_PREPARE_NEXT_BATCH_TASK_ID) ||
- (task.task_id == RM_PREPARE_NEXT_BATCH_BEAM_TASK_ID) ||
(task.task_id == RM_PREPARE_NEXT_BATCH_INIT_TASK_ID) ||
+ (task.task_id == RM_PREPARE_NEXT_BATCH_BEAM_TASK_ID) ||
(task.task_id == RM_PREPARE_NEXT_BATCH_VERIFY_TASK_ID)) {
output.initial_proc = all_cpus[0];
return;
src/ops/argmax.cpp (1 addition, 1 deletion)
@@ -393,7 +393,7 @@ void ArgMax::forward_kernel(ArgMaxMeta const *m,

if (m->beam_search) {
// set all parents id zero in arg top1 case.
- checkCUDA(hipMemset(parent, 0, batch_size * sizeof(int)));
+ checkCUDA(hipMemsetAsync(parent, 0, batch_size * sizeof(int), stream));
}
int num_shards = 0;
int k = 1;
src/ops/argmax.cu (2 additions, 2 deletions)
@@ -59,7 +59,7 @@ void ArgMax::forward_kernel(ArgMaxMeta const *m,
DT alpha = 1.0f, beta = 0.0f;
if (m->beam_search) {
// set all parents id zero in arg top1 case.
- checkCUDA(cudaMemset(parent, 0, batch_size * sizeof(int)));
+ checkCUDA(cudaMemsetAsync(parent, 0, batch_size * sizeof(int), stream));
}
size_t temp_storage_bytes = m->temp_storage_bytes;
// use cub
@@ -83,6 +83,7 @@
prob_ptr,
batch_size,
m->beam_search);
+ // print_tensor<int>(indices_ptr, 32, "argmax op");
}

/*static*/
@@ -93,7 +94,6 @@ void ArgMax::forward_kernel_wrapper(ArgMaxMeta const *m,
int batch_size) {
cudaStream_t stream;
checkCUDA(get_legion_stream(&stream));

cudaEvent_t t_start, t_end;
if (m->profiling) {
cudaEventCreate(&t_start);
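Context for the hipMemset / cudaMemset changes above: switching to the Async variant enqueues the memset on the captured Legion stream, keeping it ordered with the kernels that later read parent while avoiding the implicit device-wide synchronization that default-stream calls can trigger. A standalone sketch of the pattern (generic CUDA, not FlexFlow code; all names are placeholders):

#include <cstdio>
#include <cuda_runtime.h>

// Writes out[i] = 1 for every slot whose parent id still holds the zeroed default.
__global__ void count_default_parents(int const *parent, int *out, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    out[i] = (parent[i] == 0) ? 1 : 0;
  }
}

int main() {
  int const batch_size = 4;
  int *parent = nullptr, *out = nullptr;
  cudaStream_t stream;
  cudaStreamCreate(&stream);
  cudaMalloc(&parent, batch_size * sizeof(int));
  cudaMalloc(&out, batch_size * sizeof(int));
  // Enqueued on `stream`: the kernel below is guaranteed to observe the zeroed
  // buffer without any device-wide synchronization.
  cudaMemsetAsync(parent, 0, batch_size * sizeof(int), stream);
  count_default_parents<<<1, 32, 0, stream>>>(parent, out, batch_size);
  int host_out[4];
  cudaMemcpyAsync(host_out, out, batch_size * sizeof(int), cudaMemcpyDeviceToHost, stream);
  cudaStreamSynchronize(stream);
  printf("%d %d %d %d\n", host_out[0], host_out[1], host_out[2], host_out[3]);
  cudaFree(parent);
  cudaFree(out);
  cudaStreamDestroy(stream);
  return 0;
}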
src/ops/kernels/softmax.cpp (4 additions, 1 deletion)
@@ -29,6 +29,9 @@ SoftmaxMeta::SoftmaxMeta(FFHandler handler,
checkCUDNN(miopenCreateTensorDescriptor(&inputTensor));
checkCUDNN(
cudnnSetTensorDescriptorFromDomain4SoftMax(inputTensor, input_domain));
+ checkCUDNN(miopenCreateTensorDescriptor(&outputTensor));
+ checkCUDNN(
+ cudnnSetTensorDescriptorFromDomain4SoftMax(outputTensor, input_domain));
dim = softmax->dim;
profiling = softmax->profiling;
std::strcpy(op_name, softmax->name);
@@ -127,7 +130,7 @@ void forward_kernel(SoftmaxMeta const *m,
m->inputTensor,
input_ptr,
&beta,
- m->inputTensor,
+ m->outputTensor,
output_ptr,
MIOPEN_SOFTMAX_ACCURATE,
MIOPEN_SOFTMAX_MODE_CHANNEL));
src/ops/kernels/softmax.cu (4 additions, 2 deletions)
@@ -28,6 +28,9 @@ SoftmaxMeta::SoftmaxMeta(FFHandler handler,
checkCUDNN(cudnnCreateTensorDescriptor(&inputTensor));
checkCUDNN(cudnnSetTensorDescriptorFromDomain4SoftMax(
inputTensor, input_domain, softmax->data_type));
+ checkCUDNN(cudnnCreateTensorDescriptor(&outputTensor));
+ checkCUDNN(cudnnSetTensorDescriptorFromDomain4SoftMax(
+ outputTensor, input_domain, softmax->data_type));
dim = softmax->dim;
profiling = softmax->profiling;
std::strcpy(op_name, softmax->name);
@@ -42,7 +45,6 @@
DT *output_ptr) {
cudaStream_t stream;
checkCUDA(get_legion_stream(&stream));

cudaEvent_t t_start, t_end;
if (m->profiling) {
cudaEventCreate(&t_start);
@@ -127,7 +129,7 @@ void forward_kernel(SoftmaxMeta const *m,
m->inputTensor,
input_ptr,
&beta,
- m->inputTensor,
+ m->outputTensor,
output_ptr));
}

src/ops/spec_inc_multihead_self_attention.cu (7 additions, 1 deletion)
@@ -251,6 +251,7 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m,
if (bc->request_completed[i]) {
continue;
}

for (int sub_req_id = 0; sub_req_id < bc->sub_requests[i]; sub_req_id++) {

// int num_new_tokens = bc->num_processing_tokens[i];
@@ -259,6 +260,11 @@
int num_new_tokens = bc->requestsInfo[i].num_tokens_in_batch;
int total_tokens = bc->requestsInfo[i].token_start_offset +
bc->requestsInfo[i].num_tokens_in_batch;

+ if (num_new_tokens <= 0) {
+ continue;
+ }

// Compute (QK^T/sqrt(d_k))
int m_ = num_new_tokens;
int n = total_tokens;
@@ -543,7 +549,7 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m,
output_ptr, bias_ptr, num_tokens, qkv_weight_size, m->oProjSize);
}

- assert(tokens_previous_requests == num_tokens);
+ // assert(tokens_previous_requests == num_tokens);
}

template <typename DT>