peft alignment #1268

Closed
wants to merge 11 commits into from
11 changes: 8 additions & 3 deletions src/ops/add_bias_residual_layer_norm.cc
@@ -618,12 +618,15 @@ void AddBiasResidualLayerNorm::inference_task(

assert(task->regions.size() == regions.size());
BatchConfig const *bc = BatchConfig::from_future(task->futures[0]);
if (bc->num_tokens == 0) {
return;
}


AddBiasResidualLayerNormMeta *m =
*((AddBiasResidualLayerNormMeta **)task->local_args);
std::string op_name_without_uid = AddBiasResidualLayerNorm::get_op_name_without_uid(m);
std::cout << "INF " << op_name_without_uid << std::endl;
if (bc->num_tokens == 0) {
return;
}

assert(regions.size() ==
5 + (m->elementwise_affine ? (m->use_bias ? 2 : 1) : 0));
@@ -1003,6 +1006,8 @@ void AddBiasResidualLayerNorm::peft_bwd_task(
ctx,
runtime);
}
std::string op_name_without_uid = AddBiasResidualLayerNorm::get_op_name_without_uid(m);
std::cout << "BWD " << op_name_without_uid << " reset_in_grad[0]: " << m->reset_input_grads[0] << " reset_in_grad[1]: " << m->reset_input_grads[1] << std::endl;
AddBiasResidualLayerNorm::peft_bwd_kernel_wrapper(
m, output_grad, input_grad, residual_grad, gamma);

5 changes: 0 additions & 5 deletions src/ops/argmax.cc
@@ -392,11 +392,6 @@ InferenceResult
GenericTensorAccessorW parent;
int batch_size = bc->num_active_infr_tokens();
ArgMax::forward_kernel_wrapper(m, input, indices, parent, batch_size);
// Note that we free activation allocator here since argmax is the
// last operator in forward
if (m->handle.peft_activation_allocator != nullptr) {
m->handle.peft_activation_allocator->free_all();
}
InferenceResult ir;
if (m->inference_debugging) {
assert(task->index_point.get_dim() == 1);
5 changes: 5 additions & 0 deletions src/ops/fused.cc
@@ -487,6 +487,11 @@ FutureMap FusedOp::inference(FFModel const &ff,
// so we transfer the maximum of them
// size_t batch_config_size =
// std::max(sizeof(TreeVerifyBatchConfig), sizeof(BeamSearchBatchConfig));
printf("FUSED! INFERENCE! %i ops\n", numOperators);
for (int i=0; i<numOperators; i++) {
Op *oppp = operators[i];
std::cout << oppp->op_type << " " << oppp->name << std::endl;
}
IndexLauncher launcher(FUSEDOP_INF_TASK_ID,
parallel_is,
TaskArgument(nullptr, 0),
52 changes: 49 additions & 3 deletions src/ops/inc_multihead_self_attention.cc
@@ -818,12 +818,16 @@ void IncMultiHeadSelfAttention::inference_task(
log_inc_mha.debug("BatchConfig, num_tokens: %d, num_requests: %d",
bc->num_tokens,
bc->num_active_requests());
if (bc->num_tokens == 0) {
return;
}


IncMultiHeadSelfAttentionMeta *m =
*((IncMultiHeadSelfAttentionMeta **)task->local_args);
std::string op_name_without_uid = IncMultiHeadSelfAttention::get_op_name_without_uid(m);
std::cout << "INF " << op_name_without_uid << std::endl;

if (bc->num_tokens == 0) {
return;
}

assert(((*m->qkv_bias || *m->final_bias) ? regions.size() == 4
: regions.size() == 3));
@@ -876,6 +880,37 @@ void IncMultiHeadSelfAttention::inference_task(
}
}

template <typename DT>
void load_tensor_from_file(DT *ptr, size_t size, std::string filepath) {
std::ifstream in(filepath, std::ios::in | std::ios::binary);
if (!in.good()) {
std::cout << "Could not open file: " << filepath << std::endl;
}
assert(in.good() && "incorrect weight file path");
std::vector<DT> host_array(size);
size_t loaded_data_size = sizeof(DT) * size;
in.seekg(0, in.end);
in.seekg(0, in.beg);
in.read((char *)host_array.data(), loaded_data_size);

size_t in_get_size = in.gcount();
if (in_get_size != loaded_data_size) {
std::cout << "load weight data error " << in_get_size << ", "
<< loaded_data_size << ", " << sizeof(DT) << std::endl;
assert(false);
}
assert(size == host_array.size());

copy_tensor_host_to_dev(ptr, host_array.data(), size);

// // normal
// long data_index = 0;
// for (auto v : host_array) {
// ptr[data_index++] = v;
// }
in.close();
}
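// Usage sketch for the helper above (illustrative only; the tensor, size, and
// path below are assumptions, not part of this PR): copy a reference tensor
// saved on disk into the device buffer backing an accessor.
//   float *dst = output_grad.get_float_ptr();
//   size_t num_elements = output_grad.domain.get_volume();
//   load_tensor_from_file(dst, num_elements, "/path/to/reference_tensor.flexflow");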

FutureMap IncMultiHeadSelfAttention::peft_bwd(
FFModel const &ff,
BatchConfigFuture const &bc,
@@ -992,6 +1027,17 @@ void IncMultiHeadSelfAttention::peft_bwd_task(

assert(task->index_point.get_dim() == 1);

std::string op_name_without_uid = IncMultiHeadSelfAttention::get_op_name_without_uid(m);
std::cout << "BWD " << op_name_without_uid << std::endl;

if (op_name_without_uid == "layers_11_attention") {
load_tensor_from_file(
output_grad.get_float_ptr(),
(output_grad.domain.get_volume()/128)*24,
"/usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.11.self_attn.o_proj.go_0.flexflow"
);
}
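// Note on the block above: it is an alignment check, not production logic. The
// incoming output gradient of layers_11_attention is overwritten with a tensor
// dumped from the corresponding HuggingFace PEFT backward step; the hard-coded
// path and the size expression are specific to the author's local setup.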

IncMultiHeadSelfAttention::peft_bwd_kernel_wrapper(
m,
bc,
115 changes: 114 additions & 1 deletion src/ops/inc_multihead_self_attention.cu
@@ -641,6 +641,8 @@ void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m,
m->hidden_size);
}
if (*m->apply_rotary_embedding) {
printf("ROTARY EMBEDDING: num_tokens: %i, q_array_size: %i, m->hidden_size: %i\n",
num_tokens, q_array_size, m->hidden_size);
/*q&k*/
parallelism = num_tokens * m->hidden_size;
apply_rotary_embedding_hf<<<GET_BLOCKS(parallelism),
@@ -934,6 +936,26 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m,
// compute_type = CUBLAS_COMPUTE_32F_FAST_16F;
// }
// #endif
std::string op_name_without_uid = std::string(m->op_name);
size_t last_underscore = op_name_without_uid.length() - 1;
for (int i = op_name_without_uid.length() - 1; i > 0; i--) {
if (!(std::isdigit(m->op_name[i]) || m->op_name[i] == '_')) {
break;
} else if (m->op_name[i] == '_') {
last_underscore = i;
}
}
op_name_without_uid.erase(last_underscore);

std::string base_filepath =
"./inference_tensors/model_" + std::to_string(m->layer_guid.model_id) +
"_bwd-step_" + std::to_string(m->bwd_step) +
"_layer-num_" + std::to_string(m->layer_guid.transformer_layer_id) +
"_layer-name_" + op_name_without_uid + "_shard-id_" +
std::to_string(shard_id);
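// Illustrative example of the resulting dump prefix (values are assumptions):
// an op named "layers_11_attention_100" is truncated to "layers_11_attention",
// giving something like
//   ./inference_tensors/model_0_bwd-step_0_layer-num_11_layer-name_layers_11_attention_shard-id_0
// and each debugging dump below appends its own suffix, e.g. "_o_proj_in_grad".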



for (int i = 0; i < bc->max_requests_per_batch(); i++) {
if (bc->request_completed[i]) {
continue;
@@ -995,6 +1017,12 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m,
ldc,
compute_type,
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
if (m->inference_debugging) {
// save result to file for checking
std::string filename = base_filepath + "_o_proj_in_grad";
std::cout << "FILENAME: " << filename << std::endl;
save_tensor(C, m_*n_, filename.c_str());
}
}
// Step 2: compute gradients w.r.t. value
{
@@ -1046,6 +1074,15 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m,
m->num_q_heads,
compute_type,
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
// save result to file for checking
if (m->inference_debugging) {
std::string filename = base_filepath + "_v_proj_in_grad";
std::cout << "FILENAME: " << filename << std::endl;
save_tensor(C, m_*n_*m->num_q_heads, filename.c_str());
std::string filename2 = base_filepath + "_qk_prods_softmax";
std::cout << "FILENAME: " << filename2 << std::endl;
save_tensor(A, m_*k_*m->num_q_heads, filename2.c_str());
}
}
// Step 3: compute gradients w.r.t. the qk_prods_softmax tensor
{
@@ -1094,6 +1131,14 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m,
m->num_q_heads,
compute_type,
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
if (m->inference_debugging) {
std::string filename4 = base_filepath + "_qk_prods_softmax_grad";
std::cout << "FILENAME: " << filename4 << std::endl;
save_tensor(C, num_tokens * num_tokens * m->num_q_heads, filename4.c_str());
std::string filename5 = base_filepath + "_vcache";
std::cout << "FILENAME: " << filename5 << std::endl;
save_tensor(B, m->vProjSize * m->num_q_heads * num_tokens, filename5.c_str());
}
}
// Step 4: softmax backpropagation
{
@@ -1120,6 +1165,14 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m,
&beta,
m->qk_tensor,
m->qk_prods));

if (m->inference_debugging) {
DT *C = static_cast<DT *>(m->qk_prods);
std::string filename6 = base_filepath + "_qk_prods_softmax_grad_in";
std::cout << "FILENAME: " << filename6 << std::endl;
save_tensor(C, num_tokens * num_tokens * m->num_q_heads, filename6.c_str());
}

// TODO: fill all elements above diagonal to force causal attention
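// Count of strictly-upper-triangular entries in an n x n score matrix:
// 0 + 1 + ... + (n - 1) = n * (n - 1) / 2, e.g. num_tokens = 4 zeroes 6
// entries per head.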
size_t entries_above_diagonal = num_tokens * (num_tokens - 1) / 2;
if (entries_above_diagonal > 0) {
@@ -1135,6 +1188,12 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m,
entries_above_diagonal,
DT(0.0f));
}
if (m->inference_debugging) {
DT *C = static_cast<DT *>(m->qk_prods);
std::string filename7 = base_filepath + "_qk_prods_softmax_grad_in_masked";
std::cout << "FILENAME: " << filename7 << std::endl;
save_tensor(C, num_tokens * num_tokens * m->num_q_heads, filename7.c_str());
}
}
// Step 5: compute gradients w.r.t. key
{
@@ -1189,6 +1248,14 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m,
m->num_q_heads,
compute_type,
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
if (m->inference_debugging) {
std::string filename8 = base_filepath + "_query_activation";
std::cout << "FILENAME: " << filename8 << std::endl;
save_tensor(B, m->qProjSize * m->num_q_heads *num_tokens, filename8.c_str());
std::string filename9 = base_filepath + "_devkproj_pre";
std::cout << "FILENAME: " << filename9 << std::endl;
save_tensor(C, num_tokens * (m->qProjSize * m->num_q_heads), filename9.c_str());
}
}
// Step 6: compute gradients w.r.t query
{
@@ -1208,7 +1275,7 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m,
// after transposition & striding
int m_ = num_tokens; // num_new_tokens
int n_ = m->qProjSize;
int k_ = num_tokens;
int k_ = num_tokens;
// before transposition and striding
int lda = num_tokens; // num_new_tokens
int ldb = m->qProjSize * m->num_q_heads;
@@ -1239,6 +1306,47 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m,
m->num_q_heads,
compute_type,
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
if (m->inference_debugging) {
std::string filename3 = base_filepath + "_devQKVPRojArray_pre";
std::cout << "FILENAME: " << filename3 << std::endl;
save_tensor(C, num_tokens * m->qProjSize * m->num_q_heads * 3, filename3.c_str());
}
}

// Compute rotary embeddings bwd
{
if (*m->apply_rotary_embedding) {
assert(m->hidden_size == m->qProjSize * m->num_q_heads);
assert(m->qProjSize == m->kProjSize);
printf("ROTARY EMBEDDING bwd: num_tokens: %i, m->hidden_size: %i\n", num_tokens, m->hidden_size);
/*q&k*/
int parallelism = num_tokens * m->hidden_size;
DT *A = static_cast<DT *>(m->devQKVProjArray);
apply_rotary_embedding_bwd<<<GET_BLOCKS(parallelism),
min(CUDA_NUM_THREADS, parallelism),
0,
stream>>>(A,
m->complex_input,
m->token_infos,
m->qProjSize,
num_tokens,
m->hidden_size);
DT *C = static_cast<DT *>(m->devQKVProjArray);
if (m->inference_debugging) {
std::string filename3 = base_filepath + "_devQKVPRojArray";
std::cout << "FILENAME: " << filename3 << std::endl;
save_tensor(C, num_tokens * m->qProjSize * m->num_q_heads * 3, filename3.c_str());
}
}

// matrix C: gradients for key (saved as part of m->devQKVProjArray)
// matrix C's layout: [num_tokens, qProjsize * num_heads, 3]
DT *C = static_cast<DT *>(m->devQKVProjArray) + num_tokens * (m->qProjSize * m->num_q_heads); // skip over regions reserved for Q gradients
if (m->inference_debugging) {
std::string filename9 = base_filepath + "_devkproj";
std::cout << "FILENAME: " << filename9 << std::endl;
save_tensor(C, num_tokens * (m->qProjSize * m->num_q_heads), filename9.c_str());
}
}
// Step 7: perform rotary position embeddings (RoPE) bwd
{
@@ -1300,6 +1408,11 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m,
ldc,
compute_type,
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
if (m->inference_debugging) {
std::string filename12 = base_filepath + "_attn_final_grad_in";
std::cout << "FILENAME: " << filename12 << std::endl;
save_tensor(C, num_tokens * m->qSize, filename12.c_str());
}
}
}
}
25 changes: 13 additions & 12 deletions src/ops/kernels/softmax.cu
@@ -290,10 +290,11 @@ __global__ void sparse_categorical_crossentropy_loss_peft_backward(
int num_tokens,
int num_classes) {
CUDA_KERNEL_LOOP(i, num_tokens * num_classes) {
input_grad[i] = output_grad[i];
if (i % num_classes == token_ids[i / num_classes]) {
input_grad[i] -= 1.0f;
}
input_grad[i] = 0.5;
// input_grad[i] = output_grad[i];
// if (i % num_classes == token_ids[i / num_classes]) {
// input_grad[i] -= 1.0f;
// }
}
}
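// Reference for the lines commented out above (assuming output_grad holds the
// forward-pass softmax probabilities p): the softmax + cross-entropy gradient
// is dL/dz_i = p_i - 1[i == y], i.e. copy the probabilities and subtract 1 at
// the target class. The constant 0.5 written instead is a temporary value used
// only to check gradient propagation during PEFT alignment debugging.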

@@ -345,14 +346,14 @@ void peft_bwd_kernel(SoftmaxMeta const *m,
num_bwd_tokens,
num_classes);
// scale
scale_kernel<<<GET_BLOCKS(num_bwd_tokens * num_classes),
CUDA_NUM_THREADS,
0,
stream>>>(input_grad_ptr +
tokens_previous_requests * num_classes,
num_bwd_tokens * num_classes,
DT(0.0),
scale_factor);
// scale_kernel<<<GET_BLOCKS(num_bwd_tokens * num_classes),
// CUDA_NUM_THREADS,
// 0,
// stream>>>(input_grad_ptr +
// tokens_previous_requests * num_classes,
// num_bwd_tokens * num_classes,
// DT(0.0),
// scale_factor);

tokens_previous_requests += num_bwd_tokens;
}
5 changes: 5 additions & 0 deletions src/ops/linear.cc
@@ -621,6 +621,8 @@ void Linear::inference_task(Task const *task,
ctx, task->regions[0].region.get_index_space());
LinearMeta *m = *((LinearMeta **)task->local_args);
BatchConfig const *bc = BatchConfig::from_future(task->futures[0]);
std::string op_name_without_uid = Linear::get_op_name_without_uid(m);
printf("INF %s\n", op_name_without_uid.c_str());
if (bc->num_tokens == 0) {
return;
}
@@ -757,6 +759,9 @@ void Linear::peft_bwd_task(Task const *task,
int in_dim = input_grad.domain.hi()[0] - input_grad.domain.lo()[0] + 1;
int out_dim = output_grad.domain.hi()[0] - output_grad.domain.lo()[0] + 1;

std::string op_name_without_uid = Linear::get_op_name_without_uid(m);
std::cout << "BWD " << op_name_without_uid << std::endl;

int num_infr_tokens = bc->num_active_infr_tokens();
int num_peft_tokens = bc->num_active_peft_tokens();
if (m->inference_debugging) {
2 changes: 2 additions & 0 deletions src/ops/lora_linear.cc
@@ -449,6 +449,8 @@ void LoraLinear::inference_task(Task const *task,
Context ctx,
Runtime *runtime) {
LoraLinearMeta *m = *((LoraLinearMeta **)task->local_args);
std::string op_name_without_uid = LoraLinear::get_op_name_without_uid(m);
std::cout << "INF " << op_name_without_uid << std::endl;
BatchConfig const *bc = BatchConfig::from_future(task->futures[0]);
if (bc->num_active_tokens() == 0) {
return;