Commit 104ba3c: linting
goliaro authored and yingchen21 committed Sep 11, 2024
1 parent 73468c5 commit 104ba3c
Showing 17 changed files with 282 additions and 221 deletions.
15 changes: 8 additions & 7 deletions include/flexflow/ops/inc_multihead_self_attention.h
@@ -126,13 +126,14 @@ class IncMultiHeadSelfAttention : public Op {
int shard_id,
GenericTensorAccessorR const &input,
GenericTensorAccessorW const &output);
- static void peft_bwd_kernel_wrapper(IncMultiHeadSelfAttentionMeta *m,
- BatchConfig const *bc,
- int shard_id,
- GenericTensorAccessorW const &input_grad,
- // GenericTensorAccessorR const &weight,
- GenericTensorAccessorR const &output_grad);
- // GenericTensorAccessorR const &bias);
+ static void
+ peft_bwd_kernel_wrapper(IncMultiHeadSelfAttentionMeta *m,
+ BatchConfig const *bc,
+ int shard_id,
+ GenericTensorAccessorW const &input_grad,
+ // GenericTensorAccessorR const &weight,
+ GenericTensorAccessorR const &output_grad);
+ // GenericTensorAccessorR const &bias);
Params get_params() const;

public:
52 changes: 27 additions & 25 deletions inference/models/llama.cc
@@ -93,19 +93,20 @@ void LLAMA::create_llama_model(FFModel &ff,
}
att_norm->print("att_norm");
Tensor qkv_proj = ff.dense(
- att_norm,
- llama_config.hidden_size * 3, // q, k, v. need to change if want to remove replication. (q_heads + 2 * kv_heads) * proj_size
- AC_MODE_NONE,
- false, // seems like llama does not use bias
- DT_NONE, // what is this
- nullptr, // ?
- nullptr, // ?
- nullptr, // ?
- REG_MODE_NONE, // no regularization
- 0.0f, // no dropout
- std::string("layers." + std::to_string(i) + ".self_attn.qkv_proj")
- .c_str()
- );
+ att_norm,
+ llama_config.hidden_size *
+ 3, // q, k, v. need to change if want to remove replication.
+ // (q_heads + 2 * kv_heads) * proj_size
+ AC_MODE_NONE,
+ false, // seems like llama does not use bias
+ DT_NONE, // what is this
+ nullptr, // ?
+ nullptr, // ?
+ nullptr, // ?
+ REG_MODE_NONE, // no regularization
+ 0.0f, // no dropout
+ std::string("layers." + std::to_string(i) + ".self_attn.qkv_proj")
+ .c_str());
qkv_proj->print("qkv_proj");

Tensor mha;
@@ -189,18 +190,19 @@

Tensor mha_input = mha;
mha_input->print("mha_input");
- mha = ff.dense(mha_input,
- llama_config.hidden_size,
- AC_MODE_NONE,
- false,
- DT_NONE,
- nullptr,
- nullptr,
- nullptr,
- REG_MODE_NONE,
- 0.0f,
- std::string("layers." + std::to_string(i) + ".self_attn.o_proj")
- .c_str());
+ mha = ff.dense(
+ mha_input,
+ llama_config.hidden_size,
+ AC_MODE_NONE,
+ false,
+ DT_NONE,
+ nullptr,
+ nullptr,
+ nullptr,
+ REG_MODE_NONE,
+ 0.0f,
+ std::string("layers." + std::to_string(i) + ".self_attn.o_proj")
+ .c_str());
mha->print("mha");

// step 2: SILU activaion
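
The comment on the qkv_proj dense layer in the hunk above sizes the fused projection as hidden_size * 3 and notes that the intended size without K/V replication is (q_heads + 2 * kv_heads) * proj_size. A minimal, self-contained sketch of that sizing; the helper name and parameters are illustrative, not part of the FlexFlow API:

#include <cassert>

// Output width of a fused Q/K/V projection under grouped-query attention.
// With num_kv_heads == num_q_heads this reduces to hidden_size * 3, which is
// what the llama.cc code above currently requests.
int fused_qkv_out_dim(int hidden_size, int num_q_heads, int num_kv_heads) {
  assert(hidden_size % num_q_heads == 0);
  int proj_size = hidden_size / num_q_heads; // per-head projection size
  return (num_q_heads + 2 * num_kv_heads) * proj_size;
}

// Example: hidden_size = 4096 with 32 query heads and 8 KV heads gives
// proj_size = 128 and a fused width of (32 + 16) * 128 = 6144, versus
// 4096 * 3 = 12288 when K and V are replicated across all query heads.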
5 changes: 2 additions & 3 deletions src/ops/fused.cu
@@ -457,8 +457,7 @@ __host__ void
bc,
task->index_point.point_data[0],
my_input_accessor[0],
- my_output_accessor[0]
- );
+ my_output_accessor[0]);
break;
}
case OP_TREE_INC_MULTIHEAD_SELF_ATTENTION: {
@@ -1042,7 +1041,7 @@ __host__ void FusedOp::peft_bwd_task(Task const *task,
my_input_grad_accessor[0],
// my_weight_accessor[0],
my_output_grad_accessor[0]);
- // biases);
+ // biases);
break;
}
case OP_TREE_INC_MULTIHEAD_SELF_ATTENTION:
25 changes: 15 additions & 10 deletions src/ops/inc_multihead_self_attention.cc
@@ -394,8 +394,8 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention(
dims[i] = _input->dims[i];
}
dims[0].size = _embed_dim;
- // Currently require no parallelism along this dim, is this consistent with the
- // removal of the previous assert?
+ // Currently require no parallelism along this dim, is this consistent with
+ // the removal of the previous assert?
assert(dims[0].degree == 1);
if (allocate_weights) {
// Create weight tensor
@@ -600,10 +600,13 @@ OpMeta *IncMultiHeadSelfAttention::init_task(
attn->num_kv_heads / attn->tensor_parallelism_degree +
(attn->num_kv_heads % attn->tensor_parallelism_degree != 0);

- if(attn->oProjSize != output.domain.hi()[0] - output.domain.lo()[0] + 1) {
- printf("attn o_proj size %d does not match output domain %d\n", attn->oProjSize, output.domain.hi()[0] - output.domain.lo()[0] + 1);
+ if (attn->oProjSize != output.domain.hi()[0] - output.domain.lo()[0] + 1) {
+ printf("attn o_proj size %d does not match output domain %d\n",
+ attn->oProjSize,
+ output.domain.hi()[0] - output.domain.lo()[0] + 1);
}
- // assert(attn->oProjSize == output.domain.hi()[0] - output.domain.lo()[0] + 1);
+ // assert(attn->oProjSize == output.domain.hi()[0] - output.domain.lo()[0] +
+ // 1);

Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc);
MemoryAllocator gpu_mem_allocator(gpu_mem);
@@ -709,7 +712,7 @@ void IncMultiHeadSelfAttention::inference_task(

GenericTensorAccessorR input = helperGetGenericTensorAccessorRO(
m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime);
- GenericTensorAccessorW output = helperGetGenericTensorAccessorWO(
+ GenericTensorAccessorW output = helperGetGenericTensorAccessorWO(
m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime);

Domain input_domain = runtime->get_index_space_domain(
@@ -724,7 +727,7 @@
assert(task->index_point.get_dim() == 1);

IncMultiHeadSelfAttention::inference_kernel_wrapper(
- m, bc, task->index_point.point_data[0], input, output);
+ m, bc, task->index_point.point_data[0], input, output);

if (m->inference_debugging) {
assert(task->index_point.get_dim() == 1);
@@ -822,9 +825,11 @@ void IncMultiHeadSelfAttention::peft_bwd_task(
GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW(
m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime);
// GenericTensorAccessorR weight = helperGetGenericTensorAccessorRO(
- // m->weight_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime);
+ // m->weight_type[0], regions[1], task->regions[1], FID_DATA, ctx,
+ // runtime);
// GenericTensorAccessorW output_grad = helperGetGenericTensorAccessorRW(
- // m->output_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime);
+ // m->output_type[0], regions[2], task->regions[2], FID_DATA, ctx,
+ // runtime);
GenericTensorAccessorW output_grad = helperGetGenericTensorAccessorRW(
m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime);
GenericTensorAccessorR biases;
@@ -862,7 +867,7 @@
input_grad,
// weight,
output_grad);
- // biases);
+ // biases);

if (m->inference_debugging) {
assert(task->index_point.get_dim() == 1);
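
The init_task hunk above reports a mismatch between oProjSize and the output region's innermost extent with a printf, while the original assert stays commented out. A self-contained sketch of that extent check; DomainBounds and the helper names are stand-ins for the Legion Domain accessors, not the actual FlexFlow or Legion types:

#include <cstdio>

// Stand-in for the lo()/hi() pair of a Legion Domain; bounds are inclusive.
struct DomainBounds {
  long long lo[3];
  long long hi[3];
};

// Extent of dimension d: inclusive upper bound minus lower bound plus one,
// mirroring output.domain.hi()[0] - output.domain.lo()[0] + 1 in init_task.
long long extent(DomainBounds const &dom, int d) {
  return dom.hi[d] - dom.lo[d] + 1;
}

void check_o_proj_size(int o_proj_size, DomainBounds const &output_domain) {
  if (o_proj_size != extent(output_domain, 0)) {
    // With the assert commented out, a mismatch is only reported, not fatal.
    std::printf("attn o_proj size %d does not match output domain %lld\n",
                o_proj_size,
                extent(output_domain, 0));
  }
}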
2 changes: 1 addition & 1 deletion src/ops/inc_multihead_self_attention.cpp
@@ -938,7 +938,7 @@ void inference_kernel(IncMultiHeadSelfAttentionMeta *m,
compute_qkv_kernel(m,
bc,
shard_id,
- // input_ptr,
+ // input_ptr,
weight_ptr,
static_cast<DT *>(m->devQKVProjArray),
bias_ptr,
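
The inference_kernel hunk above passes a weight pointer to compute_qkv_kernel and writes the projected values into m->devQKVProjArray. As a rough CPU-only reference for what a fused QKV projection of this kind computes (row-major layout, bias omitted), assuming nothing about the actual FlexFlow CUDA/HIP kernel:

#include <cstddef>
#include <vector>

// One GEMM produces the Q, K and V projections for every token, laid out
// contiguously per token: qkv_out has shape [num_tokens, qkv_dim], where
// qkv_dim covers the q, k and v widths together.
void fused_qkv_projection(std::vector<float> const &input,  // [num_tokens, hidden]
                          std::vector<float> const &weight, // [hidden, qkv_dim]
                          std::vector<float> &qkv_out,      // [num_tokens, qkv_dim]
                          std::size_t num_tokens,
                          std::size_t hidden,
                          std::size_t qkv_dim) {
  qkv_out.assign(num_tokens * qkv_dim, 0.0f);
  for (std::size_t t = 0; t < num_tokens; ++t) {
    for (std::size_t h = 0; h < hidden; ++h) {
      float const x = input[t * hidden + h];
      for (std::size_t o = 0; o < qkv_dim; ++o) {
        qkv_out[t * qkv_dim + o] += x * weight[h * qkv_dim + o];
      }
    }
  }
}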
(The remaining 12 changed files in this commit are not shown here.)
