style: format code

chenzhuofu committed Sep 5, 2024
1 parent 30d17a2 commit 61177ee
Showing 9 changed files with 32 additions and 31 deletions.

@@ -62,10 +62,11 @@ void compute_qkv(IncMultiHeadSelfAttentionMeta const *m,
 // For other Key tokens like in streaming cache, we need another kernel to apply
 // the position embedding.
 template <typename DT>
-void apply_pos_encoding_to_tokens_in_batch(IncMultiHeadSelfAttentionMeta const *m,
-                                           BatchConfig const *bc,
-                                           DT *output_ptr,
-                                           cudaStream_t stream);
+void apply_pos_encoding_to_tokens_in_batch(
+    IncMultiHeadSelfAttentionMeta const *m,
+    BatchConfig const *bc,
+    DT *output_ptr,
+    cudaStream_t stream);
 
 // [For the tokens in streaming cache]
 // Apply position embedding for k projection in the streaming cache.
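The kernels declared here apply the position embedding to tokens in the batch; the llama.cc hunk below enables it through the apply_rotary_embedding flag. As a point of reference, a minimal host-side sketch of what a rotary position embedding (RoPE) does to one query or key vector — the interleaved (2i, 2i+1) pairing and base 10000 are conventional assumptions for illustration, not values read from this repository's kernels, which rotate the packed device-side QKV buffer:

// Illustrative sketch only: rotary position embedding on one vector.
// Pairing convention and base 10000 are assumptions, not from this repo.
#include <cmath>
#include <vector>

void apply_rope(std::vector<float> &vec, int position, int head_dim) {
  for (int i = 0; i < head_dim / 2; i++) {
    // Rotate each (even, odd) component pair by a position-dependent angle.
    float theta = position * std::pow(10000.0f, -2.0f * i / head_dim);
    float c = std::cos(theta);
    float s = std::sin(theta);
    float x0 = vec[2 * i];
    float x1 = vec[2 * i + 1];
    vec[2 * i] = x0 * c - x1 * s;
    vec[2 * i + 1] = x0 * s + x1 * c;
  }
}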
2 changes: 1 addition & 1 deletion inference/models/falcon.cc
@@ -167,7 +167,7 @@ void FALCON::create_falcon_model(FFModel &ff,
         1.0f,  /*scaling factor*/
         true,  /*qk_prod_scaling*/
         false, /*position_bias*/
-        false, /*streaming_cache*/
+        false, /*streaming_cache*/
         std::string("layers_" + std::to_string(i) + "_attention")
             .c_str() /*name*/
     );
24 changes: 12 additions & 12 deletions inference/models/llama.cc
@@ -151,18 +151,18 @@ void LLAMA::create_llama_model(FFModel &ff,
         llama_config.num_key_value_heads,
         llama_config.hidden_size / llama_config.num_attention_heads,
         llama_config.hidden_size / llama_config.num_attention_heads,
-        0.0f,            /*dropout*/
-        false,           /*qkv_bias*/
-        false,           /*final_bias*/
-        false,           /*add_zero_attn*/
-        DT_NONE,         /*data_type*/
-        nullptr,         /*kernel_initializer*/
-        true,            /*apply_rotary_embedding*/
-        false,           /*scaling query*/
-        1.0f,            /*scaling factor*/
-        true,            /*qk_prod_scaling*/
-        false,           /*position_bias*/
-        streaming_cache, /*streaming_cache*/
+        0.0f,            /*dropout*/
+        false,           /*qkv_bias*/
+        false,           /*final_bias*/
+        false,           /*add_zero_attn*/
+        DT_NONE,         /*data_type*/
+        nullptr,         /*kernel_initializer*/
+        true,            /*apply_rotary_embedding*/
+        false,           /*scaling query*/
+        1.0f,            /*scaling factor*/
+        true,            /*qk_prod_scaling*/
+        false,           /*position_bias*/
+        streaming_cache, /*streaming_cache*/
         std::string("layers_" + std::to_string(i) + "_attention")
             .c_str() /*name*/
     );
2 changes: 1 addition & 1 deletion inference/models/starcoder.cc
@@ -124,7 +124,7 @@ void STARCODER::create_starcoder_model(
         1.0f,  /*scaling factor*/
         true,  /*qk_prod_scaling*/
         false, /*position_bias*/
-        false, /*streaming_cache*/
+        false, /*streaming_cache*/
         std::string("layers_" + std::to_string(i) + "_attention")
             .c_str() /*name*/
     );
5 changes: 3 additions & 2 deletions src/ops/kernels/inc_multihead_self_attention_kernels.cu
@@ -689,8 +689,9 @@ __global__ void
   pre_pos_enc_buf[to_k_idx + offset] =
       static_cast<half>(qkv_proj_array[from_idx + q_hidden_size + kv_offset]);
-  pre_pos_enc_buf[to_v_idx + offset] = static_cast<half>(
-      qkv_proj_array[from_idx + q_hidden_size + temp_kv_hidden_size + kv_offset]);
+  pre_pos_enc_buf[to_v_idx + offset] =
+      static_cast<half>(qkv_proj_array[from_idx + q_hidden_size +
+                                       temp_kv_hidden_size + kv_offset]);
 }
 
 template <typename DT>
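The index arithmetic in this kernel reads from a per-token packed QKV buffer: q_hidden_size query values first, then a key block, then a value block of the same width. A toy sketch of those offsets, with invented sizes (the real widths come from the attention metadata in m, not these numbers):

// Toy illustration of the packed QKV offsets used above; the sizes are
// invented for the example, not taken from IncMultiHeadSelfAttentionMeta.
#include <cstdio>

int main() {
  int const q_hidden_size = 8;       // width of the query block (assumed)
  int const temp_kv_hidden_size = 4; // width of each K/V block (assumed)
  int const per_token = q_hidden_size + 2 * temp_kv_hidden_size; // Q | K | V
  int const token = 3;
  int const from_idx = token * per_token;
  // Matches the kernel: K starts after Q, V starts after Q and K.
  printf("Q at %d, K at %d, V at %d\n",
         from_idx,
         from_idx + q_hidden_size,
         from_idx + q_hidden_size + temp_kv_hidden_size);
  return 0;
}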
2 changes: 1 addition & 1 deletion src/ops/spec_inc_multihead_self_attention.cc
@@ -296,7 +296,7 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention(
       o_dim(_embed_dim), qoSeqLength(_input->dims[1].size),
       kvSeqLength(_input->dims[1].size), scaling_query(_scaling_query),
       scaling_factor(_scaling_factor), qk_prod_scaling(_qk_prod_scaling),
-      position_bias(_position_bias) , streaming_cache(_streaming_cache) {
+      position_bias(_position_bias), streaming_cache(_streaming_cache) {
   // overwrite layer_guid
   layer_guid = _layer_guid;
6 changes: 2 additions & 4 deletions src/ops/tree_inc_multihead_self_attention.cu
@@ -400,10 +400,8 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m,
       bias_ptr,
       stream);
-  apply_pos_encoding_to_tokens_in_batch(m,
-                                        bc,
-                                        static_cast<DT *>(m->devQKVProjArray),
-                                        stream);
+  apply_pos_encoding_to_tokens_in_batch(
+      m, bc, static_cast<DT *>(m->devQKVProjArray), stream);
   // cudaEventRecord(t_end, stream);
   // checkCUDA(cudaEventSynchronize(t_end));
10 changes: 5 additions & 5 deletions src/runtime/batch_config.cc
@@ -166,14 +166,14 @@ std::ostream &operator<<(std::ostream &os, BatchConfig const &bc) {
   for (int i = 0; i < bc.max_requests_per_batch(); i++) {
     if (bc.request_available[i]) {
       os << " Request " << i << ":\n";
-      os << " Sink cache size: "
-         << bc.streamingCacheInfo[i].sink_cache_size << std::endl;
+      os << " Sink cache size: " << bc.streamingCacheInfo[i].sink_cache_size
+         << std::endl;
       os << " Window cache size: "
-          << bc.streamingCacheInfo[i].window_cache_size << std::endl;
+         << bc.streamingCacheInfo[i].window_cache_size << std::endl;
       os << " Window back: " << bc.streamingCacheInfo[i].window_back
-          << std::endl;
+         << std::endl;
       os << " Commit len: " << bc.streamingCacheInfo[i].commit_len
-          << std::endl;
+         << std::endl;
     }
   }
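The operator<< above prints four per-request fields of the streaming KV cache, which looks like an attention-sink-plus-sliding-window design. A hypothetical reconstruction of that record, inferred only from the printed names — the actual definition lives elsewhere in the runtime and may differ:

// Hypothetical per-request streaming-cache record, inferred from the
// fields printed by operator<< above; not the repository's definition.
struct StreamingCacheInfo {
  int sink_cache_size;   // tokens pinned at the start of the sequence
  int window_cache_size; // capacity of the sliding window of recent tokens
  int window_back;       // current write position within the ring window
  int commit_len;        // tokens already committed into the cache
};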
3 changes: 2 additions & 1 deletion src/runtime/graph.cc
@@ -2809,7 +2809,8 @@ void FFModel::deserialize_graph_optimal_view(
       tensor_parallelism_degree;
   float dropout, scaling_factor;
   bool qkv_bias, final_bias, add_zero_attn, apply_rotary_embedding,
-      scaling_query, qk_prod_scaling, offload, streaming_cache, position_bias;
+      scaling_query, qk_prod_scaling, offload, streaming_cache,
+      position_bias;
   DataType quantization_type;
   size_t id, transformer_layer_id, deserialized_model_id;
   dez.deserialize(id);
