diff --git a/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h b/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h
index 1b6c49a3d6..8f69ad3805 100644
--- a/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h
+++ b/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h
@@ -62,10 +62,11 @@ void compute_qkv(IncMultiHeadSelfAttentionMeta const *m,
 // For other Key tokens like in streaming cache, we nned other kernel to apply
 // the position embedding.
 template <typename DT>
-void apply_pos_encoding_to_tokens_in_batch(IncMultiHeadSelfAttentionMeta const *m,
-                                           BatchConfig const *bc,
-                                           DT *output_ptr,
-                                           cudaStream_t stream);
+void apply_pos_encoding_to_tokens_in_batch(
+    IncMultiHeadSelfAttentionMeta const *m,
+    BatchConfig const *bc,
+    DT *output_ptr,
+    cudaStream_t stream);
 
 // [For the tokens in streaming cache]
 // Apply position embedding for k projection in the streaming cache.
diff --git a/inference/models/falcon.cc b/inference/models/falcon.cc
index 24c63ea0ea..96e85177cc 100644
--- a/inference/models/falcon.cc
+++ b/inference/models/falcon.cc
@@ -167,7 +167,7 @@ void FALCON::create_falcon_model(FFModel &ff,
           1.0f,    /*scaling factor*/
           true,    /*qk_prod_scaling*/
           false,   /*position_bias*/
-          false, /*streaming_cache*/
+          false,   /*streaming_cache*/
           std::string("layers_" + std::to_string(i) + "_attention")
               .c_str() /*name*/
       );
diff --git a/inference/models/llama.cc b/inference/models/llama.cc
index 64e54ae6b5..16dc2441ff 100644
--- a/inference/models/llama.cc
+++ b/inference/models/llama.cc
@@ -151,18 +151,18 @@ void LLAMA::create_llama_model(FFModel &ff,
           llama_config.num_key_value_heads,
           llama_config.hidden_size / llama_config.num_attention_heads,
           llama_config.hidden_size / llama_config.num_attention_heads,
-          0.0f, /*dropout*/
-          false, /*qkv_bias*/
-          false, /*final_bias*/
-          false, /*add_zero_attn*/
-          DT_NONE, /*data_type*/
-          nullptr, /*kernel_initializer*/
-          true, /*apply_rotary_embedding*/
-          false, /*scaling query*/
-          1.0f, /*scaling factor*/
-          true, /*qk_prod_scaling*/
-          false, /*position_bias*/
-          streaming_cache, /*streaming_cache*/
+          0.0f,            /*dropout*/
+          false,           /*qkv_bias*/
+          false,           /*final_bias*/
+          false,           /*add_zero_attn*/
+          DT_NONE,         /*data_type*/
+          nullptr,         /*kernel_initializer*/
+          true,            /*apply_rotary_embedding*/
+          false,           /*scaling query*/
+          1.0f,            /*scaling factor*/
+          true,            /*qk_prod_scaling*/
+          false,           /*position_bias*/
+          streaming_cache, /*streaming_cache*/
           std::string("layers_" + std::to_string(i) + "_attention")
               .c_str() /*name*/
       );
diff --git a/inference/models/starcoder.cc b/inference/models/starcoder.cc
index 55faec3a77..f531fe9884 100644
--- a/inference/models/starcoder.cc
+++ b/inference/models/starcoder.cc
@@ -124,7 +124,7 @@ void STARCODER::create_starcoder_model(
           1.0f,    /*scaling factor*/
           true,    /*qk_prod_scaling*/
           false,   /*position_bias*/
-          false, /*streaming_cache*/
+          false,   /*streaming_cache*/
           std::string("layers_" + std::to_string(i) + "_attention")
               .c_str() /*name*/
       );
diff --git a/src/ops/kernels/inc_multihead_self_attention_kernels.cu b/src/ops/kernels/inc_multihead_self_attention_kernels.cu
index d260dc50a2..e65f2c0609 100644
--- a/src/ops/kernels/inc_multihead_self_attention_kernels.cu
+++ b/src/ops/kernels/inc_multihead_self_attention_kernels.cu
@@ -689,8 +689,9 @@ __global__ void
 
   pre_pos_enc_buf[to_k_idx + offset] =
       static_cast<DT>(qkv_proj_array[from_idx + q_hidden_size + kv_offset]);
-  pre_pos_enc_buf[to_v_idx + offset] = static_cast<DT>(
-      qkv_proj_array[from_idx + q_hidden_size + temp_kv_hidden_size + kv_offset]);
+  pre_pos_enc_buf[to_v_idx + offset] =
+      static_cast<DT>(qkv_proj_array[from_idx + q_hidden_size +
+                                     temp_kv_hidden_size + kv_offset]);
 }
 
 template <typename DT>
diff --git a/src/ops/spec_inc_multihead_self_attention.cc b/src/ops/spec_inc_multihead_self_attention.cc
index 5817bd1c40..cfcf783e93 100644
--- a/src/ops/spec_inc_multihead_self_attention.cc
+++ b/src/ops/spec_inc_multihead_self_attention.cc
@@ -296,7 +296,7 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention(
       o_dim(_embed_dim), qoSeqLength(_input->dims[1].size),
       kvSeqLength(_input->dims[1].size), scaling_query(_scaling_query),
       scaling_factor(_scaling_factor), qk_prod_scaling(_qk_prod_scaling),
-      position_bias(_position_bias) , streaming_cache(_streaming_cache) {
+      position_bias(_position_bias), streaming_cache(_streaming_cache) {
   // overwrite layer_guid
   layer_guid = _layer_guid;
 
diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu
index 5898f558af..8c384c1b05 100644
--- a/src/ops/tree_inc_multihead_self_attention.cu
+++ b/src/ops/tree_inc_multihead_self_attention.cu
@@ -400,10 +400,8 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m,
                                         bias_ptr,
                                         stream);
 
-  apply_pos_encoding_to_tokens_in_batch(m,
-                                        bc,
-                                        static_cast<DT *>(m->devQKVProjArray),
-                                        stream);
+  apply_pos_encoding_to_tokens_in_batch(
+      m, bc, static_cast<DT *>(m->devQKVProjArray), stream);
 
   // cudaEventRecord(t_end, stream);
   // checkCUDA(cudaEventSynchronize(t_end));
diff --git a/src/runtime/batch_config.cc b/src/runtime/batch_config.cc
index 89c642d485..308f468f53 100644
--- a/src/runtime/batch_config.cc
+++ b/src/runtime/batch_config.cc
@@ -166,14 +166,14 @@ std::ostream &operator<<(std::ostream &os, BatchConfig const &bc) {
   for (int i = 0; i < bc.max_requests_per_batch(); i++) {
     if (bc.request_available[i]) {
       os << "  Request " << i << ":\n";
-      os << "    Sink cache size: "
-          << bc.streamingCacheInfo[i].sink_cache_size << std::endl;
+      os << "    Sink cache size: " << bc.streamingCacheInfo[i].sink_cache_size
+         << std::endl;
       os << "    Window cache size: "
-          << bc.streamingCacheInfo[i].window_cache_size << std::endl;
+         << bc.streamingCacheInfo[i].window_cache_size << std::endl;
       os << "    Window back: " << bc.streamingCacheInfo[i].window_back
-          << std::endl;
+         << std::endl;
       os << "    Commit len: " << bc.streamingCacheInfo[i].commit_len
-          << std::endl;
+         << std::endl;
     }
   }
 
diff --git a/src/runtime/graph.cc b/src/runtime/graph.cc
index ab6421d58f..ca8e51d40f 100644
--- a/src/runtime/graph.cc
+++ b/src/runtime/graph.cc
@@ -2809,7 +2809,8 @@ void FFModel::deserialize_graph_optimal_view(
           tensor_parallelism_degree;
       float dropout, scaling_factor;
       bool qkv_bias, final_bias, add_zero_attn, apply_rotary_embedding,
-          scaling_query, qk_prod_scaling, offload, streaming_cache, position_bias;
+          scaling_query, qk_prod_scaling, offload, streaming_cache,
+          position_bias;
       DataType quantization_type;
       size_t id, transformer_layer_id, deserialized_model_id;
       dez.deserialize(id);