From 61177ee2568ff05005ba48ea10b80709c0b92934 Mon Sep 17 00:00:00 2001
From: fruitea
Date: Wed, 4 Sep 2024 22:06:14 -0700
Subject: [PATCH] style: format code

---
 .../inc_multihead_self_attention_kernels.h   |  9 +++----
 inference/models/falcon.cc                   |  2 +-
 inference/models/llama.cc                    | 24 +++++++++----------
 inference/models/starcoder.cc                |  2 +-
 .../inc_multihead_self_attention_kernels.cu  |  5 ++--
 src/ops/spec_inc_multihead_self_attention.cc |  2 +-
 src/ops/tree_inc_multihead_self_attention.cu |  6 ++---
 src/runtime/batch_config.cc                  | 10 ++++----
 src/runtime/graph.cc                         |  3 ++-
 9 files changed, 32 insertions(+), 31 deletions(-)

diff --git a/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h b/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h
index 1b6c49a3d6..8f69ad3805 100644
--- a/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h
+++ b/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h
@@ -62,10 +62,11 @@ void compute_qkv(IncMultiHeadSelfAttentionMeta const *m,
 // For other Key tokens like in streaming cache, we nned other kernel to apply
 // the position embedding.
 template <typename DT>
-void apply_pos_encoding_to_tokens_in_batch(IncMultiHeadSelfAttentionMeta const *m,
-                                           BatchConfig const *bc,
-                                           DT *output_ptr,
-                                           cudaStream_t stream);
+void apply_pos_encoding_to_tokens_in_batch(
+    IncMultiHeadSelfAttentionMeta const *m,
+    BatchConfig const *bc,
+    DT *output_ptr,
+    cudaStream_t stream);
 
 // [For the tokens in streaming cache]
 // Apply position embedding for k projection in the streaming cache.
diff --git a/inference/models/falcon.cc b/inference/models/falcon.cc
index 24c63ea0ea..96e85177cc 100644
--- a/inference/models/falcon.cc
+++ b/inference/models/falcon.cc
@@ -167,7 +167,7 @@ void FALCON::create_falcon_model(FFModel &ff,
           1.0f,  /*scaling factor*/
           true,  /*qk_prod_scaling*/
           false, /*position_bias*/
-          false,  /*streaming_cache*/
+          false, /*streaming_cache*/
           std::string("layers_" + std::to_string(i) + "_attention")
               .c_str() /*name*/
       );
diff --git a/inference/models/llama.cc b/inference/models/llama.cc
index 64e54ae6b5..16dc2441ff 100644
--- a/inference/models/llama.cc
+++ b/inference/models/llama.cc
@@ -151,18 +151,18 @@ void LLAMA::create_llama_model(FFModel &ff,
           llama_config.num_key_value_heads,
           llama_config.hidden_size / llama_config.num_attention_heads,
           llama_config.hidden_size / llama_config.num_attention_heads,
-          0.0f,    /*dropout*/
-          false,   /*qkv_bias*/
-          false,   /*final_bias*/
-          false,   /*add_zero_attn*/
-          DT_NONE, /*data_type*/
-          nullptr, /*kernel_initializer*/
-          true,    /*apply_rotary_embedding*/
-          false,   /*scaling query*/
-          1.0f,    /*scaling factor*/
-          true,    /*qk_prod_scaling*/
-          false,   /*position_bias*/
-          streaming_cache, /*streaming_cache*/
+          0.0f,            /*dropout*/
+          false,           /*qkv_bias*/
+          false,           /*final_bias*/
+          false,           /*add_zero_attn*/
+          DT_NONE,         /*data_type*/
+          nullptr,         /*kernel_initializer*/
+          true,            /*apply_rotary_embedding*/
+          false,           /*scaling query*/
+          1.0f,            /*scaling factor*/
+          true,            /*qk_prod_scaling*/
+          false,           /*position_bias*/
+          streaming_cache, /*streaming_cache*/
           std::string("layers_" + std::to_string(i) + "_attention")
               .c_str() /*name*/
       );
diff --git a/inference/models/starcoder.cc b/inference/models/starcoder.cc
index 55faec3a77..f531fe9884 100644
--- a/inference/models/starcoder.cc
+++ b/inference/models/starcoder.cc
@@ -124,7 +124,7 @@ void STARCODER::create_starcoder_model(
           1.0f,  /*scaling factor*/
           true,  /*qk_prod_scaling*/
           false, /*position_bias*/
-          false,  /*streaming_cache*/
+          false, /*streaming_cache*/
           std::string("layers_" + std::to_string(i) + "_attention")
               .c_str() /*name*/
       );
diff --git a/src/ops/kernels/inc_multihead_self_attention_kernels.cu b/src/ops/kernels/inc_multihead_self_attention_kernels.cu
index d260dc50a2..e65f2c0609 100644
--- a/src/ops/kernels/inc_multihead_self_attention_kernels.cu
+++ b/src/ops/kernels/inc_multihead_self_attention_kernels.cu
@@ -689,8 +689,9 @@ __global__ void
 
   pre_pos_enc_buf[to_k_idx + offset] =
       static_cast<DT>(qkv_proj_array[from_idx + q_hidden_size + kv_offset]);
-  pre_pos_enc_buf[to_v_idx + offset] = static_cast<DT>(
-      qkv_proj_array[from_idx + q_hidden_size + temp_kv_hidden_size + kv_offset]);
+  pre_pos_enc_buf[to_v_idx + offset] =
+      static_cast<DT>(qkv_proj_array[from_idx + q_hidden_size +
+                                     temp_kv_hidden_size + kv_offset]);
 }
 
 template <typename DT>
diff --git a/src/ops/spec_inc_multihead_self_attention.cc b/src/ops/spec_inc_multihead_self_attention.cc
index 5817bd1c40..cfcf783e93 100644
--- a/src/ops/spec_inc_multihead_self_attention.cc
+++ b/src/ops/spec_inc_multihead_self_attention.cc
@@ -296,7 +296,7 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention(
       o_dim(_embed_dim), qoSeqLength(_input->dims[1].size),
       kvSeqLength(_input->dims[1].size), scaling_query(_scaling_query),
       scaling_factor(_scaling_factor), qk_prod_scaling(_qk_prod_scaling),
-      position_bias(_position_bias) , streaming_cache(_streaming_cache) {
+      position_bias(_position_bias), streaming_cache(_streaming_cache) {
   // overwrite layer_guid
   layer_guid = _layer_guid;
 
diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu
index 5898f558af..8c384c1b05 100644
--- a/src/ops/tree_inc_multihead_self_attention.cu
+++ b/src/ops/tree_inc_multihead_self_attention.cu
@@ -400,10 +400,8 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m,
                  bias_ptr,
                  stream);
 
-  apply_pos_encoding_to_tokens_in_batch(m,
-                                        bc,
-                                        static_cast<DT *>(m->devQKVProjArray),
-                                        stream);
+  apply_pos_encoding_to_tokens_in_batch(
+      m, bc, static_cast<DT *>(m->devQKVProjArray), stream);
 
   // cudaEventRecord(t_end, stream);
   // checkCUDA(cudaEventSynchronize(t_end));
diff --git a/src/runtime/batch_config.cc b/src/runtime/batch_config.cc
index 89c642d485..308f468f53 100644
--- a/src/runtime/batch_config.cc
+++ b/src/runtime/batch_config.cc
@@ -166,14 +166,14 @@ std::ostream &operator<<(std::ostream &os, BatchConfig const &bc) {
   for (int i = 0; i < bc.max_requests_per_batch(); i++) {
     if (bc.request_available[i]) {
       os << "  Request " << i << ":\n";
-      os << "    Sink cache size: "
-         << bc.streamingCacheInfo[i].sink_cache_size << std::endl;
+      os << "    Sink cache size: " << bc.streamingCacheInfo[i].sink_cache_size
+         << std::endl;
       os << "    Window cache size: "
-          << bc.streamingCacheInfo[i].window_cache_size << std::endl;
+         << bc.streamingCacheInfo[i].window_cache_size << std::endl;
       os << "    Window back: " << bc.streamingCacheInfo[i].window_back
-          << std::endl;
+         << std::endl;
       os << "    Commit len: " << bc.streamingCacheInfo[i].commit_len
-          << std::endl;
+         << std::endl;
     }
   }
 
diff --git a/src/runtime/graph.cc b/src/runtime/graph.cc
index ab6421d58f..ca8e51d40f 100644
--- a/src/runtime/graph.cc
+++ b/src/runtime/graph.cc
@@ -2809,7 +2809,8 @@ void FFModel::deserialize_graph_optimal_view(
           tensor_parallelism_degree;
       float dropout, scaling_factor;
       bool qkv_bias, final_bias, add_zero_attn, apply_rotary_embedding,
-          scaling_query, qk_prod_scaling, offload, streaming_cache, position_bias;
+          scaling_query, qk_prod_scaling, offload, streaming_cache,
+          position_bias;
       DataType quantization_type;
       size_t id, transformer_layer_id, deserialized_model_id;
       dez.deserialize(id);
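
Every hunk above is a pure re-wrap: lines are broken at an 80-column limit, continuation lines use a 4-space indent, and trailing /*...*/ argument comments are realigned into a column. That pattern matches clang-format output under the LLVM base style. As a sketch only (the repository's actual .clang-format settings are an assumption here, not confirmed by the patch), a configuration like the following would produce this layout:

    # Hypothetical .clang-format; the repository's real settings may differ.
    BasedOnStyle: LLVM           # 80-column limit, 2-space indents by default
    ColumnLimit: 80              # forces the wraps seen in the + lines above
    ContinuationIndentWidth: 4   # matches the 4-space continuation indents
    AlignTrailingComments: true  # realigns the /*streaming_cache*/ comments

Running clang-format -i on the touched files should then reproduce formatting of this shape.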