diff --git a/include/flexflow/ops/inc_multihead_self_attention.h b/include/flexflow/ops/inc_multihead_self_attention.h index ee486ff9fe..5b2acba1bc 100644 --- a/include/flexflow/ops/inc_multihead_self_attention.h +++ b/include/flexflow/ops/inc_multihead_self_attention.h @@ -126,13 +126,14 @@ class IncMultiHeadSelfAttention : public Op { int shard_id, GenericTensorAccessorR const &input, GenericTensorAccessorW const &output); - static void peft_bwd_kernel_wrapper(IncMultiHeadSelfAttentionMeta *m, - BatchConfig const *bc, - int shard_id, - GenericTensorAccessorW const &input_grad, - // GenericTensorAccessorR const &weight, - GenericTensorAccessorR const &output_grad); - // GenericTensorAccessorR const &bias); + static void + peft_bwd_kernel_wrapper(IncMultiHeadSelfAttentionMeta *m, + BatchConfig const *bc, + int shard_id, + GenericTensorAccessorW const &input_grad, + // GenericTensorAccessorR const &weight, + GenericTensorAccessorR const &output_grad); + // GenericTensorAccessorR const &bias); Params get_params() const; public: diff --git a/inference/models/llama.cc b/inference/models/llama.cc index 8e8f225955..4b5a3f55ee 100644 --- a/inference/models/llama.cc +++ b/inference/models/llama.cc @@ -93,19 +93,20 @@ void LLAMA::create_llama_model(FFModel &ff, } att_norm->print("att_norm"); Tensor qkv_proj = ff.dense( - att_norm, - llama_config.hidden_size * 3, // q, k, v. need to change if want to remove replication. (q_heads + 2 * kv_heads) * proj_size - AC_MODE_NONE, - false, // seems like llama does not use bias - DT_NONE, // what is this - nullptr, // ? - nullptr, // ? - nullptr, // ? - REG_MODE_NONE, // no regularization - 0.0f, // no dropout - std::string("layers." + std::to_string(i) + ".self_attn.qkv_proj") - .c_str() - ); + att_norm, + llama_config.hidden_size * + 3, // q, k, v. need to change if want to remove replication. + // (q_heads + 2 * kv_heads) * proj_size + AC_MODE_NONE, + false, // seems like llama does not use bias + DT_NONE, // what is this + nullptr, // ? + nullptr, // ? + nullptr, // ? + REG_MODE_NONE, // no regularization + 0.0f, // no dropout + std::string("layers." + std::to_string(i) + ".self_attn.qkv_proj") + .c_str()); qkv_proj->print("qkv_proj"); Tensor mha; @@ -189,18 +190,19 @@ void LLAMA::create_llama_model(FFModel &ff, Tensor mha_input = mha; mha_input->print("mha_input"); - mha = ff.dense(mha_input, - llama_config.hidden_size, - AC_MODE_NONE, - false, - DT_NONE, - nullptr, - nullptr, - nullptr, - REG_MODE_NONE, - 0.0f, - std::string("layers." + std::to_string(i) + ".self_attn.o_proj") - .c_str()); + mha = ff.dense( + mha_input, + llama_config.hidden_size, + AC_MODE_NONE, + false, + DT_NONE, + nullptr, + nullptr, + nullptr, + REG_MODE_NONE, + 0.0f, + std::string("layers." 
+ std::to_string(i) + ".self_attn.o_proj") + .c_str()); mha->print("mha"); // step 2: SILU activaion diff --git a/src/ops/fused.cu b/src/ops/fused.cu index 3463c3b235..76bfa89def 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -457,8 +457,7 @@ __host__ void bc, task->index_point.point_data[0], my_input_accessor[0], - my_output_accessor[0] - ); + my_output_accessor[0]); break; } case OP_TREE_INC_MULTIHEAD_SELF_ATTENTION: { @@ -1042,7 +1041,7 @@ __host__ void FusedOp::peft_bwd_task(Task const *task, my_input_grad_accessor[0], // my_weight_accessor[0], my_output_grad_accessor[0]); - // biases); + // biases); break; } case OP_TREE_INC_MULTIHEAD_SELF_ATTENTION: diff --git a/src/ops/inc_multihead_self_attention.cc b/src/ops/inc_multihead_self_attention.cc index 92cbd65360..f00bddb661 100644 --- a/src/ops/inc_multihead_self_attention.cc +++ b/src/ops/inc_multihead_self_attention.cc @@ -394,8 +394,8 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( dims[i] = _input->dims[i]; } dims[0].size = _embed_dim; - // Currently require no parallelism along this dim, is this consistent with the - // removal of the previous assert? + // Currently require no parallelism along this dim, is this consistent with + // the removal of the previous assert? assert(dims[0].degree == 1); if (allocate_weights) { // Create weight tensor @@ -600,10 +600,13 @@ OpMeta *IncMultiHeadSelfAttention::init_task( attn->num_kv_heads / attn->tensor_parallelism_degree + (attn->num_kv_heads % attn->tensor_parallelism_degree != 0); - if(attn->oProjSize != output.domain.hi()[0] - output.domain.lo()[0] + 1) { - printf("attn o_proj size %d does not match output domain %d\n", attn->oProjSize, output.domain.hi()[0] - output.domain.lo()[0] + 1); + if (attn->oProjSize != output.domain.hi()[0] - output.domain.lo()[0] + 1) { + printf("attn o_proj size %d does not match output domain %d\n", + attn->oProjSize, + output.domain.hi()[0] - output.domain.lo()[0] + 1); } - // assert(attn->oProjSize == output.domain.hi()[0] - output.domain.lo()[0] + 1); + // assert(attn->oProjSize == output.domain.hi()[0] - output.domain.lo()[0] + + // 1); Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc); MemoryAllocator gpu_mem_allocator(gpu_mem); @@ -709,7 +712,7 @@ void IncMultiHeadSelfAttention::inference_task( GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); - GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( + GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); Domain input_domain = runtime->get_index_space_domain( @@ -724,7 +727,7 @@ void IncMultiHeadSelfAttention::inference_task( assert(task->index_point.get_dim() == 1); IncMultiHeadSelfAttention::inference_kernel_wrapper( - m, bc, task->index_point.point_data[0], input, output); + m, bc, task->index_point.point_data[0], input, output); if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); @@ -822,9 +825,11 @@ void IncMultiHeadSelfAttention::peft_bwd_task( GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW( m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); // GenericTensorAccessorR weight = helperGetGenericTensorAccessorRO( - // m->weight_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + // m->weight_type[0], regions[1], task->regions[1], FID_DATA, ctx, + // runtime); // GenericTensorAccessorW output_grad = 
helperGetGenericTensorAccessorRW( - // m->output_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); + // m->output_type[0], regions[2], task->regions[2], FID_DATA, ctx, + // runtime); GenericTensorAccessorW output_grad = helperGetGenericTensorAccessorRW( m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); GenericTensorAccessorR biases; @@ -862,7 +867,7 @@ void IncMultiHeadSelfAttention::peft_bwd_task( input_grad, // weight, output_grad); - // biases); + // biases); if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); diff --git a/src/ops/inc_multihead_self_attention.cpp b/src/ops/inc_multihead_self_attention.cpp index 0ec9bf4ba5..c9b91e5f80 100644 --- a/src/ops/inc_multihead_self_attention.cpp +++ b/src/ops/inc_multihead_self_attention.cpp @@ -938,7 +938,7 @@ void inference_kernel(IncMultiHeadSelfAttentionMeta *m, compute_qkv_kernel(m, bc, shard_id, - // input_ptr, + // input_ptr, weight_ptr, static_cast
<DT *>(m->devQKVProjArray), bias_ptr, diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index f89321554c..f6993e987a 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -538,7 +538,6 @@ __global__ void fill_entries_above_diagonal(DT *matrix, } } - template <typename DT> void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, BatchConfig const *bc, @@ -564,7 +563,6 @@ void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, } #endif - int num_tokens = bc->num_active_tokens(); int parallelism = m->kProjSize * num_tokens * m->num_q_heads; size_t q_array_size = m->qProjSize * num_tokens * m->num_q_heads; @@ -739,7 +737,7 @@ void compute_attention_kernel_generation(IncMultiHeadSelfAttentionMeta const *m, } } -// this kernel is no longer used by the attention operator because +// this kernel is no longer used by the attention operator because // there's no more weights // TODO: check if this is needed by the projection layers? template <typename DT> @@ -814,7 +812,8 @@ void inference_kernel(IncMultiHeadSelfAttentionMeta *m, // phase 0: copy calculated qkv into devQKVProjArray // [qProjSize, num_heads, 3, num_new_tokens] - size_t qkv_proj_size = m->qProjSize * m->num_q_heads * QKV_WEIGHT_NUM * bc->num_active_tokens(); + size_t qkv_proj_size = + m->qProjSize * m->num_q_heads * QKV_WEIGHT_NUM * bc->num_active_tokens(); cudaMemcpyAsync(m->devQKVProjArray, qkv_ptr, @@ -826,11 +825,11 @@ compute_qkv_kernel(m, bc, shard_id, - // input_ptr, - // weight_ptr, - // nullptr, // does not use weight + // input_ptr, + // weight_ptr, + // nullptr, // does not use weight static_cast
<DT *>(m->devQKVProjArray), - // bias_ptr, + // bias_ptr, stream); update_kv_cache_kernel<DT>
(m, bc, stream); @@ -871,50 +870,79 @@ std::string get_peft_dbg_folder(IncMultiHeadSelfAttentionMeta const *m, return dst_filepath.string(); } -__global__ void transposeAdd_half_kernel(half *out, const half *in, int width, int height, half alpha, half beta) { - int t_id = blockIdx.x * blockDim.x + threadIdx.x; - int num_threads = blockDim.x * gridDim.x; - for(int i = t_id; i < width * height; i += num_threads) { - int row = i / width; - int col = i % width; - out[col * height + row] = alpha * in[row * width + col] + beta * out[col * height + row]; - } +__global__ void transposeAdd_half_kernel( + half *out, half const *in, int width, int height, half alpha, half beta) { + int t_id = blockIdx.x * blockDim.x + threadIdx.x; + int num_threads = blockDim.x * gridDim.x; + for (int i = t_id; i < width * height; i += num_threads) { + int row = i / width; + int col = i % width; + out[col * height + row] = + alpha * in[row * width + col] + beta * out[col * height + row]; + } } -__global__ void transposeAdd_float_kernel(float *out, const float *in, int width, int height, float alpha, float beta) { - int t_id = blockIdx.x * blockDim.x + threadIdx.x; - int num_threads = blockDim.x * gridDim.x; - for(int i = t_id; i < width * height; i += num_threads) { - int row = i / width; - int col = i % width; - out[col * height + row] = alpha * in[row * width + col] + beta * out[col * height + row]; - } +__global__ void transposeAdd_float_kernel(float *out, + float const *in, + int width, + int height, + float alpha, + float beta) { + int t_id = blockIdx.x * blockDim.x + threadIdx.x; + int num_threads = blockDim.x * gridDim.x; + for (int i = t_id; i < width * height; i += num_threads) { + int row = i / width; + int col = i % width; + out[col * height + row] = + alpha * in[row * width + col] + beta * out[col * height + row]; + } } template <typename DT> -void transposeAdd(DT *out, const DT *in, int width, int height, float alpha, float beta, cudaStream_t stream) { - assert(false && "Unsupported data type"); +void transposeAdd(DT *out, + const DT *in, + int width, + int height, + float alpha, + float beta, + cudaStream_t stream) { + assert(false && "Unsupported data type"); } -template<> -void transposeAdd(float *out, const float *in, int width, int height, float alpha, float beta, cudaStream_t stream) { - transposeAdd_float_kernel<<<4, 1024, 0, stream>>>(out, in, width, height, alpha, beta); +template <> +void transposeAdd(float *out, + float const *in, + int width, + int height, + float alpha, + float beta, + cudaStream_t stream) { + transposeAdd_float_kernel<<<4, 1024, 0, stream>>>( + out, in, width, height, alpha, beta); } -template<> -void transposeAdd(half *out, const half *in, int width, int height, float alpha, float beta, cudaStream_t stream) { - transposeAdd_half_kernel<<<4, 1024, 0, stream>>>(out, in, width, height, __float2half(alpha), __float2half(beta)); +template <> +void transposeAdd(half *out, + half const *in, + int width, + int height, + float alpha, + float beta, + cudaStream_t stream) { + transposeAdd_half_kernel<<<4, 1024, 0, stream>>>( + out, in, width, height, __float2half(alpha), __float2half(beta)); } template <typename DT> -void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, - BatchConfig const *bc, - int shard_id, - DT *input_grad_ptr, - DT const *weight_ptr, // this is unused, kept for consistency - DT const *output_grad_ptr, - DT const *bias_ptr, - cudaStream_t stream) { +void peft_bwd_kernel( + IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + int shard_id, + DT *input_grad_ptr, + DT
const *weight_ptr, // this is unused, kept for consistency + DT const *output_grad_ptr, + DT const *bias_ptr, + cudaStream_t stream) { assert(!m->offload); checkCUDA(cublasSetStream(m->handle.blas, stream)); checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); @@ -1327,12 +1355,14 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, int n_ = num_tokens; int k_ = m->num_q_heads * (m->qProjSize + m->kProjSize + m->vProjSize); - // TODO: checkout if the input grad ptr has some relation with m->devQKVProjArray - // so we may potentially skip this transpose and copy - // TODO: check if this transposeAdd can correctly implement gradient accumulation + // TODO: checkout if the input grad ptr has some relation with + // m->devQKVProjArray so we may potentially skip this transpose and copy + // TODO: check if this transposeAdd can correctly implement gradient + // accumulation transposeAdd(C, B, n_, k_, alpha, beta, stream); - - // printf("backward of raw attn grad: %d, %d, with redudant dimension %d\n", k_, n_, m_); + + // printf("backward of raw attn grad: %d, %d, with redudant dimension + // %d\n", k_, n_, m_); if (m->inference_debugging) { std::string filename = get_peft_dbg_folder(m, shard_id) + ".self_attn.input_gradient_0"; @@ -1685,7 +1715,7 @@ void IncMultiHeadSelfAttention::inference_kernel_wrapper( // GenericTensorAccessorR const &weight, GenericTensorAccessorW const &output // GenericTensorAccessorR const &bias - ) { +) { // printf("inf_k_warpper start\n"); cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); @@ -1710,7 +1740,7 @@ void IncMultiHeadSelfAttention::inference_kernel_wrapper( bc, shard_id, input.get_half_ptr(), - static_cast<half const *>(nullptr), //weight_ptr is no longer used + static_cast<half const *>(nullptr), // weight_ptr is no longer used output.get_half_ptr(), static_cast<half const *>(nullptr), // bias_ptr is no longer used stream); @@ -1720,7 +1750,7 @@ void IncMultiHeadSelfAttention::inference_kernel_wrapper( bc, shard_id, input.get_float_ptr(), - static_cast<float const *>(nullptr), //weight_ptr is no longer used + static_cast<float const *>(nullptr), // weight_ptr is no longer used output.get_float_ptr(), static_cast<float const *>(nullptr), // bias_ptr is no longer used stream); @@ -1747,7 +1777,7 @@ void IncMultiHeadSelfAttention::peft_bwd_kernel_wrapper( GenericTensorAccessorW const &input_grad, // GenericTensorAccessorR const &weight, GenericTensorAccessorR const &output_grad) { - // GenericTensorAccessorR const &bias) { + // GenericTensorAccessorR const &bias) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); bool use_bias = *m->qkv_bias || *m->final_bias; @@ -1769,30 +1799,33 @@ void IncMultiHeadSelfAttention::peft_bwd_kernel_wrapper( assert(!m->offload); // half const *bias_ptr = // use_bias ? bias.get_half_ptr() : static_cast<half const *>(nullptr); - Kernels::IncMultiHeadAttention::peft_bwd_kernel(m, - bc, - shard_id, - input_grad.get_half_ptr(), - // weight.get_half_ptr(), - static_cast<half const *>(nullptr), - output_grad.get_half_ptr(), - // bias_ptr, - static_cast<half const *>(nullptr), - stream); + Kernels::IncMultiHeadAttention::peft_bwd_kernel( + m, + bc, + shard_id, + input_grad.get_half_ptr(), + // weight.get_half_ptr(), + static_cast<half const *>(nullptr), + output_grad.get_half_ptr(), + // bias_ptr, + static_cast<half const *>(nullptr), + stream); } else if (input_grad.data_type == DT_FLOAT) { assert(!m->offload); // float const *bias_ptr = - // use_bias ?
bias.get_float_ptr() : static_cast<float const *>(nullptr); - Kernels::IncMultiHeadAttention::peft_bwd_kernel(m, - bc, - shard_id, - input_grad.get_float_ptr(), - // weight.get_float_ptr(), - static_cast<float const *>(nullptr), - output_grad.get_float_ptr(), - // bias_ptr, - static_cast<float const *>(nullptr), - stream); + // use_bias ? bias.get_float_ptr() : static_cast<float const *>(nullptr); + Kernels::IncMultiHeadAttention::peft_bwd_kernel( + m, + bc, + shard_id, + input_grad.get_float_ptr(), + // weight.get_float_ptr(), + static_cast<float const *>(nullptr), + output_grad.get_float_ptr(), + // bias_ptr, + static_cast<float const *>(nullptr), + stream); } else { assert(false && "Unspported data type"); } diff --git a/src/ops/kernels/linear_kernels.cu b/src/ops/kernels/linear_kernels.cu index ee7dd9f4e7..29dc969687 100644 --- a/src/ops/kernels/linear_kernels.cu +++ b/src/ops/kernels/linear_kernels.cu @@ -631,7 +631,8 @@ void peft_bwd_kernel(LinearMeta const *m, in_dim, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - // printf("%s: input_grad has shape %d, %d\n", m->op_name, in_dim, num_peft_tokens); + // printf("%s: input_grad has shape %d, %d\n", m->op_name, in_dim, + // num_peft_tokens); } } diff --git a/src/ops/linear.cc b/src/ops/linear.cc index 45d85f6f39..88a3d2e3e4 100644 --- a/src/ops/linear.cc +++ b/src/ops/linear.cc @@ -779,7 +779,8 @@ void Linear::peft_bwd_task(Task const *task, if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); int shard_id = task->index_point.point_data[0]; - printf("%s: in_dim = %d, out_dim = %d, num_infr_tokens = %d, num_peft_tokens = %d, volume = %d\n", + printf("%s: in_dim = %d, out_dim = %d, num_infr_tokens = %d, " + "num_peft_tokens = %d, volume = %d\n", m->op_name, in_dim, out_dim, diff --git a/src/ops/spec_inc_multihead_self_attention.cc b/src/ops/spec_inc_multihead_self_attention.cc index 4cd54763ec..bd7f1624ae 100644 --- a/src/ops/spec_inc_multihead_self_attention.cc +++ b/src/ops/spec_inc_multihead_self_attention.cc @@ -705,14 +705,14 @@ void SpecIncMultiHeadSelfAttention::inference_task( SpecIncMultiHeadSelfAttentionMeta *m = *((SpecIncMultiHeadSelfAttentionMeta **)task->local_args); - assert(regions.size() ==2); + assert(regions.size() == 2); GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); GenericTensorAccessorR biases; - + Domain input_domain = runtime->get_index_space_domain( ctx, task->regions[0].region.get_index_space()); Domain output_domain = runtime->get_index_space_domain( diff --git a/src/ops/spec_inc_multihead_self_attention.cpp b/src/ops/spec_inc_multihead_self_attention.cpp index b48c4bf734..0bf2b3346e 100644 --- a/src/ops/spec_inc_multihead_self_attention.cpp +++ b/src/ops/spec_inc_multihead_self_attention.cpp @@ -501,17 +501,19 @@ void inference_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, stream)); // phase 0: copy calculated qkv into devQKVProjArray // [qProjSize, num_heads, 3, num_new_tokens] - size_t qkv_proj_size = m->qProjSize * m->num_q_heads * QKV_WEIGHT_NUM * bc->num_active_tokens(); + size_t qkv_proj_size = + m->qProjSize * m->num_q_heads * QKV_WEIGHT_NUM * bc->num_active_tokens(); cudaMemcpyAsync(m->devQKVProjArray, qkv_ptr, - qkv_proj_size * sizeof(DT), // is this right, do we need layers etc here + qkv_proj_size * + sizeof(DT), // is this right, do we need layers etc here cudaMemcpyDeviceToDevice, stream); // phase 1: Implement kernel to compute KQV
for input tokens - // TODO WARNING: this is commented out only because we are fixing the inc_attn first - // compute_qkv_kernel(m, + // TODO WARNING: this is commented out only because we are fixing the inc_attn + // first compute_qkv_kernel(m, // bc, // shard_id, // // input_ptr, diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu index 6144b9bd4c..30cbdc6b10 100644 --- a/src/ops/spec_inc_multihead_self_attention.cu +++ b/src/ops/spec_inc_multihead_self_attention.cu @@ -706,22 +706,25 @@ void inference_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, // phase 0: copy calculated qkv into devQKVProjArray // [qProjSize, num_heads, 3, num_new_tokens] - size_t qkv_proj_size = m->qProjSize * m->num_q_heads * QKV_WEIGHT_NUM * bc->num_active_tokens(); + size_t qkv_proj_size = + m->qProjSize * m->num_q_heads * QKV_WEIGHT_NUM * bc->num_active_tokens(); cudaMemcpyAsync(m->devQKVProjArray, qkv_ptr, - qkv_proj_size * sizeof(DT), // is this right, do we need layers etc here + qkv_proj_size * + sizeof(DT), // is this right, do we need layers etc here cudaMemcpyDeviceToDevice, stream); // phase 1: Implement kernel to compute KQV for input tokens - // TODO WARNING: this is commented out only because we are fixing the inc_attn first + // TODO WARNING: this is commented out only because we are fixing the inc_attn + // first compute_qkv_kernel(m, bc, shard_id, - // input_ptr, - // weight_ptr, + // input_ptr, + // weight_ptr, static_cast
<DT *>(m->devQKVProjArray), - // bias_ptr, + // bias_ptr, stream); // phase 2: Update key/val cache update_kv_cache_kernel<DT>
(m, bc, stream); diff --git a/src/ops/tree_inc_multihead_self_attention.cc b/src/ops/tree_inc_multihead_self_attention.cc index a3f6757df3..4564ca6cc2 100644 --- a/src/ops/tree_inc_multihead_self_attention.cc +++ b/src/ops/tree_inc_multihead_self_attention.cc @@ -159,7 +159,7 @@ Tensor FFModel::inc_multiquery_self_attention_verify( int one_head_size = qParas + kParas + vParas + oParas; int weight_size = qParas * num_q_heads + kParas * num_q_heads + vParas * num_q_heads + oParas * num_q_heads; - + li->data_type = data_type; li->add_int_property("embed_dim", embed_dim); li->add_int_property("num_q_heads", num_q_heads); @@ -392,7 +392,8 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( dims[i] = _input->dims[i]; } dims[0].size = _embed_dim; - // Currently require no parallelism along this dim, is this aligned with the previous removal of assert? + // Currently require no parallelism along this dim, is this aligned with the + // previous removal of assert? assert(dims[0].degree == 1); if (allocate_weights) { // Create weight tensor @@ -597,10 +598,13 @@ OpMeta *TreeIncMultiHeadSelfAttention::init_task( int num_kv_heads = attn->num_kv_heads / attn->tensor_parallelism_degree + (attn->num_kv_heads % attn->tensor_parallelism_degree != 0); - if(attn->oProjSize != output.domain.hi()[0] - output.domain.lo()[0] + 1) { - std::cout<<"attn->oProjSize: "<<attn->oProjSize<<" does not match output domain dim[0]: "<<output.domain.hi()[0] - output.domain.lo()[0] + 1<<std::endl; + if (attn->oProjSize != output.domain.hi()[0] - output.domain.lo()[0] + 1) { + std::cout << "attn->oProjSize: " << attn->oProjSize + << " does not match output domain dim[0]: " + << output.domain.hi()[0] - output.domain.lo()[0] + 1 << std::endl; } - // assert(attn->oProjSize == output.domain.hi()[0] - output.domain.lo()[0] + 1); + // assert(attn->oProjSize == output.domain.hi()[0] - output.domain.lo()[0] + + // 1); Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc); MemoryAllocator gpu_mem_allocator(gpu_mem); diff --git a/src/ops/tree_inc_multihead_self_attention.cpp b/src/ops/tree_inc_multihead_self_attention.cpp index 585bf3fa46..ff592ddccb 100644 --- a/src/ops/tree_inc_multihead_self_attention.cpp +++ b/src/ops/tree_inc_multihead_self_attention.cpp @@ -936,8 +936,8 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m, bias_ptr = static_cast
<DT *>(m->bias_ptr); } // phase 1: Implement kernel to compute KQV for input tokens - // TODO WARNING: this is commented out only because we are fixing the inc_attn first - // compute_qkv_kernel(m, + // TODO WARNING: this is commented out only because we are fixing the inc_attn + // first compute_qkv_kernel(m, // bc, // shard_id, // // input_ptr, diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu index 9619070737..c2ba0ecbde 100644 --- a/src/ops/tree_inc_multihead_self_attention.cu +++ b/src/ops/tree_inc_multihead_self_attention.cu @@ -916,23 +916,26 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m, } // phase 0: copy calculated qkv into devQKVProjArray // [qProjSize, num_heads, 3, num_new_tokens] - size_t qkv_proj_size = m->qProjSize * m->num_q_heads * QKV_WEIGHT_NUM * bc->num_active_tokens(); + size_t qkv_proj_size = + m->qProjSize * m->num_q_heads * QKV_WEIGHT_NUM * bc->num_active_tokens(); cudaMemcpyAsync(m->devQKVProjArray, qkv_ptr, - qkv_proj_size * sizeof(DT), // is this right, do we need layers etc here + qkv_proj_size * + sizeof(DT), // is this right, do we need layers etc here cudaMemcpyDeviceToDevice, stream); // phase 1: Implement kernel to compute KQV for input tokens - // TODO WARNING: this is commented out only because we are fixing the inc_attn first + // TODO WARNING: this is commented out only because we are fixing the inc_attn + // first compute_qkv_kernel(m, bc, shard_id, - // input_ptr, - // weight_ptr, + // input_ptr, + // weight_ptr, static_cast<DT *>
(m->devQKVProjArray), - // bias_ptr, + // bias_ptr, stream); // phase 2: No need to update key/val cache @@ -985,25 +988,23 @@ void TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( assert(input.data_type == output.data_type); if (input.data_type == DT_HALF) { - Kernels::TreeIncMultiHeadAttention::inference_kernel( - m, - bc, - shard_id, - input.get_half_ptr(), - (half*)nullptr, - output.get_half_ptr(), - (half*)nullptr, - stream); + Kernels::TreeIncMultiHeadAttention::inference_kernel(m, + bc, + shard_id, + input.get_half_ptr(), + (half *)nullptr, + output.get_half_ptr(), + (half *)nullptr, + stream); } else if (input.data_type == DT_FLOAT) { - Kernels::TreeIncMultiHeadAttention::inference_kernel( - m, - bc, - shard_id, - input.get_float_ptr(), - (float*)nullptr, - output.get_float_ptr(), - (float*)nullptr, - stream); + Kernels::TreeIncMultiHeadAttention::inference_kernel(m, + bc, + shard_id, + input.get_float_ptr(), + (float *)nullptr, + output.get_float_ptr(), + (float *)nullptr, + stream); } else { assert(false && "Unspported data type"); } diff --git a/src/runtime/file_loader.cc b/src/runtime/file_loader.cc index 0cb12e3b0e..9a6c561f18 100644 --- a/src/runtime/file_loader.cc +++ b/src/runtime/file_loader.cc @@ -129,12 +129,12 @@ void load_attention_weights_multi_query(DT *ptr, template void load_attention_o_proj_bias_to_dense_v2(DT *ptr, - int num_heads, - int num_kv_heads, - size_t hidden_dim, - size_t qkv_inner_dim, - std::string layer_name, - std::string weights_folder) { + int num_heads, + int num_kv_heads, + size_t hidden_dim, + size_t qkv_inner_dim, + std::string layer_name, + std::string weights_folder) { std::string filename = layer_name + ".o_proj.bias"; int file_index = 0; @@ -262,15 +262,15 @@ void load_attention_bias_v2(DT *ptr, template void load_attention_weights_to_dense_v2(DT *ptr, - int num_heads, - int num_kv_heads, - size_t hidden_dim, - size_t qkv_inner_dim, - std::string layer_name, - std::string weights_folder, - size_t volume, - int tensor_parallelism_degree, - bool load_o_proj) { + int num_heads, + int num_kv_heads, + size_t hidden_dim, + size_t qkv_inner_dim, + std::string layer_name, + std::string weights_folder, + size_t volume, + int tensor_parallelism_degree, + bool load_o_proj) { // layers_0_attention_wq_weight // layers_0_self_attn_q_proj_weight std::string q_file = layer_name + ".q_proj.weight"; @@ -299,9 +299,10 @@ void load_attention_weights_to_dense_v2(DT *ptr, // stride for q, k, v, o size_t stride_size = (q_size + v_replicate_size + k_replicate_size) / tensor_parallelism_degree; - if(!load_o_proj) { + if (!load_o_proj) { for (auto filename : weight_filenames) { - std::cout << "Loading weight file " << filename << " to dense"<< std::endl; + std::cout << "Loading weight file " << filename << " to dense" + << std::endl; std::string weight_filepath = join_path({weights_folder, filename}); int data_index = 0; @@ -342,17 +343,18 @@ void load_attention_weights_to_dense_v2(DT *ptr, int head_idx = i % (num_heads / tensor_parallelism_degree); int tp_idx = (i / (num_heads / tensor_parallelism_degree)); for (int j = 0; j < single_proj_size; j++) { - ptr[base_index + tp_idx * stride_size + single_proj_size * head_idx + - j] = host_array.at(kv_idx * single_proj_size + j); + ptr[base_index + tp_idx * stride_size + + single_proj_size * head_idx + j] = + host_array.at(kv_idx * single_proj_size + j); } } } - std::cout<<"host array going out of scope, releasing"<config.benchmarking) { std::cout << "Initializing weight " << weight_filename << " with random data 
(benchmarking mode)" << std::endl; @@ -957,9 +959,9 @@ void FileDataLoader::load_single_weight_tensor(FFModel *ff, // weight_filename, // weights_folder); // } - } else if(is_attn_proj) { - if(is_o_proj) { - if(weight_idx == 0) { + } else if (is_attn_proj) { + if (is_o_proj) { + if (weight_idx == 0) { load_attention_weights_to_dense_v2(data, num_heads, num_kv_heads, @@ -978,10 +980,9 @@ void FileDataLoader::load_single_weight_tensor(FFModel *ff, qkv_inner_dim, weight_filename, weights_folder); - } } else { - if(weight_idx == 0) { + if (weight_idx == 0) { load_attention_weights_to_dense_v2(data, num_heads, num_kv_heads, diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 40d4ca9766..e3bc433302 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -1154,19 +1154,25 @@ bool Op::check_output_input_weight_same_parallel_is() const { IndexSpace parallel_is = outputs[0]->parallel_is; for (int i = 0; i < numOutputs; i++) { if (outputs[i]->parallel_is != parallel_is) { - std::cout<<"outputs["<parallel_is<<" than output[0] "<parallel_is << " than output[0] " << parallel_is + << std::endl; return false; } } for (int i = 0; i < numInputs; i++) { if (inputs[i]->parallel_is != parallel_is) { - std::cout<<"inputs["<parallel_is<<" than output[0] "<parallel_is << " than output[0] " << parallel_is + << std::endl; return false; } } for (int i = 0; i < numWeights; i++) { if (weights[i]->parallel_is != parallel_is) { - std::cout<<"weights["<parallel_is<<" than output[0] "<parallel_is << " than output[0] " << parallel_is + << std::endl; return false; } } @@ -3416,27 +3422,28 @@ bool FFModel::need_to_add_allreduce(int layer_idx) const { if (config.computationMode == COMP_MODE_INFERENCE && config.tensor_parallelism_degree > 1 && ( - // l->op_type == OP_INC_MULTIHEAD_SELF_ATTENTION || - // l->op_type == OP_TREE_INC_MULTIHEAD_SELF_ATTENTION || - (std::string(l->name).find(".self_attn.o_proj") != std::string::npos) || - // mlp layer - is_mlp_block(layer_idx) || - // llama mlp layer - (l->op_type == OP_LINEAR && layer_idx >= 2 && - layers[layer_idx - 1]->op_type == OP_GELU && - layers[layer_idx - 2]->op_type == OP_LINEAR) || - // LLAMA without element-wise operator fusion - (l->op_type == OP_LINEAR && layer_idx >= 5 && - layers[layer_idx - 1]->op_type == OP_EW_MUL && - layers[layer_idx - 2]->op_type == OP_EW_MUL && - layers[layer_idx - 3]->op_type == OP_SIGMOID && - layers[layer_idx - 4]->op_type == OP_LINEAR && - layers[layer_idx - 5]->op_type == OP_LINEAR) || - // LLAMA with element-wise operator fusion - (l->op_type == OP_LINEAR && layer_idx >= 3 && - layers[layer_idx - 1]->op_type == OP_SIGMOID_SILU_MULTI && - layers[layer_idx - 2]->op_type == OP_LINEAR && - layers[layer_idx - 3]->op_type == OP_LINEAR))) { + // l->op_type == OP_INC_MULTIHEAD_SELF_ATTENTION || + // l->op_type == OP_TREE_INC_MULTIHEAD_SELF_ATTENTION || + (std::string(l->name).find(".self_attn.o_proj") != + std::string::npos) || + // mlp layer + is_mlp_block(layer_idx) || + // llama mlp layer + (l->op_type == OP_LINEAR && layer_idx >= 2 && + layers[layer_idx - 1]->op_type == OP_GELU && + layers[layer_idx - 2]->op_type == OP_LINEAR) || + // LLAMA without element-wise operator fusion + (l->op_type == OP_LINEAR && layer_idx >= 5 && + layers[layer_idx - 1]->op_type == OP_EW_MUL && + layers[layer_idx - 2]->op_type == OP_EW_MUL && + layers[layer_idx - 3]->op_type == OP_SIGMOID && + layers[layer_idx - 4]->op_type == OP_LINEAR && + layers[layer_idx - 5]->op_type == OP_LINEAR) || + // LLAMA with element-wise operator fusion + (l->op_type 
== OP_LINEAR && layer_idx >= 3 && + layers[layer_idx - 1]->op_type == OP_SIGMOID_SILU_MULTI && + layers[layer_idx - 2]->op_type == OP_LINEAR && + layers[layer_idx - 3]->op_type == OP_LINEAR))) { return true; } return false; diff --git a/src/runtime/operator.cc b/src/runtime/operator.cc index 52f192902b..d5bfcfc48e 100644 --- a/src/runtime/operator.cc +++ b/src/runtime/operator.cc @@ -2,8 +2,8 @@ #include "flexflow/ffconst_utils.h" #include "flexflow/simulator.h" #include -#include #include +#include namespace FlexFlow { @@ -33,11 +33,12 @@ fs::path get_dst_folder(std::string const &subdir, char cwd[PATH_MAX]; getcwd(cwd, sizeof(cwd)); - // char const *ff_cache_path = std::string(std::getenv("FF_DEBUG_PATH")) == "." ? + // char const *ff_cache_path = std::string(std::getenv("FF_DEBUG_PATH")) == + // "." ? // cwd : std::getenv("FF_DEBUG_PATH"); char const *ff_cache_path = std::getenv("FF_CACHE_PATH"); - + std::string debug_dir_ = ff_cache_path ? std::string(ff_cache_path) + "/debug/flexflow" : std::string("~/.cache/flexflow/debug/flexflow"); @@ -46,7 +47,7 @@ fs::path get_dst_folder(std::string const &subdir, debug_dir_ = p.we_wordv[0]; wordfree(&p); fs::path debug_dir = debug_dir_; - if(!fs::is_directory(debug_dir)) { + if (!fs::is_directory(debug_dir)) { printf("invalid debug directory: %s\n", debug_dir.c_str()); } assert(fs::is_directory(debug_dir));