From 281a8bf6c2fb9634c61fd56ff319a0718cf25ba8 Mon Sep 17 00:00:00 2001
From: Yingcheng Wang
Date: Wed, 18 Sep 2024 21:13:34 +0000
Subject: [PATCH] Bug fixes, uploaded missing cpp implementation

---
 inference/models/falcon.cc          |  6 ++--
 inference/models/mpt.cc             | 44 ++++++++++++++++++++++++-----
 inference/models/opt.cc             | 44 ++++++++++++++++++++++++-----
 inference/models/starcoder.cc       | 35 +++++++++++++++++++++--
 python/flexflow/serve/models/mpt.py |  6 ++--
 src/runtime/file_loader.cc          |  2 +-
 6 files changed, 114 insertions(+), 23 deletions(-)

diff --git a/inference/models/falcon.cc b/inference/models/falcon.cc
index 3def3bb847..e6eb72701e 100644
--- a/inference/models/falcon.cc
+++ b/inference/models/falcon.cc
@@ -104,14 +104,14 @@ void FALCON::create_falcon_model(FFModel &ff,
         3, // q, k, v. need to change if want to remove replication.
            // (q_heads + 2 * kv_heads) * proj_size
         AC_MODE_NONE,
-        false,   // seems like llama does not use bias
+        false,   // seems like it does not use bias
         DT_NONE, // what is this
         nullptr, // ?
         nullptr, // ?
         nullptr, // ?
         REG_MODE_NONE, // no regularization
         0.0f,          // no dropout
-        std::string("layers." + std::to_string(i) + ".self_attn.qkv_proj")
+        std::string("layers." + std::to_string(i) + ".self_attention.qkv_proj")
             .c_str());
 
     qkv_proj->print("qkv_proj");
@@ -206,7 +206,7 @@ void FALCON::create_falcon_model(FFModel &ff,
         nullptr,
         REG_MODE_NONE,
         0.0f,
-        std::string("layers." + std::to_string(i) + ".self_attn.o_proj")
+        std::string("layers." + std::to_string(i) + ".self_attention.o_proj")
             .c_str());
 
     mha->print("mha");
diff --git a/inference/models/mpt.cc b/inference/models/mpt.cc
index e4a7e0056d..9986182495 100644
--- a/inference/models/mpt.cc
+++ b/inference/models/mpt.cc
@@ -93,11 +93,27 @@ void MPT::create_mpt_model(FFModel &ff,
       layernorm_output = res_ln_outputs[1];
     }
 
-    Tensor attn_outputs;
+    Tensor qkv_proj = ff.dense(
+        layernorm_output,
+        mpt_config.hidden_size *
+            3, // q, k, v. need to change if want to remove replication.
+               // (q_heads + 2 * kv_heads) * proj_size
+        AC_MODE_NONE,
+        false,   // seems like it does not use bias
+        DT_NONE, // what is this
+        nullptr, // ?
+        nullptr, // ?
+        nullptr, // ?
+        REG_MODE_NONE, // no regularization
+        0.0f,          // no dropout
+        std::string("layers." + std::to_string(i) + ".attn.qkv_proj")
+            .c_str());
+
+    Tensor o_proj;
     switch (mode) {
       case BEAM_SEARCH_MODE: {
-        attn_outputs = ff.spec_inc_multihead_self_attention(
-            layernorm_output,
+        o_proj = ff.spec_inc_multihead_self_attention(
+            qkv_proj,
             mpt_config.hidden_size,
             mpt_config.n_heads,
             mpt_config.hidden_size / mpt_config.n_heads,
@@ -120,8 +136,8 @@ void MPT::create_mpt_model(FFModel &ff,
         break;
       }
       case TREE_VERIFY_MODE: {
-        attn_outputs = ff.inc_multihead_self_attention_verify(
-            layernorm_output,
+        o_proj = ff.inc_multihead_self_attention_verify(
+            qkv_proj,
             mpt_config.hidden_size,
             mpt_config.n_heads,
             mpt_config.hidden_size / mpt_config.n_heads,
@@ -144,8 +160,8 @@ void MPT::create_mpt_model(FFModel &ff,
         break;
       }
       case INC_DECODING_MODE: {
-        attn_outputs = ff.inc_multihead_self_attention(
-            layernorm_output,
+        o_proj = ff.inc_multihead_self_attention(
+            qkv_proj,
             mpt_config.hidden_size,
             mpt_config.n_heads,
             mpt_config.hidden_size / mpt_config.n_heads,
@@ -172,6 +188,20 @@ void MPT::create_mpt_model(FFModel &ff,
       }
     }
 
+    Tensor attn_outputs = ff.dense(
+        o_proj,
+        mpt_config.hidden_size,
+        AC_MODE_NONE,
+        false,
+        DT_NONE,
+        nullptr,
+        nullptr,
+        nullptr,
+        REG_MODE_NONE,
+        0.0f,
+        std::string("layers." + std::to_string(i) + ".attn.o_proj")
+            .c_str());
+
     ff.residual_layer_norm(
         attn_outputs,
         hidden_states,
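
Note: the mpt.cc hunks above, and the opt.cc and starcoder.cc hunks below,
apply the same decomposition: a dense of width hidden_size * 3 producing the
fused q/k/v activations, the attention op consuming that fused tensor, and a
separate output-projection dense registered as "<prefix>.o_proj". The in-code
comment records that the fused width would be
(q_heads + 2 * kv_heads) * proj_size if the replicated layout were removed.
A standalone sketch of the two widths follows; the helper names and shape
numbers are hypothetical, for illustration only.

    #include <iostream>

    // Width of the fused QKV projection as the dense layers above size it:
    // q, k and v each get a full hidden_size slice (replicated layout).
    int qkv_width_replicated(int hidden_size) {
      return hidden_size * 3;
    }

    // Unreplicated width from the in-code comment:
    // (q_heads + 2 * kv_heads) * proj_size. Much narrower for multi-query
    // attention, where kv_heads == 1 (as in the starcoder hunk below).
    int qkv_width_grouped(int q_heads, int kv_heads, int proj_size) {
      return (q_heads + 2 * kv_heads) * proj_size;
    }

    int main() {
      // Hypothetical shapes: hidden_size 4096 with 32 heads.
      int hidden_size = 4096, n_heads = 32;
      int proj_size = hidden_size / n_heads; // 128, as the models compute it
      std::cout << qkv_width_replicated(hidden_size) << "\n";        // 12288
      std::cout << qkv_width_grouped(n_heads, n_heads, proj_size)    // 12288
                << "\n";
      std::cout << qkv_width_grouped(n_heads, 1, proj_size) << "\n"; //  4352
      return 0;
    }
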
diff --git a/inference/models/opt.cc b/inference/models/opt.cc
index b3f2ef4e17..4aea36d3d7 100644
--- a/inference/models/opt.cc
+++ b/inference/models/opt.cc
@@ -101,11 +101,27 @@ void OPT::create_opt_model(FFModel &ff,
     Tensor residual = res_ln_outputs[0];
     Tensor hidden_states = res_ln_outputs[1];
 
-    Tensor mha;
+    Tensor qkv_proj = ff.dense(
+        hidden_states,
+        opt_config.hidden_size *
+            3, // q, k, v. need to change if want to remove replication.
+               // (q_heads + 2 * kv_heads) * proj_size
+        AC_MODE_NONE,
+        false,   // seems like it does not use bias
+        DT_NONE, // what is this
+        nullptr, // ?
+        nullptr, // ?
+        nullptr, // ?
+        REG_MODE_NONE, // no regularization
+        0.0f,          // no dropout
+        std::string("layers." + std::to_string(i) + ".self_attn.qkv_proj")
+            .c_str());
+
+    Tensor o_proj;
     switch (mode) {
       case BEAM_SEARCH_MODE: {
-        mha = ff.spec_inc_multihead_self_attention(
-            hidden_states,
+        o_proj = ff.spec_inc_multihead_self_attention(
+            qkv_proj,
             opt_config.hidden_size,
             opt_config.num_attention_heads,
             opt_config.hidden_size / opt_config.num_attention_heads,
@@ -128,8 +144,8 @@ void OPT::create_opt_model(FFModel &ff,
         break;
       }
       case TREE_VERIFY_MODE: {
-        mha = ff.inc_multihead_self_attention_verify(
-            hidden_states,
+        o_proj = ff.inc_multihead_self_attention_verify(
+            qkv_proj,
             opt_config.hidden_size,
             opt_config.num_attention_heads,
             opt_config.hidden_size / opt_config.num_attention_heads,
@@ -152,8 +168,8 @@ void OPT::create_opt_model(FFModel &ff,
         break;
       }
      case INC_DECODING_MODE: {
-        mha = ff.inc_multihead_self_attention(
-            hidden_states,
+        o_proj = ff.inc_multihead_self_attention(
+            qkv_proj,
             opt_config.hidden_size,
             opt_config.num_attention_heads,
             opt_config.hidden_size / opt_config.num_attention_heads,
@@ -180,6 +196,20 @@ void OPT::create_opt_model(FFModel &ff,
       }
     }
 
+    Tensor mha = ff.dense(
+        o_proj,
+        opt_config.hidden_size,
+        AC_MODE_NONE,
+        false,
+        DT_NONE,
+        nullptr,
+        nullptr,
+        nullptr,
+        REG_MODE_NONE,
+        0.0f,
+        std::string("layers." + std::to_string(i) + ".self_attn.o_proj")
+            .c_str());
+
     ff.add_bias_residual_layer_norm(mha,
                                     residual,
                                     res_ln_outputs,
diff --git a/inference/models/starcoder.cc b/inference/models/starcoder.cc
index cd8bf3a9a7..887696ff31 100644
--- a/inference/models/starcoder.cc
+++ b/inference/models/starcoder.cc
@@ -102,11 +102,28 @@ void STARCODER::create_starcoder_model(
     Tensor hidden_states = res_ln_outputs[0];
     Tensor ln_1 = res_ln_outputs[1];
 
+    Tensor qkv_proj = ff.dense(
+        ln_1,
+        startcoder_config.hidden_size *
+            3, // q, k, v. need to change if want to remove replication.
+               // (q_heads + 2 * kv_heads) * proj_size
+        AC_MODE_NONE,
+        false,   // seems like it does not use bias
+        DT_NONE, // what is this
+        nullptr, // ?
+        nullptr, // ?
+        nullptr, // ?
+        REG_MODE_NONE, // no regularization
+        0.0f,          // no dropout
+        std::string("layers." + std::to_string(i) + ".self_attention.qkv_proj")
+            .c_str());
+
     Tensor mha;
+    Tensor o_proj;
     switch (mode) {
       case INC_DECODING_MODE: {
-        mha = ff.inc_multiquery_self_attention(
-            ln_1,
+        o_proj = ff.inc_multiquery_self_attention(
+            qkv_proj,
             startcoder_config.hidden_size,
             startcoder_config.num_attention_heads,
             1,
@@ -135,6 +152,20 @@ void STARCODER::create_starcoder_model(
       }
     }
 
+    mha = ff.dense(
+        o_proj,
+        startcoder_config.hidden_size,
+        AC_MODE_NONE,
+        false,
+        DT_NONE,
+        nullptr,
+        nullptr,
+        nullptr,
+        REG_MODE_NONE,
+        0.0f,
+        std::string("layers." + std::to_string(i) + ".self_attn.o_proj")
+            .c_str());
+
     ff.residual_layer_norm(
         hidden_states,
         mha,
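
Note: the qkv_proj/o_proj names registered by this patch differ per model:
falcon and starcoder register qkv_proj under "self_attention", mpt under
"attn", and opt under "self_attn"; starcoder's o_proj is registered under
"self_attn" as written in the hunk above. A standalone sketch that assembles
the names the same way the std::string expressions above do; the helper and
the layer index are hypothetical, for illustration only.

    #include <iostream>
    #include <string>

    // Builds "layers.<i>.<prefix>.<proj>", mirroring the std::string
    // concatenations used in the model files above.
    std::string weight_name(int i, std::string const &prefix,
                            std::string const &proj) {
      return "layers." + std::to_string(i) + "." + prefix + "." + proj;
    }

    int main() {
      int i = 0; // hypothetical layer index
      std::cout << weight_name(i, "self_attention", "qkv_proj") << "\n"; // falcon, starcoder
      std::cout << weight_name(i, "attn", "qkv_proj") << "\n";           // mpt
      std::cout << weight_name(i, "self_attn", "qkv_proj") << "\n";      // opt
      std::cout << weight_name(i, "self_attn", "o_proj") << "\n";        // opt, starcoder
      return 0;
    }
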
diff --git a/python/flexflow/serve/models/mpt.py b/python/flexflow/serve/models/mpt.py
index 52d3bf8b5d..1f012e405d 100644
--- a/python/flexflow/serve/models/mpt.py
+++ b/python/flexflow/serve/models/mpt.py
@@ -131,10 +131,10 @@ def build_model(self, max_tokens_per_batch):
 
         qkv_proj = ffmodel.dense(
             layernorm_output,
-            3 * self.falcon_config.hidden_size,
+            3 * self.mpt_config.hidden_size,
             ActiMode.AC_MODE_NONE,
             False,
-            name=f"layers.{i}.self_attn.qkv_proj",
+            name=f"layers.{i}.attn.qkv_proj",
         )
 
         if self.mode == InferenceMode.BEAM_SEARCH_MODE:
@@ -208,7 +208,7 @@ def build_model(self, max_tokens_per_batch):
             self.mpt_config.hidden_size,
             ActiMode.AC_MODE_NONE,
             False,
-            name=f"layers.{i}.self_attn.o_proj"
+            name=f"layers.{i}.attn.o_proj"
         )
 
         hidden_states, layernorm_output = ffmodel.residual_layer_norm(
diff --git a/src/runtime/file_loader.cc b/src/runtime/file_loader.cc
index 2188288a68..de66927c1b 100644
--- a/src/runtime/file_loader.cc
+++ b/src/runtime/file_loader.cc
@@ -936,7 +936,7 @@ void FileDataLoader::load_single_weight_tensor(FFModel *ff,
   // self_attn.qkv_proj or self_attn.o_proj
   // so looking for self_attn. in the name can determine if it is an attention
   // projection
-  if (weight_filename.find("self_attn.") != std::string::npos || weight_filename.find("self_attention.") != std::string::npos) {
+  if (weight_filename.find("attn.") != std::string::npos || weight_filename.find("self_attention.") != std::string::npos) {
     size_t pos = weight_filename.find(".o_proj");
     if (pos != std::string::npos) {
       weight_filename.replace(pos, std::string(".o_proj").length(), "");
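
Note on the file_loader.cc change: find("attn.") matches both mpt's "attn."
prefix and opt's "self_attn." (which contains it as a substring), but not
falcon/starcoder's "self_attention." (the letters "attn." never appear
consecutively in "attention."), so the second clause is still required. A
standalone sketch of the matching and the ".o_proj" strip; the helper name
and the weight filenames are hypothetical, for illustration only.

    #include <cassert>
    #include <iostream>
    #include <string>

    // Mirrors the condition in load_single_weight_tensor above:
    // "attn." covers the "attn." and "self_attn." prefixes, while
    // "self_attention." needs its own check.
    bool is_attention_projection(std::string const &name) {
      return name.find("attn.") != std::string::npos ||
             name.find("self_attention.") != std::string::npos;
    }

    int main() {
      assert(is_attention_projection("layers.0.attn.qkv_proj"));           // mpt
      assert(is_attention_projection("layers.0.self_attn.o_proj"));        // opt
      assert(is_attention_projection("layers.0.self_attention.qkv_proj")); // falcon
      assert(!is_attention_projection("layers.0.mlp.up_proj"));

      // The loader then strips ".o_proj" from the name, as in the hunk above.
      std::string weight_filename = "layers.0.self_attn.o_proj";
      size_t pos = weight_filename.find(".o_proj");
      if (pos != std::string::npos) {
        weight_filename.replace(pos, std::string(".o_proj").length(), "");
      }
      std::cout << weight_filename << std::endl; // prints "layers.0.self_attn"
      return 0;
    }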