
Commit 55927a3

xinhaoc committed Jul 23, 2023
Parent: c268e9e
Showing 12 changed files with 231 additions and 117 deletions.
1 change: 1 addition & 0 deletions examples/cpp/inference/mixture_of_experts/moe.cc
@@ -78,6 +78,7 @@ Tensor create_moe_encoder(FFModel *model,
      x,
      moeConfig->hidden_size,
      moeConfig->num_attention_heads,
+     moeConfig->num_attention_heads,
      moeConfig->attention_kdim,
      moeConfig->attention_vdim)
      : model->multihead_attention(x,
1 change: 1 addition & 0 deletions examples/cpp/inference/transformers/transformers.cc
@@ -46,6 +46,7 @@ Tensor create_inc_multihead_attention_decoder(
      input,
      transformerConfig->hidden_size,
      transformerConfig->num_attention_heads,
+     transformerConfig->num_attention_heads,
      transformerConfig->attention_kdim,
      transformerConfig->attention_vdim)
      : model->multihead_attention(input,
@@ -23,6 +23,7 @@ template <typename DT>
      __global__ void apply_proj_bias_w(DT *input_ptr,
      DT const *bias_ptr,
      int num_tokens,
+     int qkv_weight_size,
      int oProjSize);

      template <typename DT>
@@ -34,6 +35,7 @@ __global__ void apply_proj_bias_qkv(DT *input_ptr,
      int kProjSize,
      int vProjSize,
      int num_heads,
+     int num_kv_heads,
      bool scaling_query,
      float scaling_factor);

@@ -46,9 +48,10 @@ __global__ void
      int kProjSize,
      int num_heads,
      int num_tokens,
+     int num_kv_heads,
      int q_block_size,
      int k_block_size,
      int v_block_size,
      int q_array_size,
      bool q_tensor);

      template <typename DT>
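These kernel declarations now take a separate K/V head count, and apply_proj_bias_w receives an explicit packed QKV size instead of deriving it from a single head count. The snippet below is not the FlexFlow kernel code; it is a minimal sketch, assuming the common grouped-attention packing (all query rows first, then K and V rows for the smaller K/V head count), of how a qkv_weight_size-style value could be computed from both head counts. All names in it are illustrative.

// Illustrative helper only; layout and names are assumptions, not the
// actual FlexFlow kernel implementation.
struct QkvSizes {
  int q_size;   // qProjSize * num_q_heads
  int kv_size;  // (kProjSize + vProjSize) * num_kv_heads
  int total;    // candidate value for a qkv_weight_size-style parameter
};

inline QkvSizes packed_qkv_sizes(int qProjSize, int kProjSize, int vProjSize,
                                 int num_q_heads, int num_kv_heads) {
  QkvSizes s;
  s.q_size = qProjSize * num_q_heads;                 // query projections keep every head
  s.kv_size = (kProjSize + vProjSize) * num_kv_heads; // K/V shrink to the shared heads
  s.total = s.q_size + s.kv_size;
  return s;
}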
14 changes: 10 additions & 4 deletions inference/file_loader.cc
@@ -25,10 +25,11 @@ using namespace Legion;
      FileDataLoader::FileDataLoader(std::string _input_path,
      std::string _weight_file_path,
      int _num_heads,
+     int _num_kv_heads,
      size_t _hidden_dim,
      size_t _qkv_inner_dim)
      : input_path(_input_path), weight_file_path(_weight_file_path),
-     num_heads(_num_heads), hidden_dim(_hidden_dim),
+     num_heads(_num_heads), num_kv_heads(_num_kv_heads), hidden_dim(_hidden_dim),
      qkv_inner_dim(_qkv_inner_dim){};

      BatchConfig::TokenId *FileDataLoader::generate_requests(int num, int length) {
@@ -279,6 +280,7 @@ void load_attention_weights_multi_query(DT *ptr,
      template <typename DT>
      void load_attention_bias_v2(DT *ptr,
      int num_heads,
+     int num_kv_heads,
      size_t hidden_dim,
      size_t qkv_inner_dim,
      std::string layer_name,
@@ -298,8 +300,10 @@
      std::vector<std::string> bias_files = {q_file, k_file, v_file, o_file};

      int file_index = 0;
+
      for (auto file : bias_files) {
-     size_t qkv_partial_size = qkv_inner_dim * num_heads;
+     int n_heads = file_index == 0 ? num_heads : num_kv_heads;
+     size_t qkv_partial_size = qkv_inner_dim * n_heads;
      size_t out_partial_size = hidden_dim;
      size_t partial_size =
      (file_index < 3) ? qkv_partial_size : out_partial_size;
@@ -785,16 +789,18 @@ void FileDataLoader::load_single_weight_tensor(FFModel *ff,

      if (file_path.find("attention_w") != std::string::npos) {
      if (weight_idx == 0) {
-     load_attention_weights(data,
+     load_attention_weights_v2(data,
      num_heads,
+     num_kv_heads,
      hidden_dim,
      qkv_inner_dim,
      file_path,
      weight_file_path,
      volume);
      } else {
-     load_attention_bias(data,
+     load_attention_bias_v2(data,
      num_heads,
+     num_kv_heads,
      hidden_dim,
      qkv_inner_dim,
      file_path,
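The behavioral core of the loader change is the per-file head count in load_attention_bias_v2: the Q bias still spans num_heads, while the K and V biases span num_kv_heads. The following standalone sketch mirrors that size selection; the concrete numbers in main are illustrative, not taken from any particular model.

#include <cstddef>
#include <cstdio>

// Mirrors the selection in the hunk above: file_index 0 is the Q bias,
// 1 and 2 are the K and V biases, 3 is the output-projection bias.
static size_t bias_partial_size(int file_index, int num_heads,
                                int num_kv_heads, size_t hidden_dim,
                                size_t qkv_inner_dim) {
  int n_heads = file_index == 0 ? num_heads : num_kv_heads;
  size_t qkv_partial_size = qkv_inner_dim * n_heads;
  size_t out_partial_size = hidden_dim;
  return (file_index < 3) ? qkv_partial_size : out_partial_size;
}

int main() {
  // Illustrative configuration: 32 query heads sharing a single K/V head.
  int num_heads = 32, num_kv_heads = 1;
  size_t hidden_dim = 2048, qkv_inner_dim = 64;
  for (int file_index = 0; file_index < 4; file_index++) {
    std::printf("file %d -> %zu elements\n", file_index,
                bias_partial_size(file_index, num_heads, num_kv_heads,
                                  hidden_dim, qkv_inner_dim));
  }
  // Prints 2048, 64, 64, 2048: the K and V biases shrink to one head's worth.
  return 0;
}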
3 changes: 2 additions & 1 deletion inference/file_loader.h
@@ -27,6 +27,7 @@ class FileDataLoader {
      FileDataLoader(std::string _input_path,
      std::string _weight_file_path,
      int _num_heads,
+     int _num_kv_heads,
      size_t _hidden_dim,
      size_t _qkv_inner_dim);

@@ -54,7 +55,7 @@
      int offset);

      private:
-     int num_heads;
+     int num_heads, num_kv_heads;
      size_t hidden_dim, qkv_inner_dim;
      std::string input_path;
      std::string weight_file_path;
1 change: 1 addition & 0 deletions inference/models/falcon.cc
@@ -145,6 +145,7 @@ void FALCON::create_falcon_model(FFModel &ff,
      FileDataLoader fileloader("",
      weight_file_path,
      falcon_config.n_heads,
+     1,
      falcon_config.dim,
      falcon_config.dim / falcon_config.n_heads);
      fileloader.load_weights(&ff, weights_layers, use_full_precision);
2 changes: 2 additions & 0 deletions inference/models/llama.cc
@@ -148,6 +148,7 @@ void LLAMA::create_llama_model(FFModel &ff,
      att_norm,
      llama_config.dim,
      llama_config.n_heads,
+     llama_config.n_heads,
      llama_config.dim / llama_config.n_heads,
      llama_config.dim / llama_config.n_heads,
      0.0f, /*dropout*/
@@ -227,6 +228,7 @@ void LLAMA::create_llama_model(FFModel &ff,
      FileDataLoader fileloader("",
      weight_file_path,
      llama_config.n_heads,
+     llama_config.n_heads,
      llama_config.dim,
      llama_config.dim / llama_config.n_heads);
      fileloader.load_weights(&ff, weights_layers, use_full_precision);
2 changes: 2 additions & 0 deletions inference/models/opt.cc
@@ -170,6 +170,7 @@ void OPT::create_opt_model(FFModel &ff,
      hidden_states,
      opt_config.hidden_size,
      opt_config.num_attention_heads,
+     opt_config.num_attention_heads,
      opt_config.hidden_size / opt_config.num_attention_heads,
      opt_config.hidden_size / opt_config.num_attention_heads,
      0.0f,
@@ -244,6 +245,7 @@ void OPT::create_opt_model(FFModel &ff,
      FileDataLoader fileloader("",
      weight_file_path,
      opt_config.num_attention_heads,
+     opt_config.num_attention_heads,
      opt_config.hidden_size,
      opt_config.hidden_size /
      opt_config.num_attention_heads);
1 change: 1 addition & 0 deletions src/c/flexflow_c.cc
@@ -1022,6 +1022,7 @@ flexflow_tensor_t flexflow_model_add_inc_multihead_attention(
      Tensor tensor = handle->inc_multihead_self_attention(input,
      embed_dim,
      num_heads,
+     num_heads,
      kdim,
      vdim,
      dropout,
(Diffs for the remaining changed files are not shown here.)
