llama3.1 support
sfc-gh-goliaro committed Sep 29, 2024
1 parent fbac32e commit 22aebb3
Showing 41 changed files with 1,042 additions and 529 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -193,3 +193,5 @@ lora_training_logs
Untitled-1.ipynb
Untitled-2.ipynb
tests/inference/python_test_configs/*.json

core.*
36 changes: 36 additions & 0 deletions include/flexflow/flexflow_c.h
@@ -451,6 +451,12 @@ flexflow_tensor_t flexflow_model_add_inc_multihead_self_attention(
enum DataType data_type,
flexflow_initializer_t kernel_initializer_,
bool apply_rotary_embedding,
float rope_theta,
char const *rope_type,
float rope_factor,
float low_freq_factor,
float high_freq_factor,
int original_max_position_embeddings,
bool scaling_query,
float scaling_factor,
bool qk_prod_scaling,
@@ -471,6 +477,12 @@ flexflow_tensor_t flexflow_model_add_spec_inc_multihead_self_attention(
enum DataType data_type,
flexflow_initializer_t kernel_initializer_,
bool apply_rotary_embedding,
float rope_theta,
char const *rope_type,
float rope_factor,
float low_freq_factor,
float high_freq_factor,
int original_max_position_embeddings,
bool scaling_query,
float scaling_factor,
bool qk_prod_scaling,
@@ -491,6 +503,12 @@ flexflow_tensor_t flexflow_model_add_inc_multihead_self_attention_verify(
enum DataType data_type,
flexflow_initializer_t kernel_initializer_,
bool apply_rotary_embedding,
float rope_theta,
char const *rope_type,
float rope_factor,
float low_freq_factor,
float high_freq_factor,
int original_max_position_embeddings,
bool scaling_query,
float scaling_factor,
bool qk_prod_scaling,
@@ -512,6 +530,12 @@ flexflow_tensor_t flexflow_model_add_inc_multiquery_self_attention(
enum DataType data_type,
flexflow_initializer_t kernel_initializer_,
bool apply_rotary_embedding,
float rope_theta,
char const *rope_type,
float rope_factor,
float low_freq_factor,
float high_freq_factor,
int original_max_position_embeddings,
bool scaling_query,
float scaling_factor,
bool qk_prod_scaling,
@@ -533,6 +557,12 @@ flexflow_tensor_t flexflow_model_add_spec_inc_multiquery_self_attention(
enum DataType data_type,
flexflow_initializer_t kernel_initializer_,
bool apply_rotary_embedding,
float rope_theta,
char const *rope_type,
float rope_factor,
float low_freq_factor,
float high_freq_factor,
int original_max_position_embeddings,
bool scaling_query,
float scaling_factor,
bool qk_prod_scaling,
@@ -554,6 +584,12 @@ flexflow_tensor_t flexflow_model_add_inc_multiquery_self_attention_verify(
enum DataType data_type,
flexflow_initializer_t kernel_initializer_,
bool apply_rotary_embedding,
float rope_theta,
char const *rope_type,
float rope_factor,
float low_freq_factor,
float high_freq_factor,
int original_max_position_embeddings,
bool scaling_query,
float scaling_factor,
bool qk_prod_scaling,
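Each of the six new C-API parameters mirrors a field of the rope_scaling block in a Hugging Face model config, which is how a Llama 3.1 checkpoint describes its RoPE settings. As a hedged illustration only (the full signatures are elided in these hunks, so no complete call is shown here), the values below are the ones published in the Llama 3.1 HuggingFace config and are what a caller would plausibly pass for the new arguments:

// Illustrative values from the published Llama 3.1 HuggingFace config
// (an assumption for illustration, not values taken from this commit).
// They map one-to-one onto the new parameters added above.
float rope_theta = 500000.0f;
char const *rope_type = "llama3";
float rope_factor = 8.0f;
float low_freq_factor = 1.0f;
float high_freq_factor = 4.0f;
int original_max_position_embeddings = 8192;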
39 changes: 37 additions & 2 deletions include/flexflow/inference.h
@@ -43,8 +43,43 @@ struct GenerationResult {
std::vector<float> finetuning_losses;
};

#include <string>
#include <vector>
struct RotaryEmbeddingMeta {
bool apply_rotary_embedding = false;
float rope_theta = 10000.0f;
std::string rope_type = "default";
float factor = 8.0f;
float low_freq_factor = 1.0f;
float high_freq_factor = 4.0f;
int original_max_position_embeddings = 8192;

RotaryEmbeddingMeta(bool apply_rotary_embedding_ = false,
float rope_theta_ = 10000.0f,
std::string rope_type_ = "default",
float factor_ = 8.0f,
float low_freq_factor_ = 1.0f,
float high_freq_factor_ = 4.0f,
int original_max_position_embeddings_ = 8192)
: apply_rotary_embedding(apply_rotary_embedding_),
rope_theta(rope_theta_), rope_type(rope_type_), factor(factor_),
low_freq_factor(low_freq_factor_), high_freq_factor(high_freq_factor_),
original_max_position_embeddings(original_max_position_embeddings_) {}

friend std::ostream &operator<<(std::ostream &os,
RotaryEmbeddingMeta const &meta) {
os << std::boolalpha // To print bool as true/false instead of 1/0
<< "RotaryEmbeddingMeta {\n"
<< " apply_rotary_embedding: " << meta.apply_rotary_embedding << ",\n"
<< " rope_theta: " << meta.rope_theta << ",\n"
<< " rope_type: \"" << meta.rope_type << "\",\n"
<< " factor: " << meta.factor << ",\n"
<< " low_freq_factor: " << meta.low_freq_factor << ",\n"
<< " high_freq_factor: " << meta.high_freq_factor << ",\n"
<< " original_max_position_embeddings: "
<< meta.original_max_position_embeddings << "\n"
<< "}";
return os;
}
};
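The struct's defaults already encode the Llama 3.1 scaling parameters (factor 8.0, low/high frequency factors 1.0 and 4.0, 8192 original positions). A minimal sketch of constructing and printing it, assuming the FlexFlow namespace is in scope; the rope_theta and rope_type values come from the published Llama 3.1 config, not from this commit:

#include <iostream>
#include "flexflow/inference.h"  // declares RotaryEmbeddingMeta

int main() {
  using namespace FlexFlow;  // assumption: the struct lives in this namespace
  RotaryEmbeddingMeta meta(/*apply_rotary_embedding=*/true,
                           /*rope_theta=*/500000.0f,
                           /*rope_type=*/"llama3",
                           /*factor=*/8.0f,
                           /*low_freq_factor=*/1.0f,
                           /*high_freq_factor=*/4.0f,
                           /*original_max_position_embeddings=*/8192);
  std::cout << meta << std::endl;  // uses the operator<< defined above
  return 0;
}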

std::string join_path(std::vector<std::string> const &paths);

3 changes: 3 additions & 0 deletions include/flexflow/layer.h
@@ -32,11 +32,13 @@ class Layer {
void add_float_property(std::string const &key, float value);
void add_int_vector_property(std::string const &key,
std::vector<int> const &value);
void add_string_property(std::string const &key, std::string const &value);
void add_initializer(std::string const &key, Initializer *initializer);
bool get_int_property(std::string const &key, long long &value) const;
bool get_float_property(std::string const &key, float &value) const;
bool get_int_vector_property(std::string const &key,
std::vector<int> &value) const;
bool get_string_property(std::string const &key, std::string &value) const;
bool get_initializer(std::string const &key, Initializer *&initializer) const;
Tensor get_parameter(int index);
void print();
@@ -59,6 +61,7 @@ class Layer {
std::unordered_map<std::string, float> float_properties;
std::unordered_map<std::string, Initializer *> initializers;
std::unordered_map<std::string, std::vector<int>> int_vector_properties;
std::unordered_map<std::string, std::string> string_properties;
};

}; // namespace FlexFlow
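The new string property lets a layer carry the textual rope_type alongside the numeric RoPE fields when the operator is later reconstructed from it. A hedged usage sketch only; the "rope_type" key and the `li` / `rotary_embedding_meta` names are illustrative assumptions, not identifiers confirmed by this hunk:

// When the attention layer is created (li is a Layer*):
li->add_string_property("rope_type", rotary_embedding_meta.rope_type);

// When the operator is materialized from the layer:
std::string rope_type;
if (li->get_string_property("rope_type", rope_type)) {  // false if the key is absent
  rotary_embedding_meta.rope_type = rope_type;
}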
150 changes: 76 additions & 74 deletions include/flexflow/model.h
@@ -733,41 +733,42 @@ class FFModel {
DataType data_type = DT_NONE,
Initializer *kernel_initializer = NULL,
char const *name = NULL);
Tensor inc_multihead_self_attention(const Tensor input,
int embed_dim,
int num_heads,
int kdim = 0,
int vdim = 0,
float dropout = 0.0f,
bool bias = false,
bool add_bias_kv = false,
bool add_zero_attn = false,
DataType data_type = DT_NONE,
Initializer *kernel_initializer = NULL,
bool apply_rotary_embedding = false,
bool scaling_query = false,
float scaling_factor = 1.0f,
bool qk_prod_scaling = true,
bool position_bias = false,
char const *name = NULL);
Tensor
spec_inc_multihead_self_attention(const Tensor input,
int embed_dim,
int num_heads,
int kdim = 0,
int vdim = 0,
float dropout = 0.0f,
bool bias = false,
bool add_bias_kv = false,
bool add_zero_attn = false,
DataType data_type = DT_NONE,
Initializer *kernel_initializer = NULL,
bool apply_rotary_embedding = false,
bool scaling_query = false,
float scaling_factor = 1.0f,
bool qk_prod_scaling = true,
bool position_bias = false,
char const *name = NULL);
Tensor inc_multihead_self_attention(
const Tensor input,
int embed_dim,
int num_heads,
int kdim = 0,
int vdim = 0,
float dropout = 0.0f,
bool bias = false,
bool add_bias_kv = false,
bool add_zero_attn = false,
DataType data_type = DT_NONE,
Initializer *kernel_initializer = NULL,
RotaryEmbeddingMeta rotary_embedding_meta = RotaryEmbeddingMeta(),
bool scaling_query = false,
float scaling_factor = 1.0f,
bool qk_prod_scaling = true,
bool position_bias = false,
char const *name = NULL);
Tensor spec_inc_multihead_self_attention(
const Tensor input,
int embed_dim,
int num_heads,
int kdim = 0,
int vdim = 0,
float dropout = 0.0f,
bool bias = false,
bool add_bias_kv = false,
bool add_zero_attn = false,
DataType data_type = DT_NONE,
Initializer *kernel_initializer = NULL,
RotaryEmbeddingMeta rotary_embedding_meta = RotaryEmbeddingMeta(),
bool scaling_query = false,
float scaling_factor = 1.0f,
bool qk_prod_scaling = true,
bool position_bias = false,
char const *name = NULL);
Tensor inc_multihead_self_attention_verify(
const Tensor input,
int embed_dim,
@@ -780,49 +781,50 @@ class FFModel {
bool add_zero_attn = false,
DataType data_type = DT_NONE,
Initializer *kernel_initializer = NULL,
bool apply_rotary_embedding = false,
RotaryEmbeddingMeta rotary_embedding_meta = RotaryEmbeddingMeta(),
bool scaling_query = false,
float scaling_factor = 1.0f,
bool qk_prod_scaling = true,
bool position_bias = false,
char const *name = NULL);
Tensor inc_multiquery_self_attention(
const Tensor input,
int embed_dim,
int num_q_heads,
int num_kv_heads,
int kdim = 0,
int vdim = 0,
float dropout = 0.0f,
bool bias = false,
bool add_bias_kv = false,
bool add_zero_attn = false,
DataType data_type = DT_NONE,
Initializer *kernel_initializer = NULL,
RotaryEmbeddingMeta rotary_embedding_meta = RotaryEmbeddingMeta(),
bool scaling_query = false,
float scaling_factor = 1.0f,
bool qk_prod_scaling = true,
bool position_bias = false,
char const *name = NULL);
Tensor spec_inc_multiquery_self_attention(
const Tensor input,
int embed_dim,
int num_q_heads,
int num_kv_heads,
int kdim = 0,
int vdim = 0,
float dropout = 0.0f,
bool bias = false,
bool add_bias_kv = false,
bool add_zero_attn = false,
DataType data_type = DT_NONE,
Initializer *kernel_initializer = NULL,
RotaryEmbeddingMeta rotary_embedding_meta = RotaryEmbeddingMeta(),
bool scaling_query = false,
float scaling_factor = 1.0f,
bool qk_prod_scaling = true,
bool position_bias = false,
char const *name = NULL);
Tensor inc_multiquery_self_attention(const Tensor input,
int embed_dim,
int num_q_heads,
int num_kv_heads,
int kdim = 0,
int vdim = 0,
float dropout = 0.0f,
bool bias = false,
bool add_bias_kv = false,
bool add_zero_attn = false,
DataType data_type = DT_NONE,
Initializer *kernel_initializer = NULL,
bool apply_rotary_embedding = false,
bool scaling_query = false,
float scaling_factor = 1.0f,
bool qk_prod_scaling = true,
bool position_bias = false,
char const *name = NULL);
Tensor
spec_inc_multiquery_self_attention(const Tensor input,
int embed_dim,
int num_q_heads,
int num_kv_heads,
int kdim = 0,
int vdim = 0,
float dropout = 0.0f,
bool bias = false,
bool add_bias_kv = false,
bool add_zero_attn = false,
DataType data_type = DT_NONE,
Initializer *kernel_initializer = NULL,
bool apply_rotary_embedding = false,
bool scaling_query = false,
float scaling_factor = 1.0f,
bool qk_prod_scaling = true,
bool position_bias = false,
char const *name = NULL);
Tensor inc_multiquery_self_attention_verify(
const Tensor input,
int embed_dim,
@@ -836,7 +838,7 @@ class FFModel {
bool add_zero_attn = false,
DataType data_type = DT_NONE,
Initializer *kernel_initializer = NULL,
bool apply_rotary_embedding = false,
RotaryEmbeddingMeta rotary_embedding_meta = RotaryEmbeddingMeta(),
bool scaling_query = false,
float scaling_factor = 1.0f,
bool qk_prod_scaling = true,
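The RotaryEmbeddingMeta parameter replaces the old apply_rotary_embedding flag across the attention constructors. A hedged sketch of a caller wiring up a Llama-3.1-style grouped-query attention layer; `ff` and `hidden` stand for an existing FFModel and input Tensor, and the 4096/32/8 dimensions are the Llama 3.1 8B values, used here only for illustration:

// Build the RoPE description (values from the Llama 3.1 config; assumption).
RotaryEmbeddingMeta rope(/*apply_rotary_embedding=*/true,
                         /*rope_theta=*/500000.0f,
                         /*rope_type=*/"llama3",
                         /*factor=*/8.0f,
                         /*low_freq_factor=*/1.0f,
                         /*high_freq_factor=*/4.0f,
                         /*original_max_position_embeddings=*/8192);
// Pass it through the updated signature declared above.
Tensor attn = ff.inc_multiquery_self_attention(hidden,
                                               /*embed_dim=*/4096,
                                               /*num_q_heads=*/32,
                                               /*num_kv_heads=*/8,
                                               /*kdim=*/0,
                                               /*vdim=*/0,
                                               /*dropout=*/0.0f,
                                               /*bias=*/false,
                                               /*add_bias_kv=*/false,
                                               /*add_zero_attn=*/false,
                                               DT_NONE,
                                               /*kernel_initializer=*/nullptr,
                                               rope);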
8 changes: 7 additions & 1 deletion include/flexflow/operator.h
@@ -335,7 +335,13 @@ class Op {
// only dump the weights in the forward pass, at the first step
// note that we do not save the weight gradients, since we only support
// finetuning LoRA weights, which are not FF tensors.
if (fwd_pass && m->decoding_step == 0) {
// Set FF_DEBG_NO_WEIGHTS=1 or FF_DEBG_NO_WEIGHTS=true to disable saving
// weights
bool do_not_save_weights =
(std::getenv("FF_DEBG_NO_WEIGHTS") &&
(std::string(std::getenv("FF_DEBG_NO_WEIGHTS")) == "1" ||
std::string(std::getenv("FF_DEBG_NO_WEIGHTS")) == "true"));
if (fwd_pass && m->decoding_step == 0 && !do_not_save_weights) {
fs::path dst_filepath_weights =
get_dst_folder("weights", m->decoding_step, shard_id, before_kernel) /
layername;
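With this guard in place, the decoding-step-0 weight dump can be switched off from the environment. A usage sketch, assuming a test driver that configures the process before the model runs (setenv is POSIX, from <cstdlib>; the driver itself is illustrative, not a script from this repository):

#include <cstdlib>

// Inside the driver's main(), before launching the model:
// disable the weight dump; the value "true" would work as well.
setenv("FF_DEBG_NO_WEIGHTS", "1", /*overwrite=*/1);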
12 changes: 6 additions & 6 deletions include/flexflow/ops/inc_multihead_self_attention.h
@@ -39,7 +39,7 @@ class IncMultiHeadSelfAttention : public Op {
bool _qkv_bias,
bool _final_bias,
bool _add_zero_attn,
bool _apply_rotary_embedding,
RotaryEmbeddingMeta _rotary_embedding_meta,
bool _scaling_query,
float _scaling_factor,
bool _qk_prod_scaling,
@@ -61,7 +61,7 @@ class IncMultiHeadSelfAttention : public Op {
bool _qkv_bias,
bool _final_bias,
bool _add_zero_attn,
bool _apply_rotary_embedding,
RotaryEmbeddingMeta _rotary_embedding_meta,
bool _scaling_query,
float _scaling_factor,
bool _qk_prod_scaling,
@@ -138,8 +138,8 @@
int num_q_heads, num_kv_heads, tensor_parallelism_degree;
float dropout, scaling_factor;
bool qkv_bias;
bool final_bias, add_zero_attn, apply_rotary_embedding, scaling_query,
qk_prod_scaling, position_bias;
bool final_bias, add_zero_attn, scaling_query, qk_prod_scaling, position_bias;
RotaryEmbeddingMeta rotary_embedding_meta;
int qSize, kSize, vSize, qProjSize, kProjSize, vProjSize, oProjSize;
int qoSeqLength, kvSeqLength;
DataType quantization_type;
@@ -165,7 +165,7 @@ class IncMultiHeadSelfAttentionMeta : public OpMeta {
int _kProjSize,
int _vProjSize,
int _oProjSize,
bool _apply_rotary_embedding,
RotaryEmbeddingMeta _rotary_embedding_meta,
bool _qkv_bias,
bool _scaling_query,
bool _qk_prod_scaling,
@@ -191,7 +191,7 @@ class IncMultiHeadSelfAttentionMeta : public OpMeta {
int global_num_q_heads, global_num_kv_heads, num_q_heads, num_kv_heads,
hidden_size;
bool *has_load_weights;
bool *apply_rotary_embedding;
RotaryEmbeddingMeta *rotary_embedding_meta;
bool *qkv_bias;
bool *final_bias;
bool *scaling_query;