Attention projections (QKV, O) disaggregation (#1436)
* merged attn-qkv-proj into peft.
commented out some alignment tests, but they should be equivalent to the original ones.

* restored and passed the alignment test

* linting

* rebased onto inference

* Bug fixes, uploaded missing cpp implementation

* Code cleanup

* clean up

* fixed problem with mpt.

* update

* llama3.1 support

* fix

* support llama3.2

* fix opt bias?

* opt alignment test stub

* fix bias

* update

* fix non-fusion opt

* update

* fix

* cleanup

* delete file

* cleanup

* shellcheck

* hip cleanup

* fix

* hip fixes

---------

Co-authored-by: Gabriele Oliaro <goliaro@cs.cmu.edu>
Co-authored-by: zhihao <email>
Co-authored-by: Gabriele Oliaro <gabriele.oliaro@snowflake.com>
3 people authored Oct 9, 2024
1 parent ca3dabf commit 96628b3
Showing 67 changed files with 5,146 additions and 5,895 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -193,3 +193,6 @@ lora_training_logs
 Untitled-1.ipynb
 Untitled-2.ipynb
 tests/inference/python_test_configs/*.json
+
+core.*
+fine_grained_alignment_config.json
6 changes: 0 additions & 6 deletions examples/python/native/ops/inc_multihead_self_attention.py
@@ -11,8 +11,6 @@ def test_inc_multihead_self_attention(
 kdim: int = 0,
 vdim: int = 0,
 dropout: float = 0.0,
-bias: bool = True,
-add_bias_kv: bool = False,
 add_zero_attn: bool = False,
 data_type: DataType = DataType.DT_NONE,
 kernel_initializer=None,
@@ -34,8 +32,6 @@ def test_inc_multihead_self_attention(
 kdim=kdim,
 vdim=vdim,
 dropout=dropout,
-bias=bias,
-add_bias_kv=add_bias_kv,
 add_zero_attn=add_zero_attn,
 data_type=data_type,
 kernel_initializer=kernel_initializer,
@@ -85,8 +81,6 @@ def test_inc_multihead_self_attention(
 kdim=0, # Example value for kdim
 vdim=0, # Example value for vdim
 dropout=0.1, # Example value for dropout
-bias=True,
-add_bias_kv=False,
 add_zero_attn=False,
 data_type=DataType.DT_FLOAT,
 kernel_initializer=None, # Example value for kernel_initializer
6 changes: 0 additions & 6 deletions examples/python/native/ops/inc_multihead_self_attention_verify.py
@@ -11,8 +11,6 @@ def test_inc_multihead_self_attention_verify(
 kdim: int = 0,
 vdim: int = 0,
 dropout: float = 0.0,
-bias: bool = True,
-add_bias_kv: bool = False,
 add_zero_attn: bool = False,
 data_type: DataType = DataType.DT_NONE,
 kernel_initializer=None,
@@ -34,8 +32,6 @@ def test_inc_multihead_self_attention_verify(
 kdim=kdim,
 vdim=vdim,
 dropout=dropout,
-bias=bias,
-add_bias_kv=add_bias_kv,
 add_zero_attn=add_zero_attn,
 data_type=data_type,
 kernel_initializer=kernel_initializer,
@@ -85,8 +81,6 @@ def test_inc_multihead_self_attention_verify(
 kdim=0, # Example value for kdim
 vdim=0, # Example value for vdim
 dropout=0.1, # Example value for dropout
-bias=True,
-add_bias_kv=False,
 add_zero_attn=False,
 data_type=DataType.DT_FLOAT,
 kernel_initializer=None, # Example value for kernel_initializer
6 changes: 0 additions & 6 deletions examples/python/native/ops/inc_multiquery_self_attention.py
@@ -12,8 +12,6 @@ def test_inc_multiquery_self_attention(
 kdim: int = 0,
 vdim: int = 0,
 dropout: float = 0.0,
-bias: bool = True,
-add_bias_kv: bool = False,
 add_zero_attn: bool = False,
 data_type: DataType = DataType.DT_NONE,
 kernel_initializer=None,
@@ -36,8 +34,6 @@ def test_inc_multiquery_self_attention(
 kdim=kdim,
 vdim=vdim,
 dropout=dropout,
-bias=bias,
-add_bias_kv=add_bias_kv,
 add_zero_attn=add_zero_attn,
 data_type=data_type,
 kernel_initializer=kernel_initializer,
@@ -89,8 +85,6 @@ def test_inc_multiquery_self_attention(
 kdim=0, # Example value for kdim
 vdim=0, # Example value for vdim
 dropout=0.1, # Example value for dropout
-bias=True,
-add_bias_kv=False,
 add_zero_attn=False,
 data_type=DataType.DT_FLOAT,
 kernel_initializer=None, # Example value for kernel_initializer
6 changes: 0 additions & 6 deletions examples/python/native/ops/inc_multiquery_self_attention_verify.py
@@ -12,8 +12,6 @@ def test_inc_multiquery_self_attention_verify(
 kdim: int = 0,
 vdim: int = 0,
 dropout: float = 0.0,
-bias: bool = True,
-add_bias_kv: bool = False,
 add_zero_attn: bool = False,
 data_type: DataType = DataType.DT_NONE,
 kernel_initializer=None,
@@ -36,8 +34,6 @@ def test_inc_multiquery_self_attention_verify(
 kdim=kdim,
 vdim=vdim,
 dropout=dropout,
-bias=bias,
-add_bias_kv=add_bias_kv,
 add_zero_attn=add_zero_attn,
 data_type=data_type,
 kernel_initializer=kernel_initializer,
@@ -89,8 +85,6 @@ def test_inc_multiquery_self_attention_verify(
 kdim=0, # Example value for kdim
 vdim=0, # Example value for vdim
 dropout=0.1, # Example value for dropout
-bias=True,
-add_bias_kv=False,
 add_zero_attn=False,
 data_type=DataType.DT_FLOAT,
 kernel_initializer=None, # Example value for kernel_initializer
6 changes: 0 additions & 6 deletions examples/python/native/ops/spec_inc_multihead_self_attention.py
@@ -11,8 +11,6 @@ def test_spec_inc_multihead_self_attention(
 kdim: int = 0,
 vdim: int = 0,
 dropout: float = 0.0,
-bias: bool = True,
-add_bias_kv: bool = False,
 add_zero_attn: bool = False,
 data_type: DataType = DataType.DT_NONE,
 kernel_initializer=None,
@@ -34,8 +32,6 @@ def test_spec_inc_multihead_self_attention(
 kdim=kdim,
 vdim=vdim,
 dropout=dropout,
-bias=bias,
-add_bias_kv=add_bias_kv,
 add_zero_attn=add_zero_attn,
 data_type=data_type,
 kernel_initializer=kernel_initializer,
@@ -85,8 +81,6 @@ def test_spec_inc_multihead_self_attention(
 kdim=0, # Example value for kdim
 vdim=0, # Example value for vdim
 dropout=0.1, # Example value for dropout
-bias=True,
-add_bias_kv=False,
 add_zero_attn=False,
 data_type=DataType.DT_FLOAT,
 kernel_initializer=None, # Example value for kernel_initializer
6 changes: 0 additions & 6 deletions examples/python/native/ops/spec_inc_multiquery_self_attention.py
@@ -12,8 +12,6 @@ def test_spec_inc_multiquery_self_attention(
 kdim: int = 0,
 vdim: int = 0,
 dropout: float = 0.0,
-bias: bool = True,
-add_bias_kv: bool = False,
 add_zero_attn: bool = False,
 data_type: DataType = DataType.DT_NONE,
 kernel_initializer=None,
@@ -36,8 +34,6 @@ def test_spec_inc_multiquery_self_attention(
 kdim=kdim,
 vdim=vdim,
 dropout=dropout,
-bias=bias,
-add_bias_kv=add_bias_kv,
 add_zero_attn=add_zero_attn,
 data_type=data_type,
 kernel_initializer=kernel_initializer,
@@ -89,8 +85,6 @@ def test_spec_inc_multiquery_self_attention(
 kdim=0, # Example value for kdim
 vdim=0, # Example value for vdim
 dropout=0.1, # Example value for dropout
-bias=True,
-add_bias_kv=False,
 add_zero_attn=False,
 data_type=DataType.DT_FLOAT,
 kernel_initializer=None, # Example value for kernel_initializer
48 changes: 36 additions & 12 deletions include/flexflow/flexflow_c.h
@@ -445,12 +445,16 @@ flexflow_tensor_t flexflow_model_add_inc_multihead_self_attention(
 int kdim,
 int vdim,
 float dropout,
-bool bias,
-bool add_bias_kv,
 bool add_zero_attn,
 enum DataType data_type,
 flexflow_initializer_t kernel_initializer_,
 bool apply_rotary_embedding,
+float rope_theta,
+char const *rope_type,
+float rope_factor,
+float low_freq_factor,
+float high_freq_factor,
+int original_max_position_embeddings,
 bool scaling_query,
 float scaling_factor,
 bool qk_prod_scaling,
@@ -465,12 +469,16 @@ flexflow_tensor_t flexflow_model_add_spec_inc_multihead_self_attention(
 int kdim,
 int vdim,
 float dropout,
-bool bias,
-bool add_bias_kv,
 bool add_zero_attn,
 enum DataType data_type,
 flexflow_initializer_t kernel_initializer_,
 bool apply_rotary_embedding,
+float rope_theta,
+char const *rope_type,
+float rope_factor,
+float low_freq_factor,
+float high_freq_factor,
+int original_max_position_embeddings,
 bool scaling_query,
 float scaling_factor,
 bool qk_prod_scaling,
@@ -485,12 +493,16 @@ flexflow_tensor_t flexflow_model_add_inc_multihead_self_attention_verify(
 int kdim,
 int vdim,
 float dropout,
-bool bias,
-bool add_bias_kv,
 bool add_zero_attn,
 enum DataType data_type,
 flexflow_initializer_t kernel_initializer_,
 bool apply_rotary_embedding,
+float rope_theta,
+char const *rope_type,
+float rope_factor,
+float low_freq_factor,
+float high_freq_factor,
+int original_max_position_embeddings,
 bool scaling_query,
 float scaling_factor,
 bool qk_prod_scaling,
@@ -506,12 +518,16 @@ flexflow_tensor_t flexflow_model_add_inc_multiquery_self_attention(
 int kdim,
 int vdim,
 float dropout,
-bool bias,
-bool add_bias_kv,
 bool add_zero_attn,
 enum DataType data_type,
 flexflow_initializer_t kernel_initializer_,
 bool apply_rotary_embedding,
+float rope_theta,
+char const *rope_type,
+float rope_factor,
+float low_freq_factor,
+float high_freq_factor,
+int original_max_position_embeddings,
 bool scaling_query,
 float scaling_factor,
 bool qk_prod_scaling,
@@ -527,12 +543,16 @@ flexflow_tensor_t flexflow_model_add_spec_inc_multiquery_self_attention(
 int kdim,
 int vdim,
 float dropout,
-bool bias,
-bool add_bias_kv,
 bool add_zero_attn,
 enum DataType data_type,
 flexflow_initializer_t kernel_initializer_,
 bool apply_rotary_embedding,
+float rope_theta,
+char const *rope_type,
+float rope_factor,
+float low_freq_factor,
+float high_freq_factor,
+int original_max_position_embeddings,
 bool scaling_query,
 float scaling_factor,
 bool qk_prod_scaling,
@@ -548,12 +568,16 @@ flexflow_tensor_t flexflow_model_add_inc_multiquery_self_attention_verify(
 int kdim,
 int vdim,
 float dropout,
-bool bias,
-bool add_bias_kv,
 bool add_zero_attn,
 enum DataType data_type,
 flexflow_initializer_t kernel_initializer_,
 bool apply_rotary_embedding,
+float rope_theta,
+char const *rope_type,
+float rope_factor,
+float low_freq_factor,
+float high_freq_factor,
+int original_max_position_embeddings,
 bool scaling_query,
 float scaling_factor,
 bool qk_prod_scaling,
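For orientation, a minimal sketch of how the six new rope arguments in the C API above could be packed into the RotaryEmbeddingMeta struct introduced in include/flexflow/inference.h below. The helper name, and the assumption that the struct lives in the FlexFlow namespace, are illustrative and not taken from this diff.

#include <string>
#include "flexflow/inference.h" // declares RotaryEmbeddingMeta (see the inference.h diff below)

// Illustrative helper only: packs the C-API rope arguments into the struct.
FlexFlow::RotaryEmbeddingMeta pack_rope_args(bool apply_rotary_embedding,
                                             float rope_theta,
                                             char const *rope_type,
                                             float rope_factor,
                                             float low_freq_factor,
                                             float high_freq_factor,
                                             int original_max_position_embeddings) {
  return FlexFlow::RotaryEmbeddingMeta(apply_rotary_embedding,
                                       rope_theta,
                                       std::string(rope_type),
                                       rope_factor,
                                       low_freq_factor,
                                       high_freq_factor,
                                       original_max_position_embeddings);
}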
39 changes: 37 additions & 2 deletions include/flexflow/inference.h
@@ -43,8 +43,43 @@ struct GenerationResult {
 std::vector<float> finetuning_losses;
 };
 
-#include <string>
-#include <vector>
+struct RotaryEmbeddingMeta {
+bool apply_rotary_embedding = false;
+float rope_theta = 10000.0f;
+std::string rope_type = "default";
+float factor = 8.0f;
+float low_freq_factor = 1.0f;
+float high_freq_factor = 4.0f;
+int original_max_position_embeddings = 8192;
+
+RotaryEmbeddingMeta(bool apply_rotary_embedding_ = false,
+float rope_theta_ = 10000.0f,
+std::string rope_type_ = "default",
+float factor_ = 8.0f,
+float low_freq_factor_ = 1.0f,
+float high_freq_factor_ = 4.0f,
+int original_max_position_embeddings_ = 8192)
+: apply_rotary_embedding(apply_rotary_embedding_),
+rope_theta(rope_theta_), rope_type(rope_type_), factor(factor_),
+low_freq_factor(low_freq_factor_), high_freq_factor(high_freq_factor_),
+original_max_position_embeddings(original_max_position_embeddings_) {}
+
+friend std::ostream &operator<<(std::ostream &os,
+RotaryEmbeddingMeta const &meta) {
+os << std::boolalpha // To print bool as true/false instead of 1/0
+<< "RotaryEmbeddingMeta {\n"
+<< " apply_rotary_embedding: " << meta.apply_rotary_embedding << ",\n"
+<< " rope_theta: " << meta.rope_theta << ",\n"
+<< " rope_type: \"" << meta.rope_type << "\",\n"
+<< " factor: " << meta.factor << ",\n"
+<< " low_freq_factor: " << meta.low_freq_factor << ",\n"
+<< " high_freq_factor: " << meta.high_freq_factor << ",\n"
+<< " original_max_position_embeddings: "
+<< meta.original_max_position_embeddings << "\n"
+<< "}";
+return os;
+}
+};
 
 std::string join_path(std::vector<std::string> const &paths);
 
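As a quick sanity check of the new struct, a minimal sketch that constructs a RotaryEmbeddingMeta and prints it with the operator<< defined above. The Llama-3.1-style values are illustrative only, and the FlexFlow namespace qualification is an assumption based on the surrounding headers.

#include <iostream>
#include "flexflow/inference.h"

int main() {
  // Illustrative values (roughly Llama-3.1-style rope scaling); not taken from this diff.
  FlexFlow::RotaryEmbeddingMeta rope_meta(/*apply_rotary_embedding_=*/true,
                                          /*rope_theta_=*/500000.0f,
                                          /*rope_type_=*/"llama3",
                                          /*factor_=*/8.0f,
                                          /*low_freq_factor_=*/1.0f,
                                          /*high_freq_factor_=*/4.0f,
                                          /*original_max_position_embeddings_=*/8192);
  std::cout << rope_meta << std::endl; // prints the fields listed in the struct above
  return 0;
}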
3 changes: 3 additions & 0 deletions include/flexflow/layer.h
@@ -32,11 +32,13 @@ class Layer {
 void add_float_property(std::string const &key, float value);
 void add_int_vector_property(std::string const &key,
 std::vector<int> const &value);
+void add_string_property(std::string const &key, std::string const &value);
 void add_initializer(std::string const &key, Initializer *initializer);
 bool get_int_property(std::string const &key, long long &value) const;
 bool get_float_property(std::string const &key, float &value) const;
 bool get_int_vector_property(std::string const &key,
 std::vector<int> &value) const;
+bool get_string_property(std::string const &key, std::string &value) const;
 bool get_initializer(std::string const &key, Initializer *&initializer) const;
 Tensor get_parameter(int index);
 void print();
@@ -59,6 +61,7 @@ class Layer {
 std::unordered_map<std::string, float> float_properties;
 std::unordered_map<std::string, Initializer *> initializers;
 std::unordered_map<std::string, std::vector<int>> int_vector_properties;
+std::unordered_map<std::string, std::string> string_properties;
 };
 
 }; // namespace FlexFlow
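The new string-property accessors added above let per-layer string metadata (for example, a rope type) be attached to a Layer. A minimal usage sketch; the "rope_type" key and the helper functions are hypothetical, only the accessor signatures come from the diff above.

#include <string>
#include "flexflow/layer.h"

// Hypothetical helpers; not part of this commit.
void stash_rope_type(FlexFlow::Layer *layer, std::string const &rope_type) {
  layer->add_string_property("rope_type", rope_type);
}

std::string read_rope_type(FlexFlow::Layer const &layer) {
  std::string rope_type = "default";
  // get_string_property returns a bool; assumed to leave `rope_type` untouched when the key is absent.
  layer.get_string_property("rope_type", rope_type);
  return rope_type;
}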