Support Group Attention (Llama 2) (#883)
* n_kv_heads in inc_mha

* .

* .

* .

* .

* .

* fix

* fix

* tensor parallelism

* change weight layout

* tensor parallelism

* merge multiquery attention into inc_mha

* llama2 70B config

* spec infer change 1

* fix.

* spec infer.

* falcon spec infer.

* fix llama 70B

* fix

* fix & cleanup

* fix

* hip rocm

* issue 908

* clean debug code.

* format.

* remove multiquery. remove warning, fix python.

---------

Co-authored-by: goliaro <goliaro@cs.cmu.edu>
xinhaoc and goliaro authored Aug 3, 2023
1 parent ba91733 commit d1ef0ed
Showing 59 changed files with 1,412 additions and 3,617 deletions.
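For context: grouped-query (a.k.a. group) attention keeps the full set of query heads but lets groups of them share a smaller set of key/value heads, which is what the new num_kv_heads argument threaded through the diffs below controls. The snippet is a minimal, FlexFlow-independent sketch of that head grouping, assuming a Llama-2-70B-style layout of 64 query heads and 8 KV heads (values assumed here for illustration, not taken from this commit).

#include <cassert>
#include <cstdio>

// Illustrative sketch: each query head attends with the key/value head of its
// group; plain multi-head attention is num_kv_heads == num_heads, multi-query
// attention is num_kv_heads == 1.
int kv_head_for_query_head(int q_head, int num_heads, int num_kv_heads) {
  assert(num_heads % num_kv_heads == 0); // heads must form equal-sized groups
  int group_size = num_heads / num_kv_heads;
  return q_head / group_size;
}

int main() {
  int num_heads = 64, num_kv_heads = 8; // assumed Llama-2-70B-style layer
  for (int q = 0; q < num_heads; q++) {
    std::printf("query head %2d -> kv head %d\n", q,
                kv_head_for_query_head(q, num_heads, num_kv_heads));
  }
  return 0;
}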
1 change: 1 addition & 0 deletions examples/cpp/inference/mixture_of_experts/moe.cc
@@ -79,6 +79,7 @@ Tensor create_moe_encoder(FFModel *model,
x,
moeConfig->hidden_size,
moeConfig->num_attention_heads,
moeConfig->num_attention_heads,
moeConfig->attention_kdim,
moeConfig->attention_vdim)
: model->multihead_attention(x,
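Both C++ example programs (this one and transformers.cc just below) are updated the same way: the existing num_attention_heads value is passed a second time as the new num_kv_heads argument, which keeps them on standard multi-head attention. A small standalone sketch (not FlexFlow code) of how the two head counts relate:

#include <cstdio>
#include <string>

// Illustrative only: classify an attention configuration by its head counts.
std::string attention_kind(int num_heads, int num_kv_heads) {
  if (num_kv_heads == num_heads) {
    return "multi-head: every query head has its own KV head";
  }
  if (num_kv_heads == 1) {
    return "multi-query: all query heads share one KV head";
  }
  if (num_kv_heads > 0 && num_heads % num_kv_heads == 0) {
    return "grouped-query: query heads share KV heads in groups";
  }
  return "invalid: num_heads must be a positive multiple of num_kv_heads";
}

int main() {
  std::printf("%s\n", attention_kind(16, 16).c_str()); // same value twice, as in the updated calls above
  std::printf("%s\n", attention_kind(64, 8).c_str());  // Llama-2-70B-style layer (assumed)
  std::printf("%s\n", attention_kind(71, 1).c_str());  // Falcon-7B-style layer (assumed)
  return 0;
}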
1 change: 1 addition & 0 deletions examples/cpp/inference/transformers/transformers.cc
@@ -47,6 +47,7 @@ Tensor create_inc_multihead_attention_decoder(
input,
transformerConfig->hidden_size,
transformerConfig->num_attention_heads,
transformerConfig->num_attention_heads,
transformerConfig->attention_kdim,
transformerConfig->attention_vdim)
: model->multihead_attention(input,
9 changes: 7 additions & 2 deletions include/flexflow/ffconst.h
@@ -167,7 +167,6 @@ enum OperatorType {
OP_INC_MULTIHEAD_SELF_ATTENTION,
OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION,
OP_TREE_INC_MULTIHEAD_SELF_ATTENTION,
OP_INC_MULTIQUERY_SELF_ATTENTION,
OP_SAMPLING,
// Parallel Ops
OP_REPARTITION,
@@ -180,7 +179,13 @@
OP_INVALID,
};

enum ModelType { UNKNOWN = 3001, LLAMA = 3002, OPT = 3003, FALCON = 3004 };
enum ModelType {
UNKNOWN = 3001,
LLAMA = 3002,
LLAMA2 = 3003,
OPT = 3004,
FALCON = 3005
};

enum PMParameter {
PM_OP_TYPE, // AnyOp
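The ModelType enum is reflowed onto multiple lines and gains LLAMA2 = 3003; note that this shifts OPT from 3003 to 3004 and FALCON from 3004 to 3005, so raw integer values recorded by older builds would no longer line up. The helper below is purely hypothetical (not part of this commit) and only illustrates dispatching on the extended enum; it re-declares the enum so the sketch is self-contained.

#include <string>

enum ModelType { // mirrors the values above, repeated here only for a self-contained sketch
  UNKNOWN = 3001,
  LLAMA = 3002,
  LLAMA2 = 3003,
  OPT = 3004,
  FALCON = 3005
};

// Hypothetical helper: map a lowercase model-family name to the extended enum.
ModelType model_type_from_name(std::string const &name) {
  if (name == "llama") {
    return LLAMA;
  } else if (name == "llama2") {
    return LLAMA2;
  } else if (name == "opt") {
    return OPT;
  } else if (name == "falcon") {
    return FALCON;
  }
  return UNKNOWN;
}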
21 changes: 5 additions & 16 deletions include/flexflow/flexflow_c.h
@@ -401,6 +401,7 @@ flexflow_tensor_t flexflow_model_add_inc_multihead_attention(
const flexflow_tensor_t input_,
int embed_dim,
int num_heads,
int num_kv_heads,
int kdim,
int vdim,
float dropout,
@@ -420,6 +421,7 @@ flexflow_tensor_t flexflow_model_add_spec_inc_multihead_attention(
const flexflow_tensor_t input_,
int embed_dim,
int num_heads,
int num_kv_heads,
int kdim,
int vdim,
float dropout,
@@ -439,6 +441,7 @@ flexflow_tensor_t flexflow_model_add_inc_multihead_self_attention_verify(
const flexflow_tensor_t input_,
int embed_dim,
int num_heads,
int num_kv_heads,
int kdim,
int vdim,
float dropout,
@@ -453,21 +456,6 @@ flexflow_tensor_t flexflow_model_add_inc_multihead_self_attention_verify(
bool qk_prod_scaling,
char const *name);

flexflow_tensor_t flexflow_model_add_inc_multiquery_self_attention(
flexflow_model_t handle_,
const flexflow_tensor_t input_,
int embed_dim,
int num_heads,
int kdim,
int vdim,
float dropout,
bool bias,
bool add_bias_kv,
bool add_zero_attn,
enum DataType data_type,
flexflow_initializer_t kernel_initializer_,
char const *name);

flexflow_tensor_t flexflow_model_add_rms_norm(flexflow_model_t handle_,
const flexflow_tensor_t input_,
float eps,
@@ -901,7 +889,8 @@ flexflow_file_data_loader_t
flexflow_file_data_loader_create(char const *weight_file_path,
int num_heads,
int hidden_dim,
int qkv_inner_dim);
int qkv_inner_dim,
int tensor_partition_num);

void flexflow_file_data_loader_destroy(flexflow_file_data_loader_t handle_);

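Besides adding num_kv_heads to the three C attention entry points and dropping the separate multiquery one, the C API's flexflow_file_data_loader_create now takes a tensor_partition_num, since with tensor parallelism the loader needs to know how many shards the attention weights are split across. The arithmetic below is only a guess at the kind of bookkeeping involved (not code from this commit): it assumes heads are divided evenly across partitions.

#include <cassert>
#include <cstdio>
#include <utility>

// Illustrative sketch: the contiguous head range a given tensor-parallel
// partition would own if heads are split evenly across tensor_partition_num.
std::pair<int, int> head_range_for_partition(int num_heads,
                                             int tensor_partition_num,
                                             int partition) {
  assert(num_heads % tensor_partition_num == 0);
  int per_part = num_heads / tensor_partition_num;
  return {partition * per_part, (partition + 1) * per_part}; // [first, last)
}

int main() {
  // Assumed example: 64 heads sharded across 4 partitions.
  for (int p = 0; p < 4; p++) {
    std::pair<int, int> r = head_range_for_partition(64, 4, p);
    std::printf("partition %d owns heads [%d, %d)\n", p, r.first, r.second);
  }
  return 0;
}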
21 changes: 3 additions & 18 deletions include/flexflow/model.h
@@ -156,8 +156,6 @@ enum TaskIDs {
INC_MULTIHEAD_SELF_ATTENTION_FWD_TASK_ID,
INC_MULTIHEAD_SELF_ATTENTION_BWD_TASK_ID,
INC_MULTIHEAD_SELF_ATTENTION_INF_TASK_ID,
INC_MULTIQUERY_SELF_ATTENTION_INIT_TASK_ID,
INC_MULTIQUERY_SELF_ATTENTION_INF_TASK_ID,
SPEC_INC_MULTIHEAD_SELF_ATTENTION_INIT_TASK_ID,
SPEC_INC_MULTIHEAD_SELF_ATTENTION_INF_TASK_ID,
TREE_INC_MULTIHEAD_SELF_ATTENTION_INIT_TASK_ID,
@@ -322,7 +320,6 @@ class Transpose;
class RMSNorm;
class BeamTopK;
class SpecIncMultiHeadSelfAttention;
class IncMultiQuerySelfAttention;
class Sampling;
class ArgMax;
class Combine;
@@ -644,6 +641,7 @@ class FFModel {
Tensor inc_multihead_self_attention(const Tensor input,
int embed_dim,
int num_heads,
int num_kv_heads,
int kdim = 0,
int vdim = 0,
float dropout = 0.0f,
Expand All @@ -657,22 +655,11 @@ class FFModel {
float scaling_factor = 1.0f,
bool qk_prod_scaling = true,
char const *name = NULL);
Tensor inc_multiquery_self_attention(const Tensor input,
int embed_dim,
int num_heads,
int kdim = 0,
int vdim = 0,
float dropout = 0.0f,
bool bias = false,
bool add_bias_kv = false,
bool add_zero_attn = false,
DataType data_type = DT_NONE,
Initializer *kernel_initializer = NULL,
char const *name = NULL);
Tensor
spec_inc_multihead_self_attention(const Tensor input,
int embed_dim,
int num_heads,
int num_kv_heads,
int kdim = 0,
int vdim = 0,
float dropout = 0.0f,
@@ -690,6 +677,7 @@
const Tensor input,
int embed_dim,
int num_heads,
int num_kv_heads,
int kdim = 0,
int vdim = 0,
float dropout = 0.0f,
@@ -1075,9 +1063,6 @@ class FFModel {
std::unordered_map<
std::pair<ParallelTensorShape, IncMultiHeadSelfAttentionParams>,
IncMultiHeadSelfAttention *>,
std::unordered_map<
std::pair<ParallelTensorShape, IncMultiQuerySelfAttentionParams>,
IncMultiQuerySelfAttention *>,
std::unordered_map<std::pair<ParallelTensorShape, BeamTopKParams>,
BeamTopK *>,
std::unordered_map<std::pair<ParallelTensorShape, SamplingParams>,
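With num_kv_heads added right after num_heads in the public FFModel methods, call sites choose between multi-head, grouped-query, and multi-query attention at graph-construction time. A hedged usage sketch, assuming a FlexFlow build with this header; the helper name, embed_dim, and head counts (Llama-2-70B-style: 8192, 64 query heads, 8 KV heads) are assumptions, and the remaining parameters fall back to the defaults declared above.

#include "flexflow/model.h"

using namespace FlexFlow;

// Sketch only: build one grouped-query incremental-decoding attention layer
// with the updated API; kdim/vdim/dropout/etc. keep their default values.
Tensor add_gqa_layer(FFModel &model, Tensor const x) {
  return model.inc_multihead_self_attention(
      x, /*embed_dim=*/8192, /*num_heads=*/64, /*num_kv_heads=*/8);
}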
1 change: 0 additions & 1 deletion include/flexflow/operator_params.h
@@ -20,7 +20,6 @@
#include "flexflow/ops/gather_params.h"
#include "flexflow/ops/groupby_params.h"
#include "flexflow/ops/inc_multihead_self_attention_params.h"
#include "flexflow/ops/inc_multiquery_attention_params.h"
#include "flexflow/ops/layer_norm_params.h"
#include "flexflow/ops/linear_params.h"
#include "flexflow/ops/pool_2d_params.h"
14 changes: 11 additions & 3 deletions include/flexflow/ops/inc_multihead_self_attention.h
@@ -29,6 +29,7 @@ class IncMultiHeadSelfAttention : public Op {
const ParallelTensor _input,
int _embed_dim,
int _num_heads,
int _num_kv_heads,
int _kdim,
int _vdim,
float _dropout,
@@ -42,12 +43,14 @@ class IncMultiHeadSelfAttention : public Op {
bool allocate_weights,
DataType _quantization_type,
bool _offload,
int _tensor_parallelism_degree,
char const *name);
IncMultiHeadSelfAttention(FFModel &model,
const ParallelTensor _input,
const ParallelTensor _weight,
int _embed_dim,
int _num_heads,
int _num_kv_heads,
int _kdim,
int _vdim,
float _dropout,
Expand All @@ -61,6 +64,7 @@ class IncMultiHeadSelfAttention : public Op {
bool allocate_weights,
DataType _quantization_type,
bool _offload,
int _tensor_parallelism_degree,
char const *name);
IncMultiHeadSelfAttention(FFModel &model,
IncMultiHeadSelfAttention const &other,
@@ -114,7 +118,7 @@ class IncMultiHeadSelfAttention : public Op {
Params get_params() const;

public:
int num_heads;
int num_heads, num_kv_heads, tensor_parallelism_degree;
float dropout, scaling_factor;
bool bias;
bool add_bias_kv, add_zero_attn, apply_rotary_embedding, scaling_query,
@@ -132,7 +136,8 @@ class IncMultiHeadSelfAttentionMeta : public OpMeta {
GenericTensorAccessorR const &weight,
MemoryAllocator &gpu_mem_allocator,
int num_samples,
int _num_heads);
int _num_heads,
int _num_kv_heads);
IncMultiHeadSelfAttentionMeta(FFHandler handler,
InferenceMode infer_mode,
Op const *attn,
@@ -153,7 +158,9 @@ class IncMultiHeadSelfAttentionMeta : public OpMeta {
MemoryAllocator &gpu_mem_allocator,
int num_samples,
int _global_num_heads,
int _global_num_kv_heads,
int _num_heads,
int _num_kv_heads,
DataType _quantization_type,
bool _offload);
~IncMultiHeadSelfAttentionMeta(void);
@@ -163,7 +170,7 @@ class IncMultiHeadSelfAttentionMeta : public OpMeta {
size_t weights_params, weightSize, biasSize, reserveSpaceSize,
quantized_weightSize;
int qSize, kSize, vSize, qProjSize, kProjSize, vProjSize, oProjSize;
int global_num_heads, num_heads;
int global_num_heads, global_num_kv_heads, num_heads, num_kv_heads;
bool *has_load_weights;
bool *apply_rotary_embedding;
bool *bias;
@@ -182,6 +189,7 @@ class IncMultiHeadSelfAttentionMeta : public OpMeta {
DataType quantization_type;
bool offload;
#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA)
cudnnTensorDescriptor_t qk_tensor;
cuFloatComplex *complex_input;
#endif
};
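The metadata now carries both global and per-shard head counts (global_num_heads/global_num_kv_heads versus num_heads/num_kv_heads) alongside tensor_parallelism_degree. The standalone sketch below shows the presumed relationship, assuming both head counts divide evenly by the parallel degree; the struct is illustrative, not the FlexFlow type.

#include <cassert>
#include <cstdio>

struct HeadCounts { // illustrative only, not the FlexFlow meta struct
  int global_num_heads;
  int global_num_kv_heads;
  int num_heads;    // per tensor-parallel shard
  int num_kv_heads; // per tensor-parallel shard
};

HeadCounts shard_heads(int global_num_heads, int global_num_kv_heads,
                       int tensor_parallelism_degree) {
  assert(global_num_heads % tensor_parallelism_degree == 0);
  assert(global_num_kv_heads % tensor_parallelism_degree == 0);
  return {global_num_heads, global_num_kv_heads,
          global_num_heads / tensor_parallelism_degree,
          global_num_kv_heads / tensor_parallelism_degree};
}

int main() {
  // Assumed example: 64 query heads and 8 KV heads with 8-way tensor parallelism.
  HeadCounts hc = shard_heads(64, 8, 8);
  std::printf("per-shard heads: %d query, %d kv\n", hc.num_heads, hc.num_kv_heads);
  return 0;
}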
2 changes: 1 addition & 1 deletion include/flexflow/ops/inc_multihead_self_attention_params.h
@@ -8,7 +8,7 @@ namespace FlexFlow {

struct IncMultiHeadSelfAttentionParams {
LayerID layer_guid;
int embed_dim, num_heads, kdim, vdim;
int embed_dim, num_heads, kdim, vdim, num_kv_heads, tensor_parallelism_degree;
float dropout, scaling_factor;
bool bias, add_bias_kv, add_zero_attn, apply_rotary_embedding, scaling_query,
qk_prod_scaling;
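With num_kv_heads (and tensor_parallelism_degree) in the op parameters, the attention weight no longer consists of equally sized Q/K/V blocks per head: K and V projections exist only per KV head. The parameter count below is an assumption about that breakdown (the commit also reorders the weight layout, which this sketch does not try to reproduce).

#include <cstdio>

// Assumed parameter count for one grouped-query attention layer: Q and output
// projections per query head, K/V projections per KV head. Not FlexFlow's
// actual weight layout.
long long gqa_weight_params(long long hidden_size, long long proj_size,
                            long long num_heads, long long num_kv_heads) {
  long long q = num_heads * hidden_size * proj_size;
  long long k = num_kv_heads * hidden_size * proj_size;
  long long v = num_kv_heads * hidden_size * proj_size;
  long long o = num_heads * proj_size * hidden_size;
  return q + k + v + o;
}

int main() {
  // Llama-2-70B-style numbers (assumed): hidden 8192, head dim 128, 64 Q / 8 KV heads.
  std::printf("%lld parameters per attention layer\n",
              gqa_weight_params(8192, 128, 64, 8));
  return 0;
}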
30 changes: 0 additions & 30 deletions include/flexflow/ops/inc_multiquery_attention_params.h

This file was deleted.
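This deletion matches the "merge multiquery attention into inc_mha" / "remove multiquery" commits above: the separate multi-query operator and its parameter struct are gone, and multi-query attention becomes the num_kv_heads == 1 case of the unified op. A hedged sketch of the equivalent call through the surviving API (the helper name and the idea that callers migrate this way are assumptions):

#include "flexflow/model.h"

using namespace FlexFlow;

// Sketch only: what the removed multiquery operator expressed is obtained by
// passing a single KV head to the unified attention API.
Tensor add_multiquery_layer(FFModel &model, Tensor const x,
                            int embed_dim, int num_heads) {
  return model.inc_multihead_self_attention(
      x, embed_dim, num_heads, /*num_kv_heads=*/1);
}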
