Support Group Attention (Llama 2) (#883)
* n_kv_heads in inc_mha

* .

* .

* .

* .

* .

* fix

* fix

* tensor parallelism

* change weight layout

* tensor parallelism

* merge multiquery attention into inc_mha

* llama2 70B config

* spec infer change 1

* fix.

* spec infer.

* falcon spec infer.

* fix llama 70B

* fix

* fix & cleanup

* fix

* hip rocm

* issue 908

* clean debug code.

* format.

* remove multiquery. remove warning, fix python.

---------

Co-authored-by: goliaro <goliaro@cs.cmu.edu>
xinhaoc and goliaro authored Aug 3, 2023
1 parent ba91733 commit d1ef0ed
Showing 59 changed files with 1,412 additions and 3,617 deletions.
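For context: grouped-query (a.k.a. group) attention keeps the full set of query heads but lets groups of them share a smaller set of key/value heads, which is what the new num_kv_heads argument threaded through the diffs below controls. The snippet is a minimal, FlexFlow-independent sketch of that head grouping, assuming a Llama-2-70B-style layout of 64 query heads and 8 KV heads (values assumed here for illustration, not taken from this commit).

#include <cassert>
#include <cstdio>

// Illustrative sketch: each query head attends with the key/value head of its
// group; plain multi-head attention is num_kv_heads == num_heads, multi-query
// attention is num_kv_heads == 1.
int kv_head_for_query_head(int q_head, int num_heads, int num_kv_heads) {
  assert(num_heads % num_kv_heads == 0); // heads must form equal-sized groups
  int group_size = num_heads / num_kv_heads;
  return q_head / group_size;
}

int main() {
  int num_heads = 64, num_kv_heads = 8; // assumed Llama-2-70B-style layer
  for (int q = 0; q < num_heads; q++) {
    std::printf("query head %2d -> kv head %d\n", q,
                kv_head_for_query_head(q, num_heads, num_kv_heads));
  }
  return 0;
}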
1 change: 1 addition & 0 deletions examples/cpp/inference/mixture_of_experts/moe.cc
@@ -79,6 +79,7 @@ Tensor create_moe_encoder(FFModel *model,
x,
moeConfig->hidden_size,
moeConfig->num_attention_heads,
moeConfig->num_attention_heads,
moeConfig->attention_kdim,
moeConfig->attention_vdim)
: model->multihead_attention(x,
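Both C++ example programs (this one and transformers.cc just below) are updated the same way: the existing num_attention_heads value is passed a second time as the new num_kv_heads argument, which keeps them on standard multi-head attention. A small standalone sketch (not FlexFlow code) of how the two head counts relate:

#include <cstdio>
#include <string>

// Illustrative only: classify an attention configuration by its head counts.
std::string attention_kind(int num_heads, int num_kv_heads) {
  if (num_kv_heads == num_heads) {
    return "multi-head: every query head has its own KV head";
  }
  if (num_kv_heads == 1) {
    return "multi-query: all query heads share one KV head";
  }
  if (num_kv_heads > 0 && num_heads % num_kv_heads == 0) {
    return "grouped-query: query heads share KV heads in groups";
  }
  return "invalid: num_heads must be a positive multiple of num_kv_heads";
}

int main() {
  std::printf("%s\n", attention_kind(16, 16).c_str()); // same value twice, as in the updated calls above
  std::printf("%s\n", attention_kind(64, 8).c_str());  // Llama-2-70B-style layer (assumed)
  std::printf("%s\n", attention_kind(71, 1).c_str());  // Falcon-7B-style layer (assumed)
  return 0;
}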
1 change: 1 addition & 0 deletions examples/cpp/inference/transformers/transformers.cc
@@ -47,6 +47,7 @@ Tensor create_inc_multihead_attention_decoder(
input,
transformerConfig->hidden_size,
transformerConfig->num_attention_heads,
transformerConfig->num_attention_heads,
transformerConfig->attention_kdim,
transformerConfig->attention_vdim)
: model->multihead_attention(input,
9 changes: 7 additions & 2 deletions include/flexflow/ffconst.h
@@ -167,7 +167,6 @@ enum OperatorType {
OP_INC_MULTIHEAD_SELF_ATTENTION,
OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION,
OP_TREE_INC_MULTIHEAD_SELF_ATTENTION,
OP_INC_MULTIQUERY_SELF_ATTENTION,
OP_SAMPLING,
// Parallel Ops
OP_REPARTITION,
@@ -180,7 +179,13 @@
OP_INVALID,
};

enum ModelType { UNKNOWN = 3001, LLAMA = 3002, OPT = 3003, FALCON = 3004 };
enum ModelType {
UNKNOWN = 3001,
LLAMA = 3002,
LLAMA2 = 3003,
OPT = 3004,
FALCON = 3005
};

enum PMParameter {
PM_OP_TYPE, // AnyOp
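The ModelType enum is reflowed onto multiple lines and gains LLAMA2 = 3003; note that this shifts OPT from 3003 to 3004 and FALCON from 3004 to 3005, so raw integer values recorded by older builds would no longer line up. The helper below is purely hypothetical (not part of this commit) and only illustrates dispatching on the extended enum; it re-declares the enum so the sketch is self-contained.

#include <string>

enum ModelType { // mirrors the values above, repeated here only for a self-contained sketch
  UNKNOWN = 3001,
  LLAMA = 3002,
  LLAMA2 = 3003,
  OPT = 3004,
  FALCON = 3005
};

// Hypothetical helper: map a lowercase model-family name to the extended enum.
ModelType model_type_from_name(std::string const &name) {
  if (name == "llama") {
    return LLAMA;
  } else if (name == "llama2") {
    return LLAMA2;
  } else if (name == "opt") {
    return OPT;
  } else if (name == "falcon") {
    return FALCON;
  }
  return UNKNOWN;
}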
21 changes: 5 additions & 16 deletions include/flexflow/flexflow_c.h
@@ -401,6 +401,7 @@ flexflow_tensor_t flexflow_model_add_inc_multihead_attention(
const flexflow_tensor_t input_,
int embed_dim,
int num_heads,
int num_kv_heads,
int kdim,
int vdim,
float dropout,
@@ -420,6 +421,7 @@ flexflow_tensor_t flexflow_model_add_spec_inc_multihead_attention(
const flexflow_tensor_t input_,
int embed_dim,
int num_heads,
int num_kv_heads,
int kdim,
int vdim,
float dropout,
@@ -439,6 +441,7 @@ flexflow_tensor_t flexflow_model_add_inc_multihead_self_attention_verify(
const flexflow_tensor_t input_,
int embed_dim,
int num_heads,
int num_kv_heads,
int kdim,
int vdim,
float dropout,
@@ -453,21 +456,6 @@ flexflow_tensor_t flexflow_model_add_inc_multihead_self_attention_verify(
bool qk_prod_scaling,
char const *name);

flexflow_tensor_t flexflow_model_add_inc_multiquery_self_attention(
flexflow_model_t handle_,
const flexflow_tensor_t input_,
int embed_dim,
int num_heads,
int kdim,
int vdim,
float dropout,
bool bias,
bool add_bias_kv,
bool add_zero_attn,
enum DataType data_type,
flexflow_initializer_t kernel_initializer_,
char const *name);

flexflow_tensor_t flexflow_model_add_rms_norm(flexflow_model_t handle_,
const flexflow_tensor_t input_,
float eps,
@@ -901,7 +889,8 @@ flexflow_file_data_loader_t
flexflow_file_data_loader_create(char const *weight_file_path,
int num_heads,
int hidden_dim,
int qkv_inner_dim);
int qkv_inner_dim,
int tensor_partition_num);

void flexflow_file_data_loader_destroy(flexflow_file_data_loader_t handle_);

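Besides adding num_kv_heads to the three C attention entry points and dropping the separate multiquery one, the C API's flexflow_file_data_loader_create now takes a tensor_partition_num, since with tensor parallelism the loader needs to know how many shards the attention weights are split across. The arithmetic below is only a guess at the kind of bookkeeping involved (not code from this commit): it assumes heads are divided evenly across partitions.

#include <cassert>
#include <cstdio>
#include <utility>

// Illustrative sketch: the contiguous head range a given tensor-parallel
// partition would own if heads are split evenly across tensor_partition_num.
std::pair<int, int> head_range_for_partition(int num_heads,
                                             int tensor_partition_num,
                                             int partition) {
  assert(num_heads % tensor_partition_num == 0);
  int per_part = num_heads / tensor_partition_num;
  return {partition * per_part, (partition + 1) * per_part}; // [first, last)
}

int main() {
  // Assumed example: 64 heads sharded across 4 partitions.
  for (int p = 0; p < 4; p++) {
    std::pair<int, int> r = head_range_for_partition(64, 4, p);
    std::printf("partition %d owns heads [%d, %d)\n", p, r.first, r.second);
  }
  return 0;
}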
21 changes: 3 additions & 18 deletions include/flexflow/model.h
@@ -156,8 +156,6 @@ enum TaskIDs {
INC_MULTIHEAD_SELF_ATTENTION_FWD_TASK_ID,
INC_MULTIHEAD_SELF_ATTENTION_BWD_TASK_ID,
INC_MULTIHEAD_SELF_ATTENTION_INF_TASK_ID,
INC_MULTIQUERY_SELF_ATTENTION_INIT_TASK_ID,
INC_MULTIQUERY_SELF_ATTENTION_INF_TASK_ID,
SPEC_INC_MULTIHEAD_SELF_ATTENTION_INIT_TASK_ID,
SPEC_INC_MULTIHEAD_SELF_ATTENTION_INF_TASK_ID,
TREE_INC_MULTIHEAD_SELF_ATTENTION_INIT_TASK_ID,
@@ -322,7 +320,6 @@ class Transpose;
class RMSNorm;
class BeamTopK;
class SpecIncMultiHeadSelfAttention;
class IncMultiQuerySelfAttention;
class Sampling;
class ArgMax;
class Combine;
@@ -644,6 +641,7 @@ class FFModel {
Tensor inc_multihead_self_attention(const Tensor input,
int embed_dim,
int num_heads,
int num_kv_heads,
int kdim = 0,
int vdim = 0,
float dropout = 0.0f,
Expand All @@ -657,22 +655,11 @@ class FFModel {
float scaling_factor = 1.0f,
bool qk_prod_scaling = true,
char const *name = NULL);
Tensor inc_multiquery_self_attention(const Tensor input,
int embed_dim,
int num_heads,
int kdim = 0,
int vdim = 0,
float dropout = 0.0f,
bool bias = false,
bool add_bias_kv = false,
bool add_zero_attn = false,
DataType data_type = DT_NONE,
Initializer *kernel_initializer = NULL,
char const *name = NULL);
Tensor
spec_inc_multihead_self_attention(const Tensor input,
int embed_dim,
int num_heads,
int num_kv_heads,
int kdim = 0,
int vdim = 0,
float dropout = 0.0f,
@@ -690,6 +677,7 @@
const Tensor input,
int embed_dim,
int num_heads,
int num_kv_heads,
int kdim = 0,
int vdim = 0,
float dropout = 0.0f,
@@ -1075,9 +1063,6 @@ class FFModel {
std::unordered_map<
std::pair<ParallelTensorShape, IncMultiHeadSelfAttentionParams>,
IncMultiHeadSelfAttention *>,
std::unordered_map<
std::pair<ParallelTensorShape, IncMultiQuerySelfAttentionParams>,
IncMultiQuerySelfAttention *>,
std::unordered_map<std::pair<ParallelTensorShape, BeamTopKParams>,
BeamTopK *>,
std::unordered_map<std::pair<ParallelTensorShape, SamplingParams>,
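With num_kv_heads added right after num_heads in the public FFModel methods, call sites choose between multi-head, grouped-query, and multi-query attention at graph-construction time. A hedged usage sketch, assuming a FlexFlow build with this header; the helper name, embed_dim, and head counts (Llama-2-70B-style: 8192, 64 query heads, 8 KV heads) are assumptions, and the remaining parameters fall back to the defaults declared above.

#include "flexflow/model.h"

using namespace FlexFlow;

// Sketch only: build one grouped-query incremental-decoding attention layer
// with the updated API; kdim/vdim/dropout/etc. keep their default values.
Tensor add_gqa_layer(FFModel &model, Tensor const x) {
  return model.inc_multihead_self_attention(
      x, /*embed_dim=*/8192, /*num_heads=*/64, /*num_kv_heads=*/8);
}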
1 change: 0 additions & 1 deletion include/flexflow/operator_params.h
@@ -20,7 +20,6 @@
#include "flexflow/ops/gather_params.h"
#include "flexflow/ops/groupby_params.h"
#include "flexflow/ops/inc_multihead_self_attention_params.h"
#include "flexflow/ops/inc_multiquery_attention_params.h"
#include "flexflow/ops/layer_norm_params.h"
#include "flexflow/ops/linear_params.h"
#include "flexflow/ops/pool_2d_params.h"
14 changes: 11 additions & 3 deletions include/flexflow/ops/inc_multihead_self_attention.h
@@ -29,6 +29,7 @@ class IncMultiHeadSelfAttention : public Op {
const ParallelTensor _input,
int _embed_dim,
int _num_heads,
int _num_kv_heads,
int _kdim,
int _vdim,
float _dropout,
@@ -42,12 +43,14 @@ class IncMultiHeadSelfAttention : public Op {
bool allocate_weights,
DataType _quantization_type,
bool _offload,
int _tensor_parallelism_degree,
char const *name);
IncMultiHeadSelfAttention(FFModel &model,
const ParallelTensor _input,
const ParallelTensor _weight,
int _embed_dim,
int _num_heads,
int _num_kv_heads,
int _kdim,
int _vdim,
float _dropout,
Expand All @@ -61,6 +64,7 @@ class IncMultiHeadSelfAttention : public Op {
bool allocate_weights,
DataType _quantization_type,
bool _offload,
int _tensor_parallelism_degree,
char const *name);
IncMultiHeadSelfAttention(FFModel &model,
IncMultiHeadSelfAttention const &other,
@@ -114,7 +118,7 @@ class IncMultiHeadSelfAttention : public Op {
Params get_params() const;

public:
int num_heads;
int num_heads, num_kv_heads, tensor_parallelism_degree;
float dropout, scaling_factor;
bool bias;
bool add_bias_kv, add_zero_attn, apply_rotary_embedding, scaling_query,
@@ -132,7 +136,8 @@ class IncMultiHeadSelfAttentionMeta : public OpMeta {
GenericTensorAccessorR const &weight,
MemoryAllocator &gpu_mem_allocator,
int num_samples,
int _num_heads);
int _num_heads,
int _num_kv_heads);
IncMultiHeadSelfAttentionMeta(FFHandler handler,
InferenceMode infer_mode,
Op const *attn,
@@ -153,7 +158,9 @@ class IncMultiHeadSelfAttentionMeta : public OpMeta {
MemoryAllocator &gpu_mem_allocator,
int num_samples,
int _global_num_heads,
int _global_num_kv_heads,
int _num_heads,
int _num_kv_heads,
DataType _quantization_type,
bool _offload);
~IncMultiHeadSelfAttentionMeta(void);
@@ -163,7 +170,7 @@ class IncMultiHeadSelfAttentionMeta : public OpMeta {
size_t weights_params, weightSize, biasSize, reserveSpaceSize,
quantized_weightSize;
int qSize, kSize, vSize, qProjSize, kProjSize, vProjSize, oProjSize;
int global_num_heads, num_heads;
int global_num_heads, global_num_kv_heads, num_heads, num_kv_heads;
bool *has_load_weights;
bool *apply_rotary_embedding;
bool *bias;
@@ -182,6 +189,7 @@ class IncMultiHeadSelfAttentionMeta : public OpMeta {
DataType quantization_type;
bool offload;
#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA)
cudnnTensorDescriptor_t qk_tensor;
cuFloatComplex *complex_input;
#endif
};
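The metadata now carries both global and per-shard head counts (global_num_heads/global_num_kv_heads versus num_heads/num_kv_heads) alongside tensor_parallelism_degree. The standalone sketch below shows the presumed relationship, assuming both head counts divide evenly by the parallel degree; the struct is illustrative, not the FlexFlow type.

#include <cassert>
#include <cstdio>

struct HeadCounts { // illustrative only, not the FlexFlow meta struct
  int global_num_heads;
  int global_num_kv_heads;
  int num_heads;    // per tensor-parallel shard
  int num_kv_heads; // per tensor-parallel shard
};

HeadCounts shard_heads(int global_num_heads, int global_num_kv_heads,
                       int tensor_parallelism_degree) {
  assert(global_num_heads % tensor_parallelism_degree == 0);
  assert(global_num_kv_heads % tensor_parallelism_degree == 0);
  return {global_num_heads, global_num_kv_heads,
          global_num_heads / tensor_parallelism_degree,
          global_num_kv_heads / tensor_parallelism_degree};
}

int main() {
  // Assumed example: 64 query heads and 8 KV heads with 8-way tensor parallelism.
  HeadCounts hc = shard_heads(64, 8, 8);
  std::printf("per-shard heads: %d query, %d kv\n", hc.num_heads, hc.num_kv_heads);
  return 0;
}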
2 changes: 1 addition & 1 deletion include/flexflow/ops/inc_multihead_self_attention_params.h
@@ -8,7 +8,7 @@ namespace FlexFlow {

struct IncMultiHeadSelfAttentionParams {
LayerID layer_guid;
int embed_dim, num_heads, kdim, vdim;
int embed_dim, num_heads, kdim, vdim, num_kv_heads, tensor_parallelism_degree;
float dropout, scaling_factor;
bool bias, add_bias_kv, add_zero_attn, apply_rotary_embedding, scaling_query,
qk_prod_scaling;
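With num_kv_heads (and tensor_parallelism_degree) in the op parameters, the attention weight no longer consists of equally sized Q/K/V blocks per head: K and V projections exist only per KV head. The parameter count below is an assumption about that breakdown (the commit also reorders the weight layout, which this sketch does not try to reproduce).

#include <cstdio>

// Assumed parameter count for one grouped-query attention layer: Q and output
// projections per query head, K/V projections per KV head. Not FlexFlow's
// actual weight layout.
long long gqa_weight_params(long long hidden_size, long long proj_size,
                            long long num_heads, long long num_kv_heads) {
  long long q = num_heads * hidden_size * proj_size;
  long long k = num_kv_heads * hidden_size * proj_size;
  long long v = num_kv_heads * hidden_size * proj_size;
  long long o = num_heads * proj_size * hidden_size;
  return q + k + v + o;
}

int main() {
  // Llama-2-70B-style numbers (assumed): hidden 8192, head dim 128, 64 Q / 8 KV heads.
  std::printf("%lld parameters per attention layer\n",
              gqa_weight_params(8192, 128, 64, 8));
  return 0;
}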
30 changes: 0 additions & 30 deletions include/flexflow/ops/inc_multiquery_attention_params.h

This file was deleted.
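This deletion matches the "merge multiquery attention into inc_mha" / "remove multiquery" commits above: the separate multi-query operator and its parameter struct are gone, and multi-query attention becomes the num_kv_heads == 1 case of the unified op. A hedged sketch of the equivalent call through the surviving API (the helper name and the idea that callers migrate this way are assumptions):

#include "flexflow/model.h"

using namespace FlexFlow;

// Sketch only: what the removed multiquery operator expressed is obtained by
// passing a single KV head to the unified attention API.
Tensor add_multiquery_layer(FFModel &model, Tensor const x,
                            int embed_dim, int num_heads) {
  return model.inc_multihead_self_attention(
      x, embed_dim, num_heads, /*num_kv_heads=*/1);
}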
