
Commit

Merge branch 'specscheduler' into paged_attention_new
Bob-Chen222 authored Oct 29, 2024
2 parents 50e38f6 + d09259e commit 7ba77dd
Showing 57 changed files with 1,079 additions and 496 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -190,3 +190,5 @@ python/flexflow/version.txt

inference_tensors
tests/inference/python_test_configs/*.json

core.*
1 change: 1 addition & 0 deletions docker/flexflow-environment/Dockerfile
@@ -37,6 +37,7 @@ RUN MINICONDA_SCRIPT_NAME=Miniconda3-py311_23.5.2-0-Linux-x86_64.sh; \
chmod +x ~/${MINICONDA_SCRIPT_NAME} && \
bash ~/${MINICONDA_SCRIPT_NAME} -b -p /opt/conda && \
rm ~/${MINICONDA_SCRIPT_NAME} && \
/opt/conda/bin/conda config --set solver classic && \
/opt/conda/bin/conda upgrade --all && \
/opt/conda/bin/conda install conda-build conda-verify && \
/opt/conda/bin/conda clean -ya
4 changes: 3 additions & 1 deletion include/flexflow/batch_config.h
@@ -52,12 +52,13 @@ class StreamingCacheInfo {
void commit_cache(int len);
void reset_cache();
int global_2_cache_index(int global_index);
int cache_2_global_index(int cache_index);

public:
int sink_cache_size, window_cache_size;
// the meta info of the window cache, commit_len helps to determine if we fill
// up the window.
int window_back, commit_len;
int window_back, commit_len, total_len;
};
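cache_2_global_index completes the two-way mapping between global token positions and slots in the sink-plus-window KV cache, and total_len records how many tokens the request has produced so far. The actual mapping lives in the implementation file, which this diff does not show; the sketch below is only one plausible sink + ring-buffer layout, with every field's meaning assumed:

// Illustrative only -- not the FlexFlow implementation. Assumes:
//   * slots [0, sink_cache_size) pin the first tokens of the sequence,
//   * the next window_cache_size slots form a ring buffer of recent tokens,
//   * window_back is the next write position inside that ring,
//   * total_len is the number of tokens seen so far.
int sketch_global_2_cache_index(StreamingCacheInfo const &info, int global_index) {
  if (global_index < info.sink_cache_size) {
    return global_index; // sink tokens keep their slots
  }
  int offset_from_newest = info.total_len - 1 - global_index; // 0 for the newest token
  int slot = info.window_back - 1 - offset_from_newest;
  if (slot < 0) {
    slot += info.window_cache_size; // wrap around the ring
  }
  return info.sink_cache_size + slot;
}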

class BatchConfig {
@@ -77,6 +78,7 @@ class BatchConfig {
static int max_sequence_length();
static int max_output_length();
static int max_kv_cache_size();
static bool streaming_cache();
static int get_max_tree_depth();
friend std::ostream &operator<<(std::ostream &os, BatchConfig const &bc);
void print() const;
36 changes: 36 additions & 0 deletions include/flexflow/flexflow_c.h
@@ -445,6 +445,12 @@ flexflow_tensor_t flexflow_model_add_inc_multihead_self_attention(
enum DataType data_type,
flexflow_initializer_t kernel_initializer_,
bool apply_rotary_embedding,
float rope_theta,
char const *rope_type,
float rope_factor,
float low_freq_factor,
float high_freq_factor,
int original_max_position_embeddings,
bool scaling_query,
float scaling_factor,
bool qk_prod_scaling,
@@ -467,6 +473,12 @@ flexflow_tensor_t flexflow_model_add_spec_inc_multihead_self_attention(
enum DataType data_type,
flexflow_initializer_t kernel_initializer_,
bool apply_rotary_embedding,
float rope_theta,
char const *rope_type,
float rope_factor,
float low_freq_factor,
float high_freq_factor,
int original_max_position_embeddings,
bool scaling_query,
float scaling_factor,
bool qk_prod_scaling,
@@ -489,6 +501,12 @@ flexflow_tensor_t flexflow_model_add_inc_multihead_self_attention_verify(
enum DataType data_type,
flexflow_initializer_t kernel_initializer_,
bool apply_rotary_embedding,
float rope_theta,
char const *rope_type,
float rope_factor,
float low_freq_factor,
float high_freq_factor,
int original_max_position_embeddings,
bool scaling_query,
float scaling_factor,
bool qk_prod_scaling,
@@ -511,6 +529,12 @@ flexflow_tensor_t flexflow_model_add_groupquery_self_attention(
enum DataType data_type,
flexflow_initializer_t kernel_initializer_,
bool apply_rotary_embedding,
float rope_theta,
char const *rope_type,
float rope_factor,
float low_freq_factor,
float high_freq_factor,
int original_max_position_embeddings,
bool scaling_query,
float scaling_factor,
bool qk_prod_scaling,
@@ -534,6 +558,12 @@ flexflow_tensor_t flexflow_model_add_spec_inc_multiquery_self_attention(
enum DataType data_type,
flexflow_initializer_t kernel_initializer_,
bool apply_rotary_embedding,
float rope_theta,
char const *rope_type,
float rope_factor,
float low_freq_factor,
float high_freq_factor,
int original_max_position_embeddings,
bool scaling_query,
float scaling_factor,
bool qk_prod_scaling,
@@ -557,6 +587,12 @@ flexflow_tensor_t flexflow_model_add_inc_multiquery_self_attention_verify(
enum DataType data_type,
flexflow_initializer_t kernel_initializer_,
bool apply_rotary_embedding,
float rope_theta,
char const *rope_type,
float rope_factor,
float low_freq_factor,
float high_freq_factor,
int original_max_position_embeddings,
bool scaling_query,
float scaling_factor,
bool qk_prod_scaling,
46 changes: 42 additions & 4 deletions include/flexflow/inference.h
@@ -45,14 +45,17 @@ struct GenerationConfig {

struct GenerationRequest {
std::string prompt;
bool add_special_tokens = true;
double slo_ratio;
double emission_time_ms;

GenerationRequest(std::string const &prompt_,
double slo_ratio_,
double emission_time_ms_)
double emission_time_ms_,
bool add_special_tokens_ = true)
: prompt(prompt_), slo_ratio(slo_ratio_),
emission_time_ms(emission_time_ms_) {}
emission_time_ms(emission_time_ms_),
add_special_tokens(add_special_tokens_) {}
};
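The new add_special_tokens flag lets a request opt out of the tokenizer's special tokens; it defaults to true, so existing call sites keep their behavior. A minimal construction example (the prompt, SLO ratio, and emission time are made-up values):

// A request emitted immediately with a 1.5x SLO ratio, tokenized without special tokens.
GenerationRequest req("Summarize the FlexFlow serving stack.",
                      /*slo_ratio_=*/1.5,
                      /*emission_time_ms_=*/0.0,
                      /*add_special_tokens_=*/false);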

struct GenerationResult {
@@ -158,8 +161,43 @@ class TraceEmissionMachine : public EmissionMachine {
double sample_slo_ratio() override;
};

#include <string>
#include <vector>
struct RotaryEmbeddingMeta {
bool apply_rotary_embedding = false;
float rope_theta = 10000.0f;
std::string rope_type = "default";
float factor = 8.0f;
float low_freq_factor = 1.0f;
float high_freq_factor = 4.0f;
int original_max_position_embeddings = 8192;

RotaryEmbeddingMeta(bool apply_rotary_embedding_ = false,
float rope_theta_ = 10000.0f,
std::string rope_type_ = "default",
float factor_ = 8.0f,
float low_freq_factor_ = 1.0f,
float high_freq_factor_ = 4.0f,
int original_max_position_embeddings_ = 8192)
: apply_rotary_embedding(apply_rotary_embedding_),
rope_theta(rope_theta_), rope_type(rope_type_), factor(factor_),
low_freq_factor(low_freq_factor_), high_freq_factor(high_freq_factor_),
original_max_position_embeddings(original_max_position_embeddings_) {}

friend std::ostream &operator<<(std::ostream &os,
RotaryEmbeddingMeta const &meta) {
os << std::boolalpha // To print bool as true/false instead of 1/0
<< "RotaryEmbeddingMeta {\n"
<< " apply_rotary_embedding: " << meta.apply_rotary_embedding << ",\n"
<< " rope_theta: " << meta.rope_theta << ",\n"
<< " rope_type: \"" << meta.rope_type << "\",\n"
<< " factor: " << meta.factor << ",\n"
<< " low_freq_factor: " << meta.low_freq_factor << ",\n"
<< " high_freq_factor: " << meta.high_freq_factor << ",\n"
<< " original_max_position_embeddings: "
<< meta.original_max_position_embeddings << "\n"
<< "}";
return os;
}
};
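RotaryEmbeddingMeta gathers every RoPE-related setting that the attention operators previously received as a lone apply_rotary_embedding flag. A minimal usage sketch of the struct as declared above; the concrete values are placeholders rather than defaults from any particular model config, and <iostream> is assumed to be included:

// Configure Llama-3-style RoPE scaling and print the settings via the operator<< above.
RotaryEmbeddingMeta rope_meta(/*apply_rotary_embedding_=*/true,
                              /*rope_theta_=*/500000.0f,
                              /*rope_type_=*/"llama3",
                              /*factor_=*/8.0f,
                              /*low_freq_factor_=*/1.0f,
                              /*high_freq_factor_=*/4.0f,
                              /*original_max_position_embeddings_=*/8192);
std::cout << rope_meta << std::endl;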

std::string join_path(std::vector<std::string> const &paths);

3 changes: 3 additions & 0 deletions include/flexflow/layer.h
@@ -32,11 +32,13 @@ class Layer {
void add_float_property(std::string const &key, float value);
void add_int_vector_property(std::string const &key,
std::vector<int> const &value);
void add_string_property(std::string const &key, std::string const &value);
void add_initializer(std::string const &key, Initializer *initializer);
bool get_int_property(std::string const &key, long long &value) const;
bool get_float_property(std::string const &key, float &value) const;
bool get_int_vector_property(std::string const &key,
std::vector<int> &value) const;
bool get_string_property(std::string const &key, std::string &value) const;
bool get_initializer(std::string const &key, Initializer *&initializer) const;
Tensor get_parameter(int index);
void print();
@@ -59,6 +61,7 @@ class Layer {
std::unordered_map<std::string, float> float_properties;
std::unordered_map<std::string, Initializer *> initializers;
std::unordered_map<std::string, std::vector<int>> int_vector_properties;
std::unordered_map<std::string, std::string> string_properties;
};

}; // namespace FlexFlow
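The string-property accessors mirror the existing int, float, and int-vector ones and are backed by the string_properties map added above. A small usage sketch, assuming layer is a Layer* and treating "rope_type" purely as an example key:

// Store and later retrieve a string-valued property on a layer.
layer->add_string_property("rope_type", "llama3");

std::string rope_type;
if (layer->get_string_property("rope_type", rope_type)) {
  // rope_type now holds "llama3"; the getter presumably returns false when the
  // key is absent, mirroring the int/float getters.
}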
12 changes: 6 additions & 6 deletions include/flexflow/model.h
@@ -721,7 +721,7 @@ class FFModel {
bool add_zero_attn = false,
DataType data_type = DT_NONE,
Initializer *kernel_initializer = NULL,
bool apply_rotary_embedding = false,
RotaryEmbeddingMeta rotary_embedding_meta = RotaryEmbeddingMeta(),
bool scaling_query = false,
float scaling_factor = 1.0f,
bool qk_prod_scaling = true,
@@ -741,7 +741,7 @@ class FFModel {
bool add_zero_attn = false,
DataType data_type = DT_NONE,
Initializer *kernel_initializer = NULL,
bool apply_rotary_embedding = false,
RotaryEmbeddingMeta rotary_embedding_meta = RotaryEmbeddingMeta(),
bool scaling_query = false,
float scaling_factor = 1.0f,
bool qk_prod_scaling = true,
@@ -761,7 +761,7 @@ class FFModel {
bool add_zero_attn = false,
DataType data_type = DT_NONE,
Initializer *kernel_initializer = NULL,
bool apply_rotary_embedding = false,
RotaryEmbeddingMeta rotary_embedding_meta = RotaryEmbeddingMeta(),
bool scaling_query = false,
float scaling_factor = 1.0f,
bool qk_prod_scaling = true,
@@ -780,7 +780,7 @@ class FFModel {
bool add_zero_attn = false,
DataType data_type = DT_NONE,
Initializer *kernel_initializer = NULL,
bool apply_rotary_embedding = false,
RotaryEmbeddingMeta rotary_embedding_meta = RotaryEmbeddingMeta(),
bool scaling_query = false,
float scaling_factor = 1.0f,
bool qk_prod_scaling = true,
@@ -801,7 +801,7 @@ class FFModel {
bool add_zero_attn = false,
DataType data_type = DT_NONE,
Initializer *kernel_initializer = NULL,
bool apply_rotary_embedding = false,
RotaryEmbeddingMeta rotary_embedding_meta = RotaryEmbeddingMeta(),
bool scaling_query = false,
float scaling_factor = 1.0f,
bool qk_prod_scaling = true,
@@ -822,7 +822,7 @@ class FFModel {
bool add_zero_attn = false,
DataType data_type = DT_NONE,
Initializer *kernel_initializer = NULL,
bool apply_rotary_embedding = false,
RotaryEmbeddingMeta rotary_embedding_meta = RotaryEmbeddingMeta(),
bool scaling_query = false,
float scaling_factor = 1.0f,
bool qk_prod_scaling = true,
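All six overloads above now take a RotaryEmbeddingMeta where the apply_rotary_embedding flag used to be, defaulting to RotaryEmbeddingMeta(). Going by the struct definition in inference.h earlier in this diff, the default-constructed value reproduces the old default of rotary embedding disabled; a small sanity-check sketch (illustrative, assuming the struct sits in the FlexFlow namespace like its neighbors):

#include <cassert>
#include "flexflow/inference.h"

void check_default_rotary_meta() {
  FlexFlow::RotaryEmbeddingMeta default_meta; // value of the new default argument
  assert(default_meta.apply_rotary_embedding == false); // matches the old default flag
  assert(default_meta.rope_theta == 10000.0f);
  assert(default_meta.rope_type == "default");
}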
12 changes: 6 additions & 6 deletions include/flexflow/ops/inc_multihead_self_attention.h
@@ -41,7 +41,7 @@ class IncMultiHeadSelfAttention : public Op {
bool _qkv_bias,
bool _final_bias,
bool _add_zero_attn,
bool _apply_rotary_embedding,
RotaryEmbeddingMeta _rotary_embedding_meta,
bool _scaling_query,
float _scaling_factor,
bool _qk_prod_scaling,
@@ -65,7 +65,7 @@ class IncMultiHeadSelfAttention : public Op {
bool _qkv_bias,
bool _final_bias,
bool _add_zero_attn,
bool _apply_rotary_embedding,
RotaryEmbeddingMeta _rotary_embedding_meta,
bool _scaling_query,
float _scaling_factor,
bool _qk_prod_scaling,
@@ -131,8 +131,8 @@
int num_q_heads, num_kv_heads, num_hidden_layers, tensor_parallelism_degree;
float dropout, scaling_factor;
bool qkv_bias;
bool final_bias, add_zero_attn, apply_rotary_embedding, scaling_query,
qk_prod_scaling, position_bias;
bool final_bias, add_zero_attn, scaling_query, qk_prod_scaling, position_bias;
RotaryEmbeddingMeta rotary_embedding_meta;
int hidden_size, qk_dim, v_dim, o_dim;
int qoSeqLength, kvSeqLength;
DataType quantization_type;
@@ -156,7 +156,7 @@ class IncMultiHeadSelfAttentionMeta : public OpMeta {
int _qk_dim,
int _v_dim,
int _o_dim,
bool _apply_rotary_embedding,
RotaryEmbeddingMeta _rotary_embedding_meta,
bool _qkv_bias,
bool _scaling_query,
bool _qk_prod_scaling,
@@ -185,7 +185,7 @@ class IncMultiHeadSelfAttentionMeta : public OpMeta {
int global_num_q_heads, global_num_kv_heads, num_q_heads, num_kv_heads, num_hidden_layers,
local_hidden_size;
bool *has_load_weights;
bool *apply_rotary_embedding;
RotaryEmbeddingMeta *rotary_embedding_meta;
bool *qkv_bias;
bool *final_bias;
bool *scaling_query;
6 changes: 4 additions & 2 deletions include/flexflow/ops/inc_multihead_self_attention_params.h
@@ -3,6 +3,7 @@

#include "flexflow/ffconst.h"
#include "flexflow/fftype.h"
#include "flexflow/inference.h"
#include "flexflow/parallel_tensor.h"

namespace FlexFlow {
@@ -12,8 +13,9 @@ struct IncMultiHeadSelfAttentionParams {
int embed_dim, num_q_heads, kdim, vdim, num_kv_heads, num_hidden_layers,
tensor_parallelism_degree;
float dropout, scaling_factor;
bool qkv_bias, final_bias, add_zero_attn, apply_rotary_embedding,
scaling_query, qk_prod_scaling, position_bias;
bool qkv_bias, final_bias, add_zero_attn, scaling_query, qk_prod_scaling,
position_bias;
RotaryEmbeddingMeta rotary_embedding_meta;
DataType quantization_type;
bool offload, streaming_cache;
char name[MAX_OPNAME];
8 changes: 4 additions & 4 deletions include/flexflow/ops/spec_inc_multihead_self_attention.h
@@ -37,7 +37,7 @@ class SpecIncMultiHeadSelfAttention : public Op {
bool _qkv_bias,
bool _final_bias,
bool _add_zero_attn,
bool _apply_rotary_embedding,
RotaryEmbeddingMeta _rotary_embedding_meta,
bool _scaling_query,
float _scaling_factor,
bool _qk_prod_scaling,
@@ -58,7 +58,7 @@ class SpecIncMultiHeadSelfAttention : public Op {
bool _qkv_bias,
bool _final_bias,
bool _add_zero_attn,
bool _apply_rotary_embedding,
RotaryEmbeddingMeta _rotary_embedding_meta,
bool _scaling_query,
float _scaling_factor,
bool _qk_prod_scaling,
@@ -124,8 +124,8 @@
int num_q_heads, num_kv_heads, num_hidden_layers, tensor_parallelism_degree;
float dropout, scaling_factor;
bool qkv_bias;
bool final_bias, add_zero_attn, apply_rotary_embedding, scaling_query,
qk_prod_scaling, position_bias;
bool final_bias, add_zero_attn, scaling_query, qk_prod_scaling, position_bias;
RotaryEmbeddingMeta rotary_embedding_meta;
int hidden_size, qk_dim, v_dim, o_dim;
int qoSeqLength, kvSeqLength;
bool streaming_cache;
include/flexflow/ops/spec_inc_multihead_self_attention_params.h
@@ -11,8 +11,9 @@ struct SpecIncMultiHeadSelfAttentionParams {
LayerID layer_guid;
int embed_dim, num_q_heads, num_kv_heads, num_hidden_layers, kdim, vdim;
float dropout, scaling_factor;
bool qkv_bias, final_bias, add_zero_attn, apply_rotary_embedding,
scaling_query, qk_prod_scaling, position_bias;
bool qkv_bias, final_bias, add_zero_attn, scaling_query, qk_prod_scaling,
position_bias;
RotaryEmbeddingMeta rotary_embedding_meta;
bool streaming_cache;
char name[MAX_OPNAME];
bool is_valid(ParallelTensorShape const &) const;
8 changes: 4 additions & 4 deletions include/flexflow/ops/tree_inc_multihead_self_attention.h
@@ -37,7 +37,7 @@ class TreeIncMultiHeadSelfAttention : public Op {
bool _qkv_bias,
bool _final_bias,
bool _add_zero_attn,
bool _apply_rotary_embedding,
RotaryEmbeddingMeta _rotary_embedding_meta,
bool _scaling_query,
float _scaling_factor,
bool _qk_prod_scaling,
@@ -60,7 +60,7 @@ class TreeIncMultiHeadSelfAttention : public Op {
bool _qkv_bias,
bool _final_bias,
bool _add_zero_attn,
bool _apply_rotary_embedding,
RotaryEmbeddingMeta _rotary_embedding_meta,
bool _scaling_query,
float _scaling_factor,
bool _qk_prod_scaling,
@@ -126,8 +126,8 @@
int num_q_heads, num_kv_heads, num_hidden_layers, tensor_parallelism_degree;
float dropout, scaling_factor;
bool qkv_bias;
bool final_bias, add_zero_attn, apply_rotary_embedding, scaling_query,
qk_prod_scaling, position_bias;
bool final_bias, add_zero_attn, scaling_query, qk_prod_scaling, position_bias;
RotaryEmbeddingMeta rotary_embedding_meta;
int hidden_size, qk_dim, v_dim, o_dim;
int qoSeqLength, kvSeqLength;
DataType quantization_type;

0 comments on commit 7ba77dd
