fix python chatgpt.json
xinhaoc committed Jan 14, 2024
1 parent 8bfaf6a · commit 8db2650
Showing 10 changed files with 43 additions and 25 deletions.
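Every hunk in this commit applies the same sizing rule: when a model is built for speculative decoding (beam search or tree verification), its input tensors and the Python-side build step must be sized for the regular token budget plus the speculative-tree token budget, not for max_tokens_per_batch alone. Below is a minimal, illustrative Python sketch of that rule; the helper function and the string mode names are assumptions for illustration, while the constant 64 and the max_spec_tree_token_num / max_verify_tokens_per_batch names come from the diffs that follow.

    # Minimal sketch of the sizing rule applied throughout this commit.
    # SPEC_TREE_TOKEN_BUDGET mirrors `max_spec_tree_token_num = 64` from the
    # Python configs below; the mode names mirror the InferenceMode / C++ enum
    # values. The helper itself is illustrative, not part of the FlexFlow API.

    SPEC_TREE_TOKEN_BUDGET = 64  # max_spec_tree_token_num in the configs below

    def build_token_budget(max_tokens_per_batch: int, mode: str) -> int:
        """Return the per-batch token capacity a model should be built with."""
        if mode in ("BEAM_SEARCH_MODE", "TREE_VERIFY_MODE"):
            # Speculative modes need headroom for the draft/verify token tree
            # (max_verify_tokens_per_batch in the diffs below).
            return max_tokens_per_batch + SPEC_TREE_TOKEN_BUDGET
        # Plain incremental decoding (INC_DECODING_MODE) keeps the original budget.
        return max_tokens_per_batch

Concretely, the C++ hunks make BEAM_SEARCH_MODE take the max_verify_tokens_per_batch() branch when shaping the input tensor, and the Python hunks compute the enlarged value (max_verify_tokens_per_batch) and pass it to build_model whenever the mode is not INC_DECODING_MODE.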
9 changes: 5 additions & 4 deletions inference/models/falcon.cc
@@ -39,10 +39,11 @@ void FALCON::create_falcon_model(FFModel &ff,
   Tensor input;
   {
     // assert(falcon_config.max_num_tokens <= BatchConfig::MAX_NUM_TOKENS);
-    int const token_dims[] = {mode == TREE_VERIFY_MODE
-                                  ? BatchConfig::max_verify_tokens_per_batch()
-                                  : BatchConfig::max_tokens_per_batch(),
-                              1};
+    int const token_dims[] = {
+        (mode == TREE_VERIFY_MODE || mode == BEAM_SEARCH_MODE)
+            ? BatchConfig::max_verify_tokens_per_batch()
+            : BatchConfig::max_tokens_per_batch(),
+        1};
     input = ff.create_tensor<2>(token_dims, DT_INT32);
   }

9 changes: 5 additions & 4 deletions inference/models/llama.cc
@@ -41,10 +41,11 @@ void LLAMA::create_llama_model(FFModel &ff,

   Tensor input;
   {
-    int const token_dims[] = {mode == TREE_VERIFY_MODE
-                                  ? BatchConfig::max_verify_tokens_per_batch()
-                                  : BatchConfig::max_tokens_per_batch(),
-                              1};
+    int const token_dims[] = {
+        (mode == TREE_VERIFY_MODE || mode == BEAM_SEARCH_MODE)
+            ? BatchConfig::max_verify_tokens_per_batch()
+            : BatchConfig::max_tokens_per_batch(),
+        1};
     input = ff.create_tensor<2>(token_dims, DT_INT32);
   }

9 changes: 5 additions & 4 deletions inference/models/mpt.cc
@@ -40,10 +40,11 @@ void MPT::create_mpt_model(FFModel &ff,
   //------------------------------ build the model --------------------------
   Tensor input;
   {
-    int const token_dims[] = {mode == TREE_VERIFY_MODE
-                                  ? BatchConfig::max_verify_tokens_per_batch()
-                                  : BatchConfig::max_tokens_per_batch(),
-                              1};
+    int const token_dims[] = {
+        (mode == TREE_VERIFY_MODE || mode == BEAM_SEARCH_MODE)
+            ? BatchConfig::max_verify_tokens_per_batch()
+            : BatchConfig::max_tokens_per_batch(),
+        1};
     input = ff.create_tensor<2>(token_dims, DT_INT32);
   }

9 changes: 5 additions & 4 deletions inference/models/opt.cc
@@ -42,10 +42,11 @@ void OPT::create_opt_model(FFModel &ff,
   Tensor position_input;
   ff.set_position_offset(2);
   {
-    int const token_dims[] = {mode == TREE_VERIFY_MODE
-                                  ? BatchConfig::max_verify_tokens_per_batch()
-                                  : BatchConfig::max_tokens_per_batch(),
-                              1};
+    int const token_dims[] = {
+        (mode == TREE_VERIFY_MODE || mode == BEAM_SEARCH_MODE)
+            ? BatchConfig::max_verify_tokens_per_batch()
+            : BatchConfig::max_tokens_per_batch(),
+        1};
     input = ff.create_tensor<2>(token_dims, DT_INT32);
     position_input = ff.create_tensor<2>(token_dims, DT_INT32);
   }
9 changes: 5 additions & 4 deletions inference/models/starcoder.cc
@@ -48,10 +48,11 @@ void STARCODER::create_starcoder_model(
   ff.set_position_offset(0);
   {
     // assert(startcoder_config.max_num_tokens <= BatchConfig::MAX_NUM_TOKENS);
-    int const token_dims[] = {mode == TREE_VERIFY_MODE
-                                  ? BatchConfig::max_verify_tokens_per_batch()
-                                  : BatchConfig::max_tokens_per_batch(),
-                              1};
+    int const token_dims[] = {
+        (mode == TREE_VERIFY_MODE || mode == BEAM_SEARCH_MODE)
+            ? BatchConfig::max_verify_tokens_per_batch()
+            : BatchConfig::max_tokens_per_batch(),
+        1};
     input = ff.create_tensor<2>(token_dims, DT_INT32);
     position_input = ff.create_tensor<2>(token_dims, DT_INT32);
   }
4 changes: 3 additions & 1 deletion python/flexflow/serve/models/falcon.py
@@ -23,6 +23,7 @@ def __init__(self, hf_config):
         #self.max_num_tokens = 64
         self.max_beam_width = 1
         self.max_beam_depth = 8
+        self.max_spec_tree_token_num = 64
         self.bias = hf_config.bias
         self.hidden_size = hf_config.hidden_size
         self.layer_norm_epsilon = hf_config.layer_norm_epsilon
@@ -70,6 +71,7 @@ def __init__(
         self.weights_filepath = weights_filepath
         self.tokenizer_filepath = tokenizer_filepath
         self.maxint = 2**31 - 1
+        max_verify_tokens_per_batch = max_tokens_per_batch + self.falcon_config.max_spec_tree_token_num

         # Sanity checks
         if self.falcon_config.hidden_size % self.falcon_config.n_head != 0:
@@ -84,7 +86,7 @@ def __init__(
                 f"Number of q attention heads ({self.falcon_config.n_head}) is smaller, or not divisible by tensor parallelism degree ({self.ffconfig.tensor_parallelism_degree})"
             )

-        self.build_model(max_tokens_per_batch)
+        self.build_model(max_tokens_per_batch if self.mode == InferenceMode.INC_DECODING_MODE else max_verify_tokens_per_batch)

     def build_model(self, max_tokens_per_batch):
         ffmodel = FFModel(self.ffconfig)
5 changes: 4 additions & 1 deletion python/flexflow/serve/models/llama.py
@@ -23,6 +23,7 @@ def __init__(self, hf_config):
         #self.max_num_tokens = 64
         self.max_beam_width = 1
         self.max_beam_depth = 8
+        self.max_spec_tree_token_num = 64
         self.num_hidden_layers = hf_config.num_hidden_layers
         self.vocab_size = hf_config.vocab_size
         self.hidden_size = hf_config.hidden_size
@@ -62,6 +63,8 @@ def __init__(
         self.weights_filepath = weights_filepath
         self.tokenizer_filepath = tokenizer_filepath
         self.maxint = 2**31 - 1
+        max_verify_tokens_per_batch = max_tokens_per_batch + self.llama_config.max_spec_tree_token_num
+

         # Sanity checks
         if self.llama_config.hidden_size % self.llama_config.num_attention_heads != 0:
@@ -81,7 +84,7 @@ def __init__(
                 f"Number of attention heads ({self.llama_config.num_attention_heads}) is smaller, or not divisible by tensor parallelism degree ({self.ffconfig.tensor_parallelism_degree})"
             )

-        self.build_model(max_tokens_per_batch)
+        self.build_model(max_tokens_per_batch if self.mode == InferenceMode.INC_DECODING_MODE else max_verify_tokens_per_batch)

     def build_model(self, max_tokens_per_batch):
         ffmodel = FFModel(self.ffconfig)
5 changes: 4 additions & 1 deletion python/flexflow/serve/models/mpt.py
@@ -23,6 +23,7 @@ def __init__(self, hf_config):
         #self.max_num_tokens = 64
         self.max_beam_width = 1
         self.max_beam_depth = 8
+        self.max_spec_tree_token_num = 64
         self.hidden_size = hf_config.d_model
         self.n_heads = hf_config.n_heads
         self.n_layers = hf_config.n_layers
@@ -57,6 +58,8 @@ def __init__(
         self.weights_filepath = weights_filepath
         self.tokenizer_filepath = tokenizer_filepath
         self.maxint = 2**31 - 1
+        max_verify_tokens_per_batch = max_tokens_per_batch + self.mpt_config.max_spec_tree_token_num
+

         # Sanity checks
         if self.mpt_config.hidden_size % self.mpt_config.n_heads != 0:
@@ -72,7 +75,7 @@ def __init__(
             raise ValueError(
                 f"Number of attention heads ({self.mpt_config.n_heads}) is smaller, or not divisible by tensor parallelism degree ({self.ffconfig.tensor_parallelism_degree})"
             )
-        self.build_model(max_tokens_per_batch)
+        self.build_model(max_tokens_per_batch if self.mode == InferenceMode.INC_DECODING_MODE else max_verify_tokens_per_batch)

     def build_model(self, max_tokens_per_batch):
         ffmodel = FFModel(self.ffconfig)
4 changes: 3 additions & 1 deletion python/flexflow/serve/models/opt.py
@@ -23,6 +23,7 @@ def __init__(self, hf_config):
         #self.max_num_tokens = 64
         self.max_beam_width = 1
         self.max_beam_depth = 8
+        self.max_spec_tree_token_num = 64
         self.do_layer_norm_before = hf_config.do_layer_norm_before
         self.dropout = hf_config.dropout
         self.enable_bias = hf_config.enable_bias
@@ -63,6 +64,7 @@ def __init__(
         self.weights_filepath = weights_filepath
         self.tokenizer_filepath = tokenizer_filepath
         self.maxint = 2**31 - 1
+        max_verify_tokens_per_batch = max_tokens_per_batch + self.opt_config.max_spec_tree_token_num

         # Sanity checks
         if self.opt_config.hidden_size % self.opt_config.num_attention_heads != 0:
@@ -82,7 +84,7 @@ def __init__(
                 f"Number of attention heads ({self.opt_config.num_attention_heads}) is smaller, or not divisible by tensor parallelism degree ({self.ffconfig.tensor_parallelism_degree})"
             )

-        self.build_model(max_tokens_per_batch)
+        self.build_model(max_tokens_per_batch if self.mode == InferenceMode.INC_DECODING_MODE else max_verify_tokens_per_batch)

     def build_model(self, max_tokens_per_batch):
         ffmodel = FFModel(self.ffconfig)
5 changes: 4 additions & 1 deletion python/flexflow/serve/models/starcoder.py
@@ -23,6 +23,7 @@ def __init__(self, hf_config):
         #self.max_num_tokens = 64
         self.max_beam_width = 1
         self.max_beam_depth = 8
+        self.max_spec_tree_token_num = 64
         self.dropout_p = hf_config.attn_pdrop
         self.hidden_size = hf_config.n_embd
         self.layer_norm_epsilon = hf_config.layer_norm_epsilon
@@ -61,6 +62,8 @@ def __init__(
         self.weights_filepath = weights_filepath
         self.tokenizer_filepath = tokenizer_filepath
         self.maxint = 2**31 - 1
+        max_verify_tokens_per_batch = max_tokens_per_batch + self.starcoder_config.max_spec_tree_token_num
+

         # Sanity checks
         if (
@@ -84,7 +87,7 @@ def __init__(
                 f"Number of attention heads ({self.starcoder_config.num_attention_heads}) is smaller, or not divisible by tensor parallelism degree ({self.ffconfig.tensor_parallelism_degree})"
             )

-        self.build_model(max_tokens_per_batch)
+        self.build_model(max_tokens_per_batch if self.mode == InferenceMode.INC_DECODING_MODE else max_verify_tokens_per_batch)

     def build_model(self, max_tokens_per_batch):
         ffmodel = FFModel(self.ffconfig)
