
Merge branch 'inference' into batch-config
jiazhihao authored Jul 27, 2023
2 parents 3250585 + aef158a commit 61d674e
Showing 2 changed files with 12 additions and 4 deletions.
12 changes: 8 additions & 4 deletions src/ops/fused.cu
@@ -237,13 +237,15 @@ __host__ void FusedOp::forward_task(Task const *task,
                out_dim * batch_size);
         assert(my_input_accessor[0].domain.get_volume() == in_dim * batch_size);
         float const *bias_ptr = nullptr;
+        LinearMeta *m = (LinearMeta *)metas->meta[op];
         if (fused->op_num_weights[op] == 2) {
           assert(my_weight_accessor[1].domain.get_volume() == out_dim);
-          bias_ptr = my_weight_accessor[1].get_float_ptr();
+          if (!m->add_bias_only_once || task->index_point.point_data[0] == 0) {
+            bias_ptr = my_weight_accessor[1].get_float_ptr();
+          }
         } else {
           assert(fused->op_num_weights[op] == 1);
         }
-        LinearMeta *m = (LinearMeta *)metas->meta[op];
         Kernels::Linear::forward_kernel_wrapper(
             m,
             my_input_accessor[0].get_float_ptr(),
@@ -612,13 +614,15 @@ __host__ void
                out_dim * batch_size);
         assert(my_input_accessor[0].domain.get_volume() == in_dim * batch_size);
         void const *bias_ptr = nullptr;
+        LinearMeta *m = (LinearMeta *)metas->meta[op];
         if (fused->op_num_weights[op] == 2) {
           assert(my_weight_accessor[1].domain.get_volume() == out_dim);
-          bias_ptr = my_weight_accessor[1].ptr;
+          if (!m->add_bias_only_once || task->index_point.point_data[0] == 0) {
+            bias_ptr = my_weight_accessor[1].ptr;
+          }
         } else {
           assert(fused->op_num_weights[op] == 1);
         }
-        LinearMeta *m = (LinearMeta *)metas->meta[op];
         assert(m->input_type[0] == my_input_accessor[0].data_type);
         assert(m->input_type[0] == my_output_accessor[0].data_type);
         batch_size = bc->num_active_tokens();
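Why the guard in the two hunks above: under tensor parallelism the bias weight is replicated across shards, while each shard produces only a partial output that is later reduced across shards. If every shard added the bias before the reduction, the sum would contain the bias once per shard; so when add_bias_only_once is set, only the shard at index point 0 passes a non-null bias pointer. Below is a minimal standalone sketch of that pattern; the names (apply_bias_once, shard_id) are illustrative, not FlexFlow's actual API:

#include <cassert>
#include <cstddef>
#include <vector>

// Sketch (hypothetical names): each tensor-parallel shard holds a partial
// output that will later be summed across shards. Adding the replicated
// bias on every shard would accumulate it once per shard, so only shard 0
// applies it, mirroring the index_point check in fused.cu.
void apply_bias_once(std::vector<float> &partial_output,
                     std::vector<float> const &bias,
                     int shard_id,
                     bool add_bias_only_once) {
  assert(partial_output.size() == bias.size());
  if (!add_bias_only_once || shard_id == 0) {
    for (std::size_t i = 0; i < partial_output.size(); ++i) {
      partial_output[i] += bias[i];
    }
  }
}

Hoisting LinearMeta *m above the if block is what makes the flag reachable at the point where bias_ptr is set; the kernel wrapper call itself is unchanged.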
4 changes: 4 additions & 0 deletions tests/inference_tests.sh
@@ -87,8 +87,10 @@ fi
 if [ "$TENSOR_PARALLELISM_TESTS" = "ON" ]; then
     # LLAMA (small model)
     ../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --use-full-precision -llm-model llama -llm-weight ../inference/weights/llama_160M_weights/ -llm-config ../inference/models/configs/llama_160M.json -tokenizer ../inference/tokenizer/tokenizer.model -prompt ../inference/prompt/test.json -output-file ../inference/output/incr_decoding_llama_160M_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2
+    ../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --use-full-precision -llm-model llama -llm-weight ../inference/weights/llama_160M_weights/ -llm-config ../inference/models/configs/llama_160M.json -tokenizer ../inference/tokenizer/tokenizer.model -prompt ../inference/prompt/test.json -output-file ../inference/output/incr_decoding_llama_160M_tp4.txt -pipeline-parallelism-degree 1 -tensor-parallelism-degree 4
     # LLAMA (small model, half precision)
     ../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model llama -llm-weight ../inference/weights/llama_160M_weights_half/ -llm-config ../inference/models/configs/llama_160M.json -tokenizer ../inference/tokenizer/tokenizer.model -prompt ../inference/prompt/test.json -output-file ../inference/output/incr_decoding_llama_160M_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2
+    ../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model llama -llm-weight ../inference/weights/llama_160M_weights_half/ -llm-config ../inference/models/configs/llama_160M.json -tokenizer ../inference/tokenizer/tokenizer.model -prompt ../inference/prompt/test.json -output-file ../inference/output/incr_decoding_llama_160M_half_tp4.txt -pipeline-parallelism-degree 1 -tensor-parallelism-degree 4
 
     # LLAMA (big model)
     ../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --use-full-precision -llm-model llama -llm-weight ../inference/weights/llama_7B_weights/ -llm-config ../inference/models/configs/llama_7B.json -tokenizer ../inference/tokenizer/tokenizer.model -prompt ../inference/prompt/test.json -output-file ../inference/output/incr_decoding_llama_7B_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2
@@ -97,8 +99,10 @@ if [ "$TENSOR_PARALLELISM_TESTS" = "ON" ]; then
 
     # OPT (small model)
     ../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --use-full-precision -llm-model opt -llm-weight ../inference/weights/opt_125M_weights/ -llm-config ../inference/models/configs/opt_125M.json -tokenizer ../inference/tokenizer/ -prompt ../inference/prompt/test.json -output-file ../inference/output/incr_decoding_opt_125M_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2
+    ../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --use-full-precision -llm-model opt -llm-weight ../inference/weights/opt_125M_weights/ -llm-config ../inference/models/configs/opt_125M.json -tokenizer ../inference/tokenizer/ -prompt ../inference/prompt/test.json -output-file ../inference/output/incr_decoding_opt_125M_tp4.txt -pipeline-parallelism-degree 1 -tensor-parallelism-degree 4
     # OPT (small model, half precision)
     ../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model opt -llm-weight ../inference/weights/opt_125M_weights_half/ -llm-config ../inference/models/configs/opt_125M.json -tokenizer ../inference/tokenizer/ -prompt ../inference/prompt/test.json -output-file ../inference/output/incr_decoding_opt_125M_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2
+    ../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model opt -llm-weight ../inference/weights/opt_125M_weights_half/ -llm-config ../inference/models/configs/opt_125M.json -tokenizer ../inference/tokenizer/ -prompt ../inference/prompt/test.json -output-file ../inference/output/incr_decoding_opt_125M_half_tp4.txt -pipeline-parallelism-degree 1 -tensor-parallelism-degree 4
 
     # OPT (big model)
     ../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --use-full-precision -llm-model opt -llm-weight ../inference/weights/opt_6B_weights/ -llm-config ../inference/models/configs/opt_6B.json -tokenizer ../inference/tokenizer/ -prompt ../inference/prompt/test.json -output-file ../inference/output/incr_decoding_opt_6B_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2
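A note on the added test configurations: every run requests four GPUs via -ll:gpu 4, and the degrees are chosen so that pipeline-parallelism-degree times tensor-parallelism-degree equals that GPU count (2 x 2 for the existing runs, 1 x 4 for the new ones). The check below only illustrates that invariant, under the assumption that the launcher requires model parallelism to tile the available GPUs exactly; the struct and field names are hypothetical, not the binary's actual configuration code:

#include <cassert>

// Hypothetical config mirroring the command-line flags above.
struct ParallelConfig {
  int num_gpus;                    // -ll:gpu
  int pipeline_parallelism_degree; // -pipeline-parallelism-degree
  int tensor_parallelism_degree;   // -tensor-parallelism-degree
};

// Assumed invariant: the two degrees must exactly tile the GPUs.
void validate(ParallelConfig const &c) {
  assert(c.pipeline_parallelism_degree * c.tensor_parallelism_degree ==
         c.num_gpus);
}

int main() {
  validate({4, 2, 2}); // existing tests: 2-way pipeline x 2-way tensor
  validate({4, 1, 4}); // new tests: pure 4-way tensor parallelism
  return 0;
}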
