diff --git a/src/ops/fused.cu b/src/ops/fused.cu
index ef6c856871..02a4995b0f 100644
--- a/src/ops/fused.cu
+++ b/src/ops/fused.cu
@@ -235,13 +235,15 @@ __host__ void FusedOp::forward_task(Task const *task,
                out_dim * batch_size);
         assert(my_input_accessor[0].domain.get_volume() == in_dim * batch_size);
         float const *bias_ptr = nullptr;
+        LinearMeta *m = (LinearMeta *)metas->meta[op];
         if (fused->op_num_weights[op] == 2) {
           assert(my_weight_accessor[1].domain.get_volume() == out_dim);
-          bias_ptr = my_weight_accessor[1].get_float_ptr();
+          if (!m->add_bias_only_once || task->index_point.point_data[0] == 0) {
+            bias_ptr = my_weight_accessor[1].get_float_ptr();
+          }
         } else {
           assert(fused->op_num_weights[op] == 1);
         }
-        LinearMeta *m = (LinearMeta *)metas->meta[op];
         Kernels::Linear::forward_kernel_wrapper(
             m,
             my_input_accessor[0].get_float_ptr(),
@@ -604,13 +606,15 @@ __host__ void
                out_dim * batch_size);
         assert(my_input_accessor[0].domain.get_volume() == in_dim * batch_size);
         void const *bias_ptr = nullptr;
+        LinearMeta *m = (LinearMeta *)metas->meta[op];
         if (fused->op_num_weights[op] == 2) {
           assert(my_weight_accessor[1].domain.get_volume() == out_dim);
-          bias_ptr = my_weight_accessor[1].ptr;
+          if (!m->add_bias_only_once || task->index_point.point_data[0] == 0) {
+            bias_ptr = my_weight_accessor[1].ptr;
+          }
         } else {
           assert(fused->op_num_weights[op] == 1);
         }
-        LinearMeta *m = (LinearMeta *)metas->meta[op];
         assert(m->input_type[0] == my_input_accessor[0].data_type);
         assert(m->input_type[0] == my_output_accessor[0].data_type);
         batch_size = bc->num_active_tokens();
diff --git a/tests/inference_tests.sh b/tests/inference_tests.sh
index f50d374633..8616bb845e 100755
--- a/tests/inference_tests.sh
+++ b/tests/inference_tests.sh
@@ -87,8 +87,10 @@ fi
 if [ "$TENSOR_PARALLELISM_TESTS" = "ON" ]; then
     # LLAMA (small model)
     ../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --use-full-precision -llm-model llama -llm-weight ../inference/weights/llama_160M_weights/ -llm-config ../inference/models/configs/llama_160M.json -tokenizer ../inference/tokenizer/tokenizer.model -prompt ../inference/prompt/test.json -output-file ../inference/output/incr_decoding_llama_160M_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2
+    ../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --use-full-precision -llm-model llama -llm-weight ../inference/weights/llama_160M_weights/ -llm-config ../inference/models/configs/llama_160M.json -tokenizer ../inference/tokenizer/tokenizer.model -prompt ../inference/prompt/test.json -output-file ../inference/output/incr_decoding_llama_160M_tp4.txt -pipeline-parallelism-degree 1 -tensor-parallelism-degree 4
     # LLAMA (small model, half precision)
     ../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model llama -llm-weight ../inference/weights/llama_160M_weights_half/ -llm-config ../inference/models/configs/llama_160M.json -tokenizer ../inference/tokenizer/tokenizer.model -prompt ../inference/prompt/test.json -output-file ../inference/output/incr_decoding_llama_160M_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2
+    ../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model llama -llm-weight ../inference/weights/llama_160M_weights_half/ -llm-config ../inference/models/configs/llama_160M.json -tokenizer ../inference/tokenizer/tokenizer.model -prompt ../inference/prompt/test.json -output-file ../inference/output/incr_decoding_llama_160M_half_tp4.txt -pipeline-parallelism-degree 1 -tensor-parallelism-degree 4
     # LLAMA (big model)
     ../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --use-full-precision -llm-model llama -llm-weight ../inference/weights/llama_7B_weights/ -llm-config ../inference/models/configs/llama_7B.json -tokenizer ../inference/tokenizer/tokenizer.model -prompt ../inference/prompt/test.json -output-file ../inference/output/incr_decoding_llama_7B_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2
@@ -97,8 +99,10 @@ if [ "$TENSOR_PARALLELISM_TESTS" = "ON" ]; then
     # OPT (small model)
     ../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --use-full-precision -llm-model opt -llm-weight ../inference/weights/opt_125M_weights/ -llm-config ../inference/models/configs/opt_125M.json -tokenizer ../inference/tokenizer/ -prompt ../inference/prompt/test.json -output-file ../inference/output/incr_decoding_opt_125M_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2
+    ../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --use-full-precision -llm-model opt -llm-weight ../inference/weights/opt_125M_weights/ -llm-config ../inference/models/configs/opt_125M.json -tokenizer ../inference/tokenizer/ -prompt ../inference/prompt/test.json -output-file ../inference/output/incr_decoding_opt_125M_tp4.txt -pipeline-parallelism-degree 1 -tensor-parallelism-degree 4
     # OPT (small model, half precision)
     ../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model opt -llm-weight ../inference/weights/opt_125M_weights_half/ -llm-config ../inference/models/configs/opt_125M.json -tokenizer ../inference/tokenizer/ -prompt ../inference/prompt/test.json -output-file ../inference/output/incr_decoding_opt_125M_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2
+    ../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model opt -llm-weight ../inference/weights/opt_125M_weights_half/ -llm-config ../inference/models/configs/opt_125M.json -tokenizer ../inference/tokenizer/ -prompt ../inference/prompt/test.json -output-file ../inference/output/incr_decoding_opt_125M_half_tp4.txt -pipeline-parallelism-degree 1 -tensor-parallelism-degree 4
     # OPT (big model)
     ../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --use-full-precision -llm-model opt -llm-weight ../inference/weights/opt_6B_weights/ -llm-config ../inference/models/configs/opt_6B.json -tokenizer ../inference/tokenizer/ -prompt ../inference/prompt/test.json -output-file ../inference/output/incr_decoding_opt_6B_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2
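For context on the `fused.cu` change: the `LinearMeta` lookup is hoisted above the bias branch so the `add_bias_only_once` flag can gate the bias pointer, with only the shard at index point 0 supplying a bias. Presumably this is because, under tensor parallelism, each shard computes a partial output for the linear layer and the partials are summed across shards; if every shard added the bias, the combined result would contain one copy of the bias per shard. The following is a minimal standalone sketch of that over-counting argument, not FlexFlow code; `tp_degree`, `partial`, and the plain loop standing in for the all-reduce are all illustrative assumptions.

```cpp
// Sketch (not FlexFlow code): why only one tensor-parallel shard may add the
// bias. Each shard produces a partial dot-product; summing the partials
// stands in for the all-reduce that combines shard outputs.
#include <cassert>

int main() {
  int const tp_degree = 4;    // illustrative tensor-parallelism degree
  float const bias = 1.0f;    // bias for one output element
  float const partial = 2.0f; // each shard's partial dot-product

  // Gated: only shard 0 adds the bias (mirrors the diff's
  // `!m->add_bias_only_once || task->index_point.point_data[0] == 0`).
  bool const add_bias_only_once = true;
  float reduced_gated = 0.0f;
  for (int shard = 0; shard < tp_degree; shard++) {
    float out = partial;
    if (!add_bias_only_once || shard == 0) {
      out += bias;
    }
    reduced_gated += out; // stand-in for the all-reduce
  }

  // Ungated: every shard adds the bias.
  float reduced_ungated = 0.0f;
  for (int shard = 0; shard < tp_degree; shard++) {
    reduced_ungated += partial + bias;
  }

  assert(reduced_gated == tp_degree * partial + bias);               // one bias: correct
  assert(reduced_ungated == tp_degree * partial + tp_degree * bias); // bias counted tp_degree times
  return 0;
}
```

This also explains the test-suite half of the diff: the new `-tensor-parallelism-degree 4` runs exercise the gated path on more than two shards, where the over-counting would be most visible.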