From 64c258f3b43e19025889d728799d2bdedde9f732 Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Fri, 27 Sep 2024 11:59:38 -0700 Subject: [PATCH 1/3] [FusedOp] Fix segment fault (#1511) * minor bug fix * fix --- src/ops/fused.cu | 69 ++++++++++++++++++++++++------------------------ 1 file changed, 35 insertions(+), 34 deletions(-) diff --git a/src/ops/fused.cu b/src/ops/fused.cu index cab28181da..8f1212beb4 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -1678,77 +1678,77 @@ __host__ void FusedOp::backward_task(Task const *task, int sum = fused->numInputs + fused->numWeights + fused->numOutputs; assert(sum * 2 == (int)regions.size()); } - GenericTensorAccessorR input_accessor[MAX_NUM_INPUTS]; - GenericTensorAccessorW input_grad_accessor[MAX_NUM_INPUTS]; - GenericTensorAccessorR weight_accessor[MAX_NUM_WEIGHTS]; - GenericTensorAccessorW weight_grad_accessor[MAX_NUM_WEIGHTS]; - GenericTensorAccessorR output_accessor[MAX_NUM_OUTPUTS]; - GenericTensorAccessorW output_grad_accessor[MAX_NUM_OUTPUTS]; + std::vector input_accessor; + std::vector input_grad_accessor; + std::vector weight_accessor; + std::vector weight_grad_accessor; + std::vector output_accessor; + std::vector output_grad_accessor; int roff = 0; assert(fused->numInputs <= MAX_NUM_INPUTS); for (int i = 0; i < fused->numInputs; i++) { - input_accessor[i] = + input_accessor.push_back( helperGetGenericTensorAccessorRO(fused->input_data_types[i], regions[i], task->regions[i], FID_DATA, ctx, - runtime); + runtime)); } roff += fused->numInputs; assert(fused->numWeights <= MAX_NUM_WEIGHTS); for (int i = 0; i < fused->numWeights; i++) { - weight_accessor[i] = + weight_accessor.push_back( helperGetGenericTensorAccessorRO(fused->weight_data_types[i], regions[i + roff], task->regions[i + roff], FID_DATA, ctx, - runtime); + runtime)); } roff += fused->numWeights; assert(fused->numOutputs <= MAX_NUM_OUTPUTS); for (int i = 0; i < fused->numOutputs; i++) { - output_accessor[i] = + output_accessor.push_back( helperGetGenericTensorAccessorRO(fused->output_data_types[i], regions[i + roff], task->regions[i + roff], FID_DATA, ctx, - runtime); + runtime)); } roff += fused->numOutputs; for (int i = 0; i < fused->numInputs; i++) { - input_grad_accessor[i] = + input_grad_accessor.push_back( helperGetGenericTensorAccessorRW(fused->input_data_types[i], regions[i + roff], task->regions[i + roff], FID_DATA, ctx, - runtime); + runtime)); assert(input_grad_accessor[i].domain == input_accessor[i].domain); } roff += fused->numInputs; for (int i = 0; i < fused->numWeights; i++) { - weight_grad_accessor[i] = + weight_grad_accessor.push_back( helperGetGenericTensorAccessorRW(fused->weight_data_types[i], regions[i + roff], task->regions[i + roff], FID_DATA, ctx, - runtime); + runtime)); assert(weight_grad_accessor[i].domain.get_volume() == weight_accessor[i].domain.get_volume()); } roff += fused->numWeights; for (int i = 0; i < fused->numOutputs; i++) { - output_grad_accessor[i] = + output_grad_accessor.push_back( helperGetGenericTensorAccessorRW(fused->output_data_types[i], regions[i + roff], task->regions[i + roff], FID_DATA, ctx, - runtime); + runtime)); assert(output_grad_accessor[i].domain == output_accessor[i].domain); } roff += fused->numOutputs; @@ -1767,12 +1767,6 @@ __host__ void FusedOp::backward_task(Task const *task, } int ioff = 0, woff = 0, ooff = 0; - GenericTensorAccessorR my_input_accessor[MAX_NUM_INPUTS]; - GenericTensorAccessorR my_weight_accessor[MAX_NUM_WEIGHTS]; - GenericTensorAccessorR my_output_accessor[MAX_NUM_OUTPUTS]; - 
GenericTensorAccessorW my_input_grad_accessor[MAX_NUM_INPUTS]; - GenericTensorAccessorW my_weight_grad_accessor[MAX_NUM_WEIGHTS]; - GenericTensorAccessorW my_output_grad_accessor[MAX_NUM_OUTPUTS]; // Do backpropagation in the reverse ordering for (int op = 0; op < fused->numOperators; op++) { ioff += fused->op_num_inputs[op]; @@ -1781,18 +1775,24 @@ __host__ void FusedOp::backward_task(Task const *task, } for (int op = fused->numOperators - 1; op >= 0; op--) { + std::vector my_input_accessor; + std::vector my_weight_accessor; + std::vector my_output_accessor; + std::vector my_input_grad_accessor; + std::vector my_weight_grad_accessor; + std::vector my_output_grad_accessor; ioff -= fused->op_num_inputs[op]; woff -= fused->op_num_weights[op]; ooff -= fused->op_num_outputs[op]; for (int i = 0; i < fused->op_num_inputs[op]; i++) { int my_off = fused->op_input_idx[i + ioff]; if (fused->op_input_source[i + ioff] == SOURCE_INPUT) { - my_input_accessor[i] = input_accessor[my_off]; - my_input_grad_accessor[i] = input_grad_accessor[my_off]; + my_input_accessor.push_back(input_accessor[my_off]); + my_input_grad_accessor.push_back(input_grad_accessor[my_off]); assert(my_input_grad_accessor[i].domain == my_input_accessor[i].domain); } else if (fused->op_input_source[i + ioff] == SOURCE_OUTPUT) { - my_input_accessor[i] = output_accessor[my_off]; - my_input_grad_accessor[i] = output_grad_accessor[my_off]; + my_input_accessor.push_back(output_accessor[my_off]); + my_input_grad_accessor.push_back(output_grad_accessor[my_off]); assert(my_input_grad_accessor[i].domain == my_input_accessor[i].domain); } else { assert(false); @@ -1800,17 +1800,18 @@ __host__ void FusedOp::backward_task(Task const *task, } for (int i = 0; i < fused->op_num_weights[op]; i++) { assert(fused->op_weight_source[i + woff] == SOURCE_WEIGHT); - my_weight_accessor[i] = weight_accessor[fused->op_weight_idx[i + woff]]; - my_weight_grad_accessor[i] = - weight_grad_accessor[fused->op_weight_idx[i + woff]]; + my_weight_accessor.push_back( + weight_accessor[fused->op_weight_idx[i + woff]]); + my_weight_grad_accessor.push_back( + weight_grad_accessor[fused->op_weight_idx[i + woff]]); assert(my_weight_grad_accessor[i].domain.get_volume() == my_weight_accessor[i].domain.get_volume()); } for (int i = 0; i < fused->op_num_outputs[op]; i++) { assert(fused->op_output_source[i + ooff] == SOURCE_OUTPUT); int my_off = fused->op_output_idx[i + ooff]; - my_output_accessor[i] = output_accessor[my_off]; - my_output_grad_accessor[i] = output_grad_accessor[my_off]; + my_output_accessor.push_back(output_accessor[my_off]); + my_output_grad_accessor.push_back(output_grad_accessor[my_off]); assert(my_output_grad_accessor[i].domain == my_output_accessor[i].domain); } switch (fused->op_op_type[op]) { @@ -1880,7 +1881,7 @@ __host__ void FusedOp::backward_task(Task const *task, int num_inputs = fused->op_num_inputs[op]; Kernels::Concat::backward_kernel_wrapper(m, my_output_grad_accessor[0], - my_input_grad_accessor, + my_input_grad_accessor.data(), num_inputs, m->legion_axis); break; From c78cf04d348aa242c891c783e880e90806c88344 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Tue, 1 Oct 2024 20:03:18 -0700 Subject: [PATCH 2/3] enable disabling inference (#1516) --- .github/workflows/build.yml | 12 ++-- .github/workflows/gpu-ci.yml | 6 +- CMakeLists.txt | 105 ++++++++++++----------------------- config/config.inc | 20 +++---- config/config.linux | 6 +- spack/package.py | 4 +- src/c/flexflow_c.cc | 12 ++++ src/ops/beam_topk.cu | 2 +- src/runtime/model.cc | 4 ++ 9 
files changed, 77 insertions(+), 94 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index ef5961bc87..63e0b9037a 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -79,13 +79,13 @@ jobs: export FF_CUDA_ARCH=70 export FF_HIP_ARCH=gfx1100,gfx1036 export hip_version=5.6 - export FF_BUILD_ALL_INFERENCE_EXAMPLES=ON + export FF_BUILD_INFERENCE=ON if [[ "${FF_GPU_BACKEND}" == "cuda" ]]; then - export FF_BUILD_ALL_EXAMPLES=ON + export FF_BUILD_TRAINING_EXAMPLES=ON export FF_BUILD_UNIT_TESTS=ON else - export FF_BUILD_ALL_EXAMPLES=OFF + export FF_BUILD_TRAINING_EXAMPLES=OFF export FF_BUILD_UNIT_TESTS=OFF fi @@ -106,13 +106,13 @@ jobs: export FF_CUDA_ARCH=70 export FF_HIP_ARCH=gfx1100,gfx1036 export hip_version=5.6 - export FF_BUILD_ALL_INFERENCE_EXAMPLES=ON + export FF_BUILD_INFERENCE=ON if [[ "${FF_GPU_BACKEND}" == "cuda" ]]; then - export FF_BUILD_ALL_EXAMPLES=ON + export FF_BUILD_TRAINING_EXAMPLES=ON export FF_BUILD_UNIT_TESTS=ON else - export FF_BUILD_ALL_EXAMPLES=OFF + export FF_BUILD_TRAINING_EXAMPLES=OFF export FF_BUILD_UNIT_TESTS=OFF fi diff --git a/.github/workflows/gpu-ci.yml b/.github/workflows/gpu-ci.yml index 00ca2df603..6ca50027d1 100644 --- a/.github/workflows/gpu-ci.yml +++ b/.github/workflows/gpu-ci.yml @@ -175,7 +175,7 @@ jobs: export PATH=$CONDA_PREFIX/bin:$PATH export FF_HOME=$(pwd) export FF_USE_PREBUILT_LEGION=OFF #remove this after fixing python path issue in Legion - export FF_BUILD_ALL_INFERENCE_EXAMPLES=ON + export FF_BUILD_INFERENCE=ON mkdir build cd build ../config/config.linux @@ -262,8 +262,8 @@ jobs: run: | export PATH=$CONDA_PREFIX/bin:$PATH export FF_HOME=$(pwd) - export FF_BUILD_ALL_EXAMPLES=ON - export FF_BUILD_ALL_INFERENCE_EXAMPLES=ON + export FF_BUILD_TRAINING_EXAMPLES=ON + export FF_BUILD_INFERENCE=ON export FF_USE_PREBUILT_LEGION=OFF #remove this after fixing python path issue in Legion pip install . --verbose diff --git a/CMakeLists.txt b/CMakeLists.txt index f06969ae04..4e24e1e54b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -181,6 +181,14 @@ include(legion) # Not build FlexFlow if BUILD_LEGION_ONLY is ON if(NOT BUILD_LEGION_ONLY) + + # build binary options + option(FF_BUILD_INFERENCE "build all inference code and examples." ON) + option(FF_BUILD_TRAINING_EXAMPLES "build all training examples." 
OFF) + option(FF_BUILD_UNIT_TESTS "build non-operator unit tests" OFF) + option(FF_BUILD_SUBSTITUTION_TOOL "build substitution conversion tool" OFF) + option(FF_BUILD_VISUALIZATION_TOOL "build substitution visualization tool" OFF) + # NCCL if(FF_USE_NCCL) if(FF_GPU_BACKEND STREQUAL "hip_cuda" OR FF_GPU_BACKEND STREQUAL "cuda") @@ -271,18 +279,23 @@ if(NOT BUILD_LEGION_ONLY) file(GLOB_RECURSE FLEXFLOW_HDR LIST_DIRECTORIES False ${FLEXFLOW_ROOT}/include/*.h) - - #list(APPEND FLEXFLOW_HDR ${FLEXFLOW_ROOT}/inference/file_loader.h) file(GLOB_RECURSE FLEXFLOW_SRC LIST_DIRECTORIES False ${FLEXFLOW_ROOT}/src/*.cc) - list(REMOVE_ITEM FLEXFLOW_SRC "${FLEXFLOW_ROOT}/src/runtime/cpp_driver.cc") - #list(APPEND FLEXFLOW_SRC ${FLEXFLOW_ROOT}/inference/file_loader.cc) - set(FLEXFLOW_CPP_DRV_SRC - ${FLEXFLOW_ROOT}/src/runtime/cpp_driver.cc) + # exclude inference files if FF_BUILD_INFERENCE is off + if(NOT FF_BUILD_INFERENCE) + list(REMOVE_ITEM FLEXFLOW_HDR "${FLEXFLOW_ROOT}/include/request_manager.h") + list(REMOVE_ITEM FLEXFLOW_SRC "${FLEXFLOW_ROOT}/src/runtime/request_manager.cc") + list(REMOVE_ITEM FLEXFLOW_SRC "${FLEXFLOW_ROOT}/src/runtime/inference_manager.cc") + list(REMOVE_ITEM FLEXFLOW_SRC "${FLEXFLOW_ROOT}/src/runtime/batch_config.cc") + list(REMOVE_ITEM FLEXFLOW_SRC "${FLEXFLOW_ROOT}/src/runtime/beam_search_batch_config.cc") + list(REMOVE_ITEM FLEXFLOW_SRC "${FLEXFLOW_ROOT}/src/runtime/tree_verify_batch_config.cc") + endif() + + set(FLEXFLOW_CPP_DRV_SRC ${FLEXFLOW_ROOT}/src/runtime/cpp_driver.cc) add_library(substitution_loader SHARED ${FLEXFLOW_ROOT}/src/runtime/substitution_loader.cc) @@ -297,6 +310,10 @@ if(NOT BUILD_LEGION_ONLY) file(GLOB_RECURSE FLEXFLOW_GPU_SRC LIST_DIRECTORIES False ${FLEXFLOW_ROOT}/src/*.cu) + + if(NOT FF_BUILD_INFERENCE) + list(REMOVE_ITEM FLEXFLOW_GPU_SRC "${FLEXFLOW_ROOT}/src/runtime/request_manager.cu") + endif() add_compile_definitions(FF_USE_CUDA) @@ -452,27 +469,6 @@ if(NOT BUILD_LEGION_ONLY) set_property(TARGET flexflow PROPERTY CXX_STANDARD 14) endif() - # build binary - option(FF_BUILD_TOKENIZER "build tokenizer=cpp for LLM serving" OFF) - option(FF_BUILD_RESNET "build resnet example" OFF) - option(FF_BUILD_RESNEXT "build resnext example" OFF) - option(FF_BUILD_ALEXNET "build alexnet example" OFF) - option(FF_BUILD_DLRM "build DLRM example" OFF) - option(FF_BUILD_XDL "build XDL example" OFF) - option(FF_BUILD_INCEPTION "build inception example" OFF) - option(FF_BUILD_CANDLE_UNO "build candle uno example" OFF) - option(FF_BUILD_TRANSFORMER "build transformer example" OFF) - option(FF_BUILD_MOE "build mixture of experts example" OFF) - option(FF_BUILD_MLP_UNIFY "build mlp unify example" OFF) - option(FF_BUILD_SPLIT_TEST "build split test example" OFF) - option(FF_BUILD_SPLIT_TEST_2 "build split test 2 example" OFF) - option(FF_BUILD_MLP_UNIFY_INFERENCE "build mlp unify inference example" OFF) - option(FF_BUILD_ALL_INFERENCE_EXAMPLES "build all inference examples. Overrides others" OFF) - option(FF_BUILD_ALL_EXAMPLES "build all examples. 
Overrides others" OFF) - option(FF_BUILD_UNIT_TESTS "build non-operator unit tests" OFF) - option(FF_BUILD_SUBSTITUTION_TOOL "build substitution conversion tool" OFF) - option(FF_BUILD_VISUALIZATION_TOOL "build substitution visualization tool" OFF) - if(FF_BUILD_UNIT_TESTS) set(BUILD_GMOCK OFF) add_subdirectory(deps/googletest) @@ -488,89 +484,60 @@ if(NOT BUILD_LEGION_ONLY) add_subdirectory(tools/substitutions_to_dot) endif() - if(FF_BUILD_ALL_INFERENCE_EXAMPLES OR FF_BUILD_TOKENIZER) + if(FF_BUILD_INFERENCE) + add_compile_definitions(FF_BUILD_INFERENCE) # Ensure Rust is installed execute_process(COMMAND rustc --version RESULT_VARIABLE RUST_COMMAND_RESULT OUTPUT_VARIABLE RUSTC_OUTPUT ERROR_QUIET) if(NOT RUST_COMMAND_RESULT EQUAL 0) - message(FATAL_ERROR "Rust is not installed on the system. Please install it by running: 'curl https://sh.rustup.rs -sSf | sh -s -- -y' and following the instructions on the screen.") + message(FATAL_ERROR + "Rust is not installed on the system. Please install it by running: \n" + "'curl https://sh.rustup.rs -sSf | sh -s -- -y' \n" + "and follow the instructions on the screen.") endif() # Ensure Cargo is installed execute_process(COMMAND cargo --version RESULT_VARIABLE CARGO_RESULT OUTPUT_QUIET ERROR_QUIET) if(NOT CARGO_RESULT EQUAL 0) - message(FATAL_ERROR "Rust is installed, but cargo is not. Please install it by running: 'curl https://sh.rustup.rs -sSf | sh -s -- -y' and following the instructions on the screen.") + message(FATAL_ERROR + "Rust is installed, but cargo is not. Please install it by running: \n" + "'curl https://sh.rustup.rs -sSf | sh -s -- -y' \n" + "and follow the instructions on the screen.") endif() set(MLC_ENABLE_SENTENCEPIECE_TOKENIZER ON) add_subdirectory(deps/tokenizers-cpp tokenizers EXCLUDE_FROM_ALL) target_include_directories(flexflow PUBLIC deps/tokenizers-cpp/include) target_link_libraries(flexflow tokenizers_cpp) endif() - if(FF_BUILD_RESNET OR FF_BUILD_ALL_EXAMPLES) + + if (FF_BUILD_TRAINING_EXAMPLES) add_subdirectory(examples/cpp/ResNet) - endif() - - if(FF_BUILD_RESNEXT OR FF_BUILD_ALL_EXAMPLES) add_subdirectory(examples/cpp/resnext50) - endif() - - if(FF_BUILD_ALEXNET OR FF_BUILD_ALL_EXAMPLES) add_subdirectory(examples/cpp/AlexNet) - endif() - - if(FF_BUILD_MLP_UNIFY OR FF_BUILD_ALL_EXAMPLES) add_subdirectory(examples/cpp/MLP_Unify) - endif() - - if(FF_BUILD_SPLIT_TEST OR FF_BUILD_ALL_EXAMPLES) add_subdirectory(examples/cpp/split_test) - endif() - - if(FF_BUILD_SPLIT_TEST_2 OR FF_BUILD_ALL_EXAMPLES) add_subdirectory(examples/cpp/split_test_2) - endif() - - if(FF_BUILD_INCEPTION OR FF_BUILD_ALL_EXAMPLES) add_subdirectory(examples/cpp/InceptionV3) - endif() - - #TODO: Once functional add to BUILD_ALL_EXAMPLES - if(FF_BUILD_CANDLE_UNO OR FF_BUILD_ALL_EXAMPLES) add_subdirectory(examples/cpp/candle_uno) - endif() - - if(FF_BUILD_DLRM OR FF_BUILD_ALL_EXAMPLES) add_subdirectory(examples/cpp/DLRM) - #add_executable(generate_dlrm_hetero_strategy src/runtime/dlrm_strategy_hetero.cc) #target_include_directories(generate_dlrm_hetero_strategy PUBLIC ${FLEXFLOW_INCLUDE_DIRS}) - #add_executable(generate_dlrm_strategy src/runtime/dlrm_strategy.cc) #target_include_directories(generate_dlrm_strategy PUBLIC ${FLEXFLOW_INCLUDE_DIRS}) - endif() - - if(FF_BUILD_XDL OR FF_BUILD_ALL_EXAMPLES) add_subdirectory(examples/cpp/XDL) - endif() - - if(FF_BUILD_TRANSFORMER OR FF_BUILD_ALL_EXAMPLES) add_subdirectory(examples/cpp/Transformer) - endif() - - if(FF_BUILD_MOE OR FF_BUILD_ALL_EXAMPLES) add_subdirectory(examples/cpp/mixture_of_experts) endif() - 
if(FF_BUILD_ALL_INFERENCE_EXAMPLES OR FF_BUILD_ALL_EXAMPLES) + if(FF_BUILD_INFERENCE) add_subdirectory(inference/spec_infer) add_subdirectory(inference/incr_decoding) add_subdirectory(inference/peft) endif() - # installation set(INCLUDE_DEST "include") set(LIB_DEST "lib") diff --git a/config/config.inc b/config/config.inc index 6431eaf136..011fe890fb 100644 --- a/config/config.inc +++ b/config/config.inc @@ -128,19 +128,19 @@ elif [ "$FF_LEGION_NETWORKS" = "ucx" ]; then fi # build C++ examples -if [ "$FF_BUILD_ALL_EXAMPLES" = "ON" ]; then - SET_EXAMPLES="-DFF_BUILD_ALL_EXAMPLES=ON" -elif [ "$FF_BUILD_ALL_EXAMPLES" = "OFF" ]; then - SET_EXAMPLES="-DFF_BUILD_ALL_EXAMPLES=OFF" +if [ "$FF_BUILD_TRAINING_EXAMPLES" = "ON" ]; then + SET_EXAMPLES="-DFF_BUILD_TRAINING_EXAMPLES=ON" +elif [ "$FF_BUILD_TRAINING_EXAMPLES" = "OFF" ]; then + SET_EXAMPLES="-DFF_BUILD_TRAINING_EXAMPLES=OFF" else - SET_EXAMPLES="-DFF_BUILD_ALL_EXAMPLES=ON" + SET_EXAMPLES="-DFF_BUILD_TRAINING_EXAMPLES=ON" fi -if [ "$FF_BUILD_ALL_INFERENCE_EXAMPLES" = "ON" ]; then - SET_INFERENCE_EXAMPLES="-DFF_BUILD_ALL_INFERENCE_EXAMPLES=ON" -elif [ "$FF_BUILD_ALL_INFERENCE_EXAMPLES" = "OFF" ]; then - SET_INFERENCE_EXAMPLES="-DFF_BUILD_ALL_INFERENCE_EXAMPLES=OFF" +if [ "$FF_BUILD_INFERENCE" = "ON" ]; then + SET_INFERENCE_EXAMPLES="-DFF_BUILD_INFERENCE=ON" +elif [ "$FF_BUILD_INFERENCE" = "OFF" ]; then + SET_INFERENCE_EXAMPLES="-DFF_BUILD_INFERENCE=OFF" else - SET_INFERENCE_EXAMPLES="-DFF_BUILD_ALL_INFERENCE_EXAMPLES=ON" + SET_INFERENCE_EXAMPLES="-DFF_BUILD_INFERENCE=ON" fi # enable C++ unit tests diff --git a/config/config.linux b/config/config.linux index acffc210f5..09976cfa03 100755 --- a/config/config.linux +++ b/config/config.linux @@ -65,8 +65,8 @@ FF_GASNET_CONDUIT=${FF_GASNET_CONDUIT:-ibv} UCX_DIR=${UCX_DIR:-""} # build C++ examples -FF_BUILD_ALL_EXAMPLES=${FF_BUILD_ALL_EXAMPLES:-OFF} -FF_BUILD_ALL_INFERENCE_EXAMPLES=${FF_BUILD_ALL_INFERENCE_EXAMPLES:-ON} +FF_BUILD_TRAINING_EXAMPLES=${FF_BUILD_TRAINING_EXAMPLES:-OFF} +FF_BUILD_INFERENCE=${FF_BUILD_INFERENCE:-ON} # build C++ unit tests FF_BUILD_UNIT_TESTS=${FF_BUILD_UNIT_TESTS:-OFF} @@ -108,7 +108,7 @@ fi function get_build_configs() { # Create a string with the values of the variables set in this script - BUILD_CONFIGS="FF_CUDA_ARCH=${FF_CUDA_ARCH} FF_HIP_ARCH=${FF_HIP_ARCH} CUDA_DIR=${CUDA_DIR} CUDNN_DIR=${CUDNN_DIR} CUBLAS_DIR=${CUBLAS_DIR} CURAND_DIR=${CURAND_DIR} NCCL_DIR=${NCCL_DIR} FF_USE_PYTHON=${FF_USE_PYTHON} BUILD_LEGION_ONLY=${BUILD_LEGION_ONLY} FF_GASNET_CONDUIT=${FF_GASNET_CONDUIT} UCX_DIR=${UCX_DIR} FF_LEGION_NETWORKS=${FF_LEGION_NETWORKS} FF_BUILD_ALL_EXAMPLES=${FF_BUILD_ALL_EXAMPLES} FF_BUILD_ALL_INFERENCE_EXAMPLES=${FF_BUILD_ALL_INFERENCE_EXAMPLES} FF_BUILD_UNIT_TESTS=${FF_BUILD_UNIT_TESTS} FF_USE_PREBUILT_NCCL=${FF_USE_PREBUILT_NCCL} FF_USE_PREBUILT_LEGION=${FF_USE_PREBUILT_LEGION} FF_USE_ALL_PREBUILT_LIBRARIES=${FF_USE_ALL_PREBUILT_LIBRARIES} FF_USE_AVX2=${FF_USE_AVX2} FF_MAX_DIM=${FF_MAX_DIM} ROCM_PATH=${ROCM_PATH} FF_GPU_BACKEND=${FF_GPU_BACKEND} INSTALL_DIR=${INSTALL_DIR}" + BUILD_CONFIGS="FF_CUDA_ARCH=${FF_CUDA_ARCH} FF_HIP_ARCH=${FF_HIP_ARCH} CUDA_DIR=${CUDA_DIR} CUDNN_DIR=${CUDNN_DIR} CUBLAS_DIR=${CUBLAS_DIR} CURAND_DIR=${CURAND_DIR} NCCL_DIR=${NCCL_DIR} FF_USE_PYTHON=${FF_USE_PYTHON} BUILD_LEGION_ONLY=${BUILD_LEGION_ONLY} FF_GASNET_CONDUIT=${FF_GASNET_CONDUIT} UCX_DIR=${UCX_DIR} FF_LEGION_NETWORKS=${FF_LEGION_NETWORKS} FF_BUILD_TRAINING_EXAMPLES=${FF_BUILD_TRAINING_EXAMPLES} FF_BUILD_INFERENCE=${FF_BUILD_INFERENCE} FF_BUILD_UNIT_TESTS=${FF_BUILD_UNIT_TESTS} 
FF_USE_PREBUILT_NCCL=${FF_USE_PREBUILT_NCCL} FF_USE_PREBUILT_LEGION=${FF_USE_PREBUILT_LEGION} FF_USE_ALL_PREBUILT_LIBRARIES=${FF_USE_ALL_PREBUILT_LIBRARIES} FF_USE_AVX2=${FF_USE_AVX2} FF_MAX_DIM=${FF_MAX_DIM} ROCM_PATH=${ROCM_PATH} FF_GPU_BACKEND=${FF_GPU_BACKEND} INSTALL_DIR=${INSTALL_DIR}" } if [[ -n "$1" && ( "$1" == "CMAKE_FLAGS" || "$1" == "CUDA_PATH" ) ]]; then diff --git a/spack/package.py b/spack/package.py index 273cb30951..12ff294e94 100644 --- a/spack/package.py +++ b/spack/package.py @@ -91,9 +91,9 @@ def cmake_args(self): options.append('-DFF_USE_NCCL=OFF') if '+examples' in spec: - options.append('-DFF_BUILD_ALL_EXAMPLES=ON') + options.append('-DFF_BUILD_TRAINING_EXAMPLES=ON') else: - options.append('-DFF_BUILD_ALL_EXAMPLES=OFF') + options.append('-DFF_BUILD_TRAINING_EXAMPLES=OFF') if '+avx2' in spec: options.append('-DFF_USE_AVX2=ON') diff --git a/src/c/flexflow_c.cc b/src/c/flexflow_c.cc index e39cb29037..532dd00198 100644 --- a/src/c/flexflow_c.cc +++ b/src/c/flexflow_c.cc @@ -16,7 +16,9 @@ #include "flexflow/flexflow_c.h" #include "flexflow/dataloader.h" #include "flexflow/mapper.h" +#ifdef FF_BUILD_INFERENCE #include "flexflow/request_manager.h" +#endif #include "flexflow/utils/file_loader.h" using namespace Legion; @@ -58,6 +60,7 @@ class FFCObjectWrapper { FF_NEW_OPAQUE_WRAPPER(flexflow_dlrm_config_t, DLRMConfig *); FF_NEW_OPAQUE_WRAPPER(flexflow_single_dataloader_t, SingleDataLoader *); // inference +#ifdef FF_BUILD_INFERENCE FF_NEW_OPAQUE_WRAPPER(flexflow_batch_config_t, BatchConfig *); FF_NEW_OPAQUE_WRAPPER(flexflow_tree_verify_batch_config_t, TreeVerifyBatchConfig *); @@ -74,6 +77,7 @@ class FFCObjectWrapper { // LoraAdamOptimizerConfig *); FF_NEW_OPAQUE_WRAPPER(flexflow_lora_linear_config_t, LoraLinearConfig *); FF_NEW_OPAQUE_WRAPPER(flexflow_peft_model_id_t, PEFTModelID *); +#endif }; Logger ffc_log("flexflow_c"); @@ -1549,6 +1553,7 @@ flexflow_tensor_t flexflow_model_add_argmax(flexflow_model_t handle_, return FFCObjectWrapper::wrap(tensor); } +#ifdef FF_BUILD_INFERENCE flexflow_peft_model_id_t flexflow_model_add_lora_layer( flexflow_model_t handle_, const flexflow_lora_linear_config_t peft_config_) { @@ -1563,6 +1568,7 @@ flexflow_peft_model_id_t flexflow_model_add_lora_layer( peft_model_id); return FFCObjectWrapper::wrap(peft_model_id); } +#endif void flexflow_model_set_sgd_optimizer(flexflow_model_t handle_, flexflow_sgd_optimizer_t optimizer_) { @@ -1617,6 +1623,7 @@ void flexflow_model_set_transformer_layer_id(flexflow_model_t handle_, int id) { handle->set_transformer_layer_id(id); } +#ifdef FF_BUILD_INFERENCE void flexflow_model_generate(flexflow_model_t handle_, int num_requests, enum RequestType *request_types, @@ -1697,6 +1704,7 @@ void flexflow_model_generate(flexflow_model_t handle_, } } } +#endif void flexflow_model_set_position_offset(flexflow_model_t handle_, int const offset) { @@ -2584,6 +2592,8 @@ void flexflow_perform_registration(void) { true /*global*/); } +#ifdef FF_BUILD_INFERENCE + // ----------------------------------------------------------------------- // BatchConfig // ----------------------------------------------------------------------- @@ -3052,3 +3062,5 @@ void flexflow_peft_model_id_destroy(flexflow_peft_model_id_t handle_) { DEBUG_PRINT("[PEFTModelID] delete %p", peft_model_id); delete peft_model_id; } + +#endif diff --git a/src/ops/beam_topk.cu b/src/ops/beam_topk.cu index bf4c23cad0..a7aee338e4 100644 --- a/src/ops/beam_topk.cu +++ b/src/ops/beam_topk.cu @@ -15,7 +15,7 @@ #include "flexflow/ffconst_utils.h" #include 
"flexflow/ops/beam_topk.h" -#include "flexflow/request_manager.h" +// #include "flexflow/request_manager.h" #include "flexflow/utils/cuda_helper.h" namespace FlexFlow { diff --git a/src/runtime/model.cc b/src/runtime/model.cc index ceb9277b76..5213633e73 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -71,7 +71,9 @@ #include "flexflow/parallel_ops/partition.h" #include "flexflow/parallel_ops/reduction.h" #include "flexflow/parallel_ops/replicate.h" +#ifdef FF_BUILD_INFERENCE #include "flexflow/request_manager.h" +#endif #include "flexflow/substitution.h" #include "flexflow/utils/random_utils.h" #include "flexflow/utils/test_utils.h" @@ -4684,6 +4686,7 @@ void register_flexflow_internal_tasks(Runtime *runtime, registrar); } } +#ifdef FF_BUILD_INFERENCE // RequestManager load_tokens { TaskVariantRegistrar registrar(RM_LOAD_TOKENS_TASK_ID, @@ -4837,6 +4840,7 @@ void register_flexflow_internal_tasks(Runtime *runtime, registrar); } } +#endif // ElementUnary task { TaskVariantRegistrar registrar(ELEMENTUNARY_INIT_TASK_ID, From ca3dabf7d23cf2173fca830249c4cb9eeb6171bf Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Sat, 5 Oct 2024 11:36:34 -0700 Subject: [PATCH 3/3] [AllReduce] make AllReduce tasks concurrent in FlexFlow (#1517) * minor bug fix * make AllReduce tasks concurrent * set concurrent=true for remaining operators --------- Co-authored-by: Gabriele Oliaro --- src/ops/fused.cc | 6 ++++++ src/ops/lora_linear.cc | 2 ++ src/parallel_ops/allreduce.cc | 5 +++++ src/parallel_ops/parallel_identity.cc | 4 ++++ src/runtime/model.cc | 23 +++++++++++++++++++++++ 5 files changed, 40 insertions(+) diff --git a/src/ops/fused.cc b/src/ops/fused.cc index 121139beb1..720d678a4a 100644 --- a/src/ops/fused.cc +++ b/src/ops/fused.cc @@ -476,6 +476,7 @@ void FusedOp::init(FFModel const &ff) { false /*must*/, 0 /*mapper_id*/, outputs[0]->machine_view.hash()); + launcher.concurrent = true; FutureMap fm = runtime->execute_index_space(ctx, launcher); fm.wait_all_results(); switch (domain.get_dim()) { @@ -570,6 +571,7 @@ void FusedOp::init_inference(FFModel const &ff, false /*must*/, 0 /*mapper_id*/, machine_view_hash); + launcher.concurrent = true; FutureMap fm = runtime->execute_index_space(ctx, launcher); fm.wait_all_results(); switch (domain.get_dim()) { @@ -604,6 +606,7 @@ void FusedOp::forward(FFModel const &ff) { false /*must*/, 0 /*mapper_id*/, outputs[0]->machine_view.hash()); + launcher.concurrent = true; int offset = 0; for (int i = 0; i < numInputs; i++) { assert(inputs[i]->part != LogicalPartition::NO_PART); @@ -659,6 +662,7 @@ FutureMap FusedOp::inference(FFModel const &ff, false /*must*/, 0 /*mapper_id*/, machine_view_hash); + launcher.concurrent = true; launcher.add_future(bc); int offset = 0; for (int i = 0; i < numInputs; i++) { @@ -735,6 +739,7 @@ FutureMap FusedOp::peft_bwd(FFModel const &ff, false /*must*/, 0 /*mapper_id*/, machine_view_hash); + launcher.concurrent = true; launcher.add_future(bc); int offset = 0; for (int i = 0; i < numInputs; i++) { @@ -787,6 +792,7 @@ void FusedOp::backward(FFModel const &ff) { false /*must*/, 0 /*mapper_id*/, outputs[0]->machine_view.hash()); + launcher.concurrent = true; int idx = 0; for (int i = 0; i < numInputs; i++) { launcher.add_region_requirement(RegionRequirement(inputs[i]->part, diff --git a/src/ops/lora_linear.cc b/src/ops/lora_linear.cc index fde6bc2b28..513147f3b7 100644 --- a/src/ops/lora_linear.cc +++ b/src/ops/lora_linear.cc @@ -296,6 +296,7 @@ void LoraLinear::init_inference( false /*must*/, 0 /*mapper_id*/, 
machine_view_hash); + launcher.concurrent = true; launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, 0 /*projection id*/, READ_ONLY, @@ -795,6 +796,7 @@ FutureMap LoraLinear::peft_bwd(FFModel const &ff, false /*must*/, 0 /*mapper_id*/, machine_view_hash); + launcher.concurrent = true; launcher.add_future(bc); launcher.add_region_requirement( RegionRequirement(batch_inputs[0]->part_grad, diff --git a/src/parallel_ops/allreduce.cc b/src/parallel_ops/allreduce.cc index 52c4ec2e28..dc43d80133 100644 --- a/src/parallel_ops/allreduce.cc +++ b/src/parallel_ops/allreduce.cc @@ -131,6 +131,7 @@ void AllReduce::init(FFModel const &ff) { false /*must*/, 0 /*mapper_id*/, outputs[0]->machine_view.hash()); + launcher.concurrent = true; launcher.add_region_requirement(RegionRequirement(inputs[0]->part, 0 /*projection id*/, READ_ONLY, @@ -164,6 +165,7 @@ void AllReduce::forward(FFModel const &ff) { false /*must*/, 0 /*mapper_id*/, outputs[0]->machine_view.hash()); + launcher.concurrent = true; launcher.add_region_requirement(RegionRequirement(inputs[0]->part, 0 /*projection id*/, READ_ONLY, @@ -212,6 +214,7 @@ void AllReduce::backward(FFModel const &ff) { false /*must*/, 0 /*mapper_id*/, inputs[0]->machine_view.hash()); + // launcher.concurrent = true; launcher.add_region_requirement(RegionRequirement(inputs[0]->part_grad, 0 /*projection id*/, READ_WRITE, @@ -265,6 +268,7 @@ void AllReduce::init_inference(FFModel const &ff, false /*must*/, 0 /*mapper_id*/, machine_view_hash); + launcher.concurrent = true; launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, 0 /*projection id*/, READ_ONLY, @@ -306,6 +310,7 @@ FutureMap AllReduce::inference(FFModel const &ff, false /*must*/, 0 /*mapper_id*/, machine_view_hash); + launcher.concurrent = true; launcher.add_future(bc); launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, 0 /*projection id*/, diff --git a/src/parallel_ops/parallel_identity.cc b/src/parallel_ops/parallel_identity.cc index 883910ae09..7d68036709 100644 --- a/src/parallel_ops/parallel_identity.cc +++ b/src/parallel_ops/parallel_identity.cc @@ -133,6 +133,7 @@ void ParallelIdentity::init(FFModel const &ff) { false /*must*/, 0 /*mapper_id*/, outputs[0]->machine_view.hash()); + launcher.concurrent = true; launcher.add_region_requirement(RegionRequirement(inputs[0]->part, 0 /*projection id*/, READ_ONLY, @@ -214,6 +215,7 @@ void ParallelIdentity::backward(FFModel const &ff) { false /*must*/, 0 /*mapper_id*/, inputs[0]->machine_view.hash()); + launcher.concurrent = true; launcher.add_region_requirement(RegionRequirement(inputs[0]->part_grad, 0 /*projection id*/, READ_WRITE, @@ -268,6 +270,7 @@ void ParallelIdentity::init_inference( false /*must*/, 0 /*mapper_id*/, machine_view_hash); + launcher.concurrent = true; launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, 0 /*projection id*/, READ_ONLY, @@ -381,6 +384,7 @@ FutureMap false /*must*/, 0 /*mapper_id*/, machine_view_hash); + launcher.concurrent = true; launcher.add_future(bc); launcher.add_region_requirement( RegionRequirement(batch_inputs[0]->part_grad, diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 5213633e73..52f1dd2220 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -6888,6 +6888,7 @@ void register_flexflow_internal_tasks(Runtime *runtime, TaskVariantRegistrar registrar(LORA_LINEAR_INIT_TASK_ID, "LoraLinear Init"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); + registrar.set_concurrent(); 
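Aside: the `registrar.set_concurrent()` and `launcher.concurrent = true` lines added throughout this series all encode the requirement stated in the comments further down — tasks that issue NCCL collectives such as `ncclAllReduce` must be running on every participating GPU at the same time, otherwise the collective never completes. Below is a minimal standalone sketch of that requirement using only the public NCCL API in a single multi-GPU process; it is illustrative only (buffer size and names are arbitrary, error handling omitted) and is not FlexFlow code.

```cpp
#include <nccl.h>
#include <cuda_runtime.h>
#include <cstdio>
#include <vector>

// Sketch: an all-reduce only finishes once *every* rank has issued it.
// If a scheduler serialized these per-GPU launches (the situation the
// concurrent flags above are meant to rule out), the first ncclAllReduce
// would block forever waiting for its peers.
int main() {
  int ndev = 0;
  cudaGetDeviceCount(&ndev);

  std::vector<ncclComm_t> comms(ndev);
  ncclCommInitAll(comms.data(), ndev, nullptr);  // one communicator per GPU

  const size_t count = 1 << 20;                  // illustrative element count
  std::vector<float *> buf(ndev);
  std::vector<cudaStream_t> streams(ndev);
  for (int i = 0; i < ndev; i++) {
    cudaSetDevice(i);
    cudaMalloc(&buf[i], count * sizeof(float));
    cudaStreamCreate(&streams[i]);
  }

  // All ranks must reach this collective "concurrently".
  ncclGroupStart();
  for (int i = 0; i < ndev; i++) {
    cudaSetDevice(i);
    ncclAllReduce(buf[i], buf[i], count, ncclFloat, ncclSum,
                  comms[i], streams[i]);
  }
  ncclGroupEnd();

  for (int i = 0; i < ndev; i++) {
    cudaSetDevice(i);
    cudaStreamSynchronize(streams[i]);
    cudaFree(buf[i]);
    ncclCommDestroy(comms[i]);
  }
  printf("all-reduce completed on %d devices\n", ndev);
  return 0;
}
```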
if (pre_register) { Runtime::preregister_task_variant( registrar, "LoraLinear Init Task"); @@ -6919,6 +6920,7 @@ void register_flexflow_internal_tasks(Runtime *runtime, "LoraLinear PEFT Backward"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); + registrar.set_concurrent(); if (pre_register) { Runtime::preregister_task_variant( registrar, "LoraLinear PEFT Backward Task"); @@ -6950,6 +6952,7 @@ void register_flexflow_internal_tasks(Runtime *runtime, TaskVariantRegistrar registrar(FUSEDOP_INIT_TASK_ID, "FusedOp Init"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); + registrar.set_concurrent(); if (pre_register) { Runtime::preregister_task_variant( registrar, "FusedOp Init Task"); @@ -6964,6 +6967,7 @@ void register_flexflow_internal_tasks(Runtime *runtime, TaskVariantRegistrar registrar(FUSEDOP_INF_TASK_ID, "FusedOp Inference"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); + registrar.set_concurrent(); if (pre_register) { Runtime::preregister_task_variant( registrar, "FusedOp Inference Task"); @@ -6979,6 +6983,7 @@ void register_flexflow_internal_tasks(Runtime *runtime, "FusedOp PEFT Backward"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); + registrar.set_concurrent(); if (pre_register) { Runtime::preregister_task_variant( registrar, "FusedOp PEFT Backward Task"); @@ -6994,6 +6999,7 @@ void register_flexflow_internal_tasks(Runtime *runtime, TaskVariantRegistrar registrar(FUSEDOP_FWD_TASK_ID, "FusedOp Forward"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); + registrar.set_concurrent(); if (pre_register) { Runtime::preregister_task_variant( registrar, "FusedOp Forward Task"); @@ -7008,6 +7014,7 @@ void register_flexflow_internal_tasks(Runtime *runtime, TaskVariantRegistrar registrar(FUSEDOP_BWD_TASK_ID, "FusedOp Backward"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); + registrar.set_concurrent(); if (pre_register) { Runtime::preregister_task_variant( registrar, "FusedOp Backward Task"); @@ -7244,6 +7251,7 @@ void register_flexflow_internal_tasks(Runtime *runtime, TaskVariantRegistrar registrar(ALLREDUCE_INIT_TASK_ID, "AllReduce Init"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); + registrar.set_concurrent(); if (pre_register) { Runtime::preregister_task_variant( registrar, "AllReduce init Task"); @@ -7258,6 +7266,9 @@ void register_flexflow_internal_tasks(Runtime *runtime, TaskVariantRegistrar registrar(ALLREDUCE_FWD_TASK_ID, "AllReduce Forward"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); + // AllReduce forward and backward must run concurrently since they + // use ncclAllReduce internally + registrar.set_concurrent(); if (pre_register) { Runtime::preregister_task_variant( registrar, "AllReduce Forward Task"); @@ -7272,6 +7283,9 @@ void register_flexflow_internal_tasks(Runtime *runtime, TaskVariantRegistrar registrar(ALLREDUCE_BWD_TASK_ID, "AllReduce Backward"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); + // AllReduce forward and backward must run concurrently since they + // use ncclAllReduce internally + // registrar.set_concurrent(); if (pre_register) { Runtime::preregister_task_variant( registrar, "AllReduce Backward Task"); @@ -7287,6 +7301,9 @@ void 
register_flexflow_internal_tasks(Runtime *runtime, "AllReduce Inference"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); + // AllReduce forward and backward must run concurrently since they + // use ncclAllReduce internally + registrar.set_concurrent(); if (pre_register) { Runtime::preregister_task_variant( registrar, "AllReduce Inference Task"); @@ -7302,6 +7319,9 @@ void register_flexflow_internal_tasks(Runtime *runtime, "AllReduce PEFT Backward"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); + // AllReduce forward and backward must run concurrently since they + // use ncclAllReduce internally + // registrar.set_concurrent(); if (pre_register) { Runtime::preregister_task_variant( registrar, "AllReduce PEFT Backward Task"); @@ -7318,6 +7338,7 @@ void register_flexflow_internal_tasks(Runtime *runtime, "ParallelIdentity Init"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); + registrar.set_concurrent(); if (pre_register) { Runtime::preregister_task_variant( registrar, "ParallelIdentity init Task"); @@ -7349,6 +7370,7 @@ void register_flexflow_internal_tasks(Runtime *runtime, "ParallelIdentity Backward"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); + registrar.set_concurrent(); if (pre_register) { Runtime::preregister_task_variant( registrar, "ParallelIdentity Backward Task"); @@ -7381,6 +7403,7 @@ void register_flexflow_internal_tasks(Runtime *runtime, "ParallelIdentity PEFT Backward"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); + registrar.set_concurrent(); if (pre_register) { Runtime::preregister_task_variant( registrar, "ParallelIdentity PEFT Backward Task");