From 64c258f3b43e19025889d728799d2bdedde9f732 Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Fri, 27 Sep 2024 11:59:38 -0700 Subject: [PATCH 1/3] [FusedOp] Fix segment fault (#1511) * minor bug fix * fix --- src/ops/fused.cu | 69 ++++++++++++++++++++++++------------------------ 1 file changed, 35 insertions(+), 34 deletions(-) diff --git a/src/ops/fused.cu b/src/ops/fused.cu index cab28181da..8f1212beb4 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -1678,77 +1678,77 @@ __host__ void FusedOp::backward_task(Task const *task, int sum = fused->numInputs + fused->numWeights + fused->numOutputs; assert(sum * 2 == (int)regions.size()); } - GenericTensorAccessorR input_accessor[MAX_NUM_INPUTS]; - GenericTensorAccessorW input_grad_accessor[MAX_NUM_INPUTS]; - GenericTensorAccessorR weight_accessor[MAX_NUM_WEIGHTS]; - GenericTensorAccessorW weight_grad_accessor[MAX_NUM_WEIGHTS]; - GenericTensorAccessorR output_accessor[MAX_NUM_OUTPUTS]; - GenericTensorAccessorW output_grad_accessor[MAX_NUM_OUTPUTS]; + std::vector input_accessor; + std::vector input_grad_accessor; + std::vector weight_accessor; + std::vector weight_grad_accessor; + std::vector output_accessor; + std::vector output_grad_accessor; int roff = 0; assert(fused->numInputs <= MAX_NUM_INPUTS); for (int i = 0; i < fused->numInputs; i++) { - input_accessor[i] = + input_accessor.push_back( helperGetGenericTensorAccessorRO(fused->input_data_types[i], regions[i], task->regions[i], FID_DATA, ctx, - runtime); + runtime)); } roff += fused->numInputs; assert(fused->numWeights <= MAX_NUM_WEIGHTS); for (int i = 0; i < fused->numWeights; i++) { - weight_accessor[i] = + weight_accessor.push_back( helperGetGenericTensorAccessorRO(fused->weight_data_types[i], regions[i + roff], task->regions[i + roff], FID_DATA, ctx, - runtime); + runtime)); } roff += fused->numWeights; assert(fused->numOutputs <= MAX_NUM_OUTPUTS); for (int i = 0; i < fused->numOutputs; i++) { - output_accessor[i] = + output_accessor.push_back( helperGetGenericTensorAccessorRO(fused->output_data_types[i], regions[i + roff], task->regions[i + roff], FID_DATA, ctx, - runtime); + runtime)); } roff += fused->numOutputs; for (int i = 0; i < fused->numInputs; i++) { - input_grad_accessor[i] = + input_grad_accessor.push_back( helperGetGenericTensorAccessorRW(fused->input_data_types[i], regions[i + roff], task->regions[i + roff], FID_DATA, ctx, - runtime); + runtime)); assert(input_grad_accessor[i].domain == input_accessor[i].domain); } roff += fused->numInputs; for (int i = 0; i < fused->numWeights; i++) { - weight_grad_accessor[i] = + weight_grad_accessor.push_back( helperGetGenericTensorAccessorRW(fused->weight_data_types[i], regions[i + roff], task->regions[i + roff], FID_DATA, ctx, - runtime); + runtime)); assert(weight_grad_accessor[i].domain.get_volume() == weight_accessor[i].domain.get_volume()); } roff += fused->numWeights; for (int i = 0; i < fused->numOutputs; i++) { - output_grad_accessor[i] = + output_grad_accessor.push_back( helperGetGenericTensorAccessorRW(fused->output_data_types[i], regions[i + roff], task->regions[i + roff], FID_DATA, ctx, - runtime); + runtime)); assert(output_grad_accessor[i].domain == output_accessor[i].domain); } roff += fused->numOutputs; @@ -1767,12 +1767,6 @@ __host__ void FusedOp::backward_task(Task const *task, } int ioff = 0, woff = 0, ooff = 0; - GenericTensorAccessorR my_input_accessor[MAX_NUM_INPUTS]; - GenericTensorAccessorR my_weight_accessor[MAX_NUM_WEIGHTS]; - GenericTensorAccessorR my_output_accessor[MAX_NUM_OUTPUTS]; - 
GenericTensorAccessorW my_input_grad_accessor[MAX_NUM_INPUTS]; - GenericTensorAccessorW my_weight_grad_accessor[MAX_NUM_WEIGHTS]; - GenericTensorAccessorW my_output_grad_accessor[MAX_NUM_OUTPUTS]; // Do backpropagation in the reverse ordering for (int op = 0; op < fused->numOperators; op++) { ioff += fused->op_num_inputs[op]; @@ -1781,18 +1775,24 @@ __host__ void FusedOp::backward_task(Task const *task, } for (int op = fused->numOperators - 1; op >= 0; op--) { + std::vector my_input_accessor; + std::vector my_weight_accessor; + std::vector my_output_accessor; + std::vector my_input_grad_accessor; + std::vector my_weight_grad_accessor; + std::vector my_output_grad_accessor; ioff -= fused->op_num_inputs[op]; woff -= fused->op_num_weights[op]; ooff -= fused->op_num_outputs[op]; for (int i = 0; i < fused->op_num_inputs[op]; i++) { int my_off = fused->op_input_idx[i + ioff]; if (fused->op_input_source[i + ioff] == SOURCE_INPUT) { - my_input_accessor[i] = input_accessor[my_off]; - my_input_grad_accessor[i] = input_grad_accessor[my_off]; + my_input_accessor.push_back(input_accessor[my_off]); + my_input_grad_accessor.push_back(input_grad_accessor[my_off]); assert(my_input_grad_accessor[i].domain == my_input_accessor[i].domain); } else if (fused->op_input_source[i + ioff] == SOURCE_OUTPUT) { - my_input_accessor[i] = output_accessor[my_off]; - my_input_grad_accessor[i] = output_grad_accessor[my_off]; + my_input_accessor.push_back(output_accessor[my_off]); + my_input_grad_accessor.push_back(output_grad_accessor[my_off]); assert(my_input_grad_accessor[i].domain == my_input_accessor[i].domain); } else { assert(false); @@ -1800,17 +1800,18 @@ __host__ void FusedOp::backward_task(Task const *task, } for (int i = 0; i < fused->op_num_weights[op]; i++) { assert(fused->op_weight_source[i + woff] == SOURCE_WEIGHT); - my_weight_accessor[i] = weight_accessor[fused->op_weight_idx[i + woff]]; - my_weight_grad_accessor[i] = - weight_grad_accessor[fused->op_weight_idx[i + woff]]; + my_weight_accessor.push_back( + weight_accessor[fused->op_weight_idx[i + woff]]); + my_weight_grad_accessor.push_back( + weight_grad_accessor[fused->op_weight_idx[i + woff]]); assert(my_weight_grad_accessor[i].domain.get_volume() == my_weight_accessor[i].domain.get_volume()); } for (int i = 0; i < fused->op_num_outputs[op]; i++) { assert(fused->op_output_source[i + ooff] == SOURCE_OUTPUT); int my_off = fused->op_output_idx[i + ooff]; - my_output_accessor[i] = output_accessor[my_off]; - my_output_grad_accessor[i] = output_grad_accessor[my_off]; + my_output_accessor.push_back(output_accessor[my_off]); + my_output_grad_accessor.push_back(output_grad_accessor[my_off]); assert(my_output_grad_accessor[i].domain == my_output_accessor[i].domain); } switch (fused->op_op_type[op]) { @@ -1880,7 +1881,7 @@ __host__ void FusedOp::backward_task(Task const *task, int num_inputs = fused->op_num_inputs[op]; Kernels::Concat::backward_kernel_wrapper(m, my_output_grad_accessor[0], - my_input_grad_accessor, + my_input_grad_accessor.data(), num_inputs, m->legion_axis); break; From c78cf04d348aa242c891c783e880e90806c88344 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Tue, 1 Oct 2024 20:03:18 -0700 Subject: [PATCH 2/3] enable disabling inference (#1516) --- .github/workflows/build.yml | 12 ++-- .github/workflows/gpu-ci.yml | 6 +- CMakeLists.txt | 105 ++++++++++++----------------------- config/config.inc | 20 +++---- config/config.linux | 6 +- spack/package.py | 4 +- src/c/flexflow_c.cc | 12 ++++ src/ops/beam_topk.cu | 2 +- src/runtime/model.cc | 4 ++ 9 
files changed, 77 insertions(+), 94 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index ef5961bc87..63e0b9037a 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -79,13 +79,13 @@ jobs: export FF_CUDA_ARCH=70 export FF_HIP_ARCH=gfx1100,gfx1036 export hip_version=5.6 - export FF_BUILD_ALL_INFERENCE_EXAMPLES=ON + export FF_BUILD_INFERENCE=ON if [[ "${FF_GPU_BACKEND}" == "cuda" ]]; then - export FF_BUILD_ALL_EXAMPLES=ON + export FF_BUILD_TRAINING_EXAMPLES=ON export FF_BUILD_UNIT_TESTS=ON else - export FF_BUILD_ALL_EXAMPLES=OFF + export FF_BUILD_TRAINING_EXAMPLES=OFF export FF_BUILD_UNIT_TESTS=OFF fi @@ -106,13 +106,13 @@ jobs: export FF_CUDA_ARCH=70 export FF_HIP_ARCH=gfx1100,gfx1036 export hip_version=5.6 - export FF_BUILD_ALL_INFERENCE_EXAMPLES=ON + export FF_BUILD_INFERENCE=ON if [[ "${FF_GPU_BACKEND}" == "cuda" ]]; then - export FF_BUILD_ALL_EXAMPLES=ON + export FF_BUILD_TRAINING_EXAMPLES=ON export FF_BUILD_UNIT_TESTS=ON else - export FF_BUILD_ALL_EXAMPLES=OFF + export FF_BUILD_TRAINING_EXAMPLES=OFF export FF_BUILD_UNIT_TESTS=OFF fi diff --git a/.github/workflows/gpu-ci.yml b/.github/workflows/gpu-ci.yml index 00ca2df603..6ca50027d1 100644 --- a/.github/workflows/gpu-ci.yml +++ b/.github/workflows/gpu-ci.yml @@ -175,7 +175,7 @@ jobs: export PATH=$CONDA_PREFIX/bin:$PATH export FF_HOME=$(pwd) export FF_USE_PREBUILT_LEGION=OFF #remove this after fixing python path issue in Legion - export FF_BUILD_ALL_INFERENCE_EXAMPLES=ON + export FF_BUILD_INFERENCE=ON mkdir build cd build ../config/config.linux @@ -262,8 +262,8 @@ jobs: run: | export PATH=$CONDA_PREFIX/bin:$PATH export FF_HOME=$(pwd) - export FF_BUILD_ALL_EXAMPLES=ON - export FF_BUILD_ALL_INFERENCE_EXAMPLES=ON + export FF_BUILD_TRAINING_EXAMPLES=ON + export FF_BUILD_INFERENCE=ON export FF_USE_PREBUILT_LEGION=OFF #remove this after fixing python path issue in Legion pip install . --verbose diff --git a/CMakeLists.txt b/CMakeLists.txt index f06969ae04..4e24e1e54b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -181,6 +181,14 @@ include(legion) # Not build FlexFlow if BUILD_LEGION_ONLY is ON if(NOT BUILD_LEGION_ONLY) + + # build binary options + option(FF_BUILD_INFERENCE "build all inference code and examples." ON) + option(FF_BUILD_TRAINING_EXAMPLES "build all training examples." 
OFF) + option(FF_BUILD_UNIT_TESTS "build non-operator unit tests" OFF) + option(FF_BUILD_SUBSTITUTION_TOOL "build substitution conversion tool" OFF) + option(FF_BUILD_VISUALIZATION_TOOL "build substitution visualization tool" OFF) + # NCCL if(FF_USE_NCCL) if(FF_GPU_BACKEND STREQUAL "hip_cuda" OR FF_GPU_BACKEND STREQUAL "cuda") @@ -271,18 +279,23 @@ if(NOT BUILD_LEGION_ONLY) file(GLOB_RECURSE FLEXFLOW_HDR LIST_DIRECTORIES False ${FLEXFLOW_ROOT}/include/*.h) - - #list(APPEND FLEXFLOW_HDR ${FLEXFLOW_ROOT}/inference/file_loader.h) file(GLOB_RECURSE FLEXFLOW_SRC LIST_DIRECTORIES False ${FLEXFLOW_ROOT}/src/*.cc) - list(REMOVE_ITEM FLEXFLOW_SRC "${FLEXFLOW_ROOT}/src/runtime/cpp_driver.cc") - #list(APPEND FLEXFLOW_SRC ${FLEXFLOW_ROOT}/inference/file_loader.cc) - set(FLEXFLOW_CPP_DRV_SRC - ${FLEXFLOW_ROOT}/src/runtime/cpp_driver.cc) + # exclude inference files if FF_BUILD_INFERENCE is off + if(NOT FF_BUILD_INFERENCE) + list(REMOVE_ITEM FLEXFLOW_HDR "${FLEXFLOW_ROOT}/include/request_manager.h") + list(REMOVE_ITEM FLEXFLOW_SRC "${FLEXFLOW_ROOT}/src/runtime/request_manager.cc") + list(REMOVE_ITEM FLEXFLOW_SRC "${FLEXFLOW_ROOT}/src/runtime/inference_manager.cc") + list(REMOVE_ITEM FLEXFLOW_SRC "${FLEXFLOW_ROOT}/src/runtime/batch_config.cc") + list(REMOVE_ITEM FLEXFLOW_SRC "${FLEXFLOW_ROOT}/src/runtime/beam_search_batch_config.cc") + list(REMOVE_ITEM FLEXFLOW_SRC "${FLEXFLOW_ROOT}/src/runtime/tree_verify_batch_config.cc") + endif() + + set(FLEXFLOW_CPP_DRV_SRC ${FLEXFLOW_ROOT}/src/runtime/cpp_driver.cc) add_library(substitution_loader SHARED ${FLEXFLOW_ROOT}/src/runtime/substitution_loader.cc) @@ -297,6 +310,10 @@ if(NOT BUILD_LEGION_ONLY) file(GLOB_RECURSE FLEXFLOW_GPU_SRC LIST_DIRECTORIES False ${FLEXFLOW_ROOT}/src/*.cu) + + if(NOT FF_BUILD_INFERENCE) + list(REMOVE_ITEM FLEXFLOW_GPU_SRC "${FLEXFLOW_ROOT}/src/runtime/request_manager.cu") + endif() add_compile_definitions(FF_USE_CUDA) @@ -452,27 +469,6 @@ if(NOT BUILD_LEGION_ONLY) set_property(TARGET flexflow PROPERTY CXX_STANDARD 14) endif() - # build binary - option(FF_BUILD_TOKENIZER "build tokenizer=cpp for LLM serving" OFF) - option(FF_BUILD_RESNET "build resnet example" OFF) - option(FF_BUILD_RESNEXT "build resnext example" OFF) - option(FF_BUILD_ALEXNET "build alexnet example" OFF) - option(FF_BUILD_DLRM "build DLRM example" OFF) - option(FF_BUILD_XDL "build XDL example" OFF) - option(FF_BUILD_INCEPTION "build inception example" OFF) - option(FF_BUILD_CANDLE_UNO "build candle uno example" OFF) - option(FF_BUILD_TRANSFORMER "build transformer example" OFF) - option(FF_BUILD_MOE "build mixture of experts example" OFF) - option(FF_BUILD_MLP_UNIFY "build mlp unify example" OFF) - option(FF_BUILD_SPLIT_TEST "build split test example" OFF) - option(FF_BUILD_SPLIT_TEST_2 "build split test 2 example" OFF) - option(FF_BUILD_MLP_UNIFY_INFERENCE "build mlp unify inference example" OFF) - option(FF_BUILD_ALL_INFERENCE_EXAMPLES "build all inference examples. Overrides others" OFF) - option(FF_BUILD_ALL_EXAMPLES "build all examples. 
Overrides others" OFF) - option(FF_BUILD_UNIT_TESTS "build non-operator unit tests" OFF) - option(FF_BUILD_SUBSTITUTION_TOOL "build substitution conversion tool" OFF) - option(FF_BUILD_VISUALIZATION_TOOL "build substitution visualization tool" OFF) - if(FF_BUILD_UNIT_TESTS) set(BUILD_GMOCK OFF) add_subdirectory(deps/googletest) @@ -488,89 +484,60 @@ if(NOT BUILD_LEGION_ONLY) add_subdirectory(tools/substitutions_to_dot) endif() - if(FF_BUILD_ALL_INFERENCE_EXAMPLES OR FF_BUILD_TOKENIZER) + if(FF_BUILD_INFERENCE) + add_compile_definitions(FF_BUILD_INFERENCE) # Ensure Rust is installed execute_process(COMMAND rustc --version RESULT_VARIABLE RUST_COMMAND_RESULT OUTPUT_VARIABLE RUSTC_OUTPUT ERROR_QUIET) if(NOT RUST_COMMAND_RESULT EQUAL 0) - message(FATAL_ERROR "Rust is not installed on the system. Please install it by running: 'curl https://sh.rustup.rs -sSf | sh -s -- -y' and following the instructions on the screen.") + message(FATAL_ERROR + "Rust is not installed on the system. Please install it by running: \n" + "'curl https://sh.rustup.rs -sSf | sh -s -- -y' \n" + "and follow the instructions on the screen.") endif() # Ensure Cargo is installed execute_process(COMMAND cargo --version RESULT_VARIABLE CARGO_RESULT OUTPUT_QUIET ERROR_QUIET) if(NOT CARGO_RESULT EQUAL 0) - message(FATAL_ERROR "Rust is installed, but cargo is not. Please install it by running: 'curl https://sh.rustup.rs -sSf | sh -s -- -y' and following the instructions on the screen.") + message(FATAL_ERROR + "Rust is installed, but cargo is not. Please install it by running: \n" + "'curl https://sh.rustup.rs -sSf | sh -s -- -y' \n" + "and follow the instructions on the screen.") endif() set(MLC_ENABLE_SENTENCEPIECE_TOKENIZER ON) add_subdirectory(deps/tokenizers-cpp tokenizers EXCLUDE_FROM_ALL) target_include_directories(flexflow PUBLIC deps/tokenizers-cpp/include) target_link_libraries(flexflow tokenizers_cpp) endif() - if(FF_BUILD_RESNET OR FF_BUILD_ALL_EXAMPLES) + + if (FF_BUILD_TRAINING_EXAMPLES) add_subdirectory(examples/cpp/ResNet) - endif() - - if(FF_BUILD_RESNEXT OR FF_BUILD_ALL_EXAMPLES) add_subdirectory(examples/cpp/resnext50) - endif() - - if(FF_BUILD_ALEXNET OR FF_BUILD_ALL_EXAMPLES) add_subdirectory(examples/cpp/AlexNet) - endif() - - if(FF_BUILD_MLP_UNIFY OR FF_BUILD_ALL_EXAMPLES) add_subdirectory(examples/cpp/MLP_Unify) - endif() - - if(FF_BUILD_SPLIT_TEST OR FF_BUILD_ALL_EXAMPLES) add_subdirectory(examples/cpp/split_test) - endif() - - if(FF_BUILD_SPLIT_TEST_2 OR FF_BUILD_ALL_EXAMPLES) add_subdirectory(examples/cpp/split_test_2) - endif() - - if(FF_BUILD_INCEPTION OR FF_BUILD_ALL_EXAMPLES) add_subdirectory(examples/cpp/InceptionV3) - endif() - - #TODO: Once functional add to BUILD_ALL_EXAMPLES - if(FF_BUILD_CANDLE_UNO OR FF_BUILD_ALL_EXAMPLES) add_subdirectory(examples/cpp/candle_uno) - endif() - - if(FF_BUILD_DLRM OR FF_BUILD_ALL_EXAMPLES) add_subdirectory(examples/cpp/DLRM) - #add_executable(generate_dlrm_hetero_strategy src/runtime/dlrm_strategy_hetero.cc) #target_include_directories(generate_dlrm_hetero_strategy PUBLIC ${FLEXFLOW_INCLUDE_DIRS}) - #add_executable(generate_dlrm_strategy src/runtime/dlrm_strategy.cc) #target_include_directories(generate_dlrm_strategy PUBLIC ${FLEXFLOW_INCLUDE_DIRS}) - endif() - - if(FF_BUILD_XDL OR FF_BUILD_ALL_EXAMPLES) add_subdirectory(examples/cpp/XDL) - endif() - - if(FF_BUILD_TRANSFORMER OR FF_BUILD_ALL_EXAMPLES) add_subdirectory(examples/cpp/Transformer) - endif() - - if(FF_BUILD_MOE OR FF_BUILD_ALL_EXAMPLES) add_subdirectory(examples/cpp/mixture_of_experts) endif() - 
if(FF_BUILD_ALL_INFERENCE_EXAMPLES OR FF_BUILD_ALL_EXAMPLES) + if(FF_BUILD_INFERENCE) add_subdirectory(inference/spec_infer) add_subdirectory(inference/incr_decoding) add_subdirectory(inference/peft) endif() - # installation set(INCLUDE_DEST "include") set(LIB_DEST "lib") diff --git a/config/config.inc b/config/config.inc index 6431eaf136..011fe890fb 100644 --- a/config/config.inc +++ b/config/config.inc @@ -128,19 +128,19 @@ elif [ "$FF_LEGION_NETWORKS" = "ucx" ]; then fi # build C++ examples -if [ "$FF_BUILD_ALL_EXAMPLES" = "ON" ]; then - SET_EXAMPLES="-DFF_BUILD_ALL_EXAMPLES=ON" -elif [ "$FF_BUILD_ALL_EXAMPLES" = "OFF" ]; then - SET_EXAMPLES="-DFF_BUILD_ALL_EXAMPLES=OFF" +if [ "$FF_BUILD_TRAINING_EXAMPLES" = "ON" ]; then + SET_EXAMPLES="-DFF_BUILD_TRAINING_EXAMPLES=ON" +elif [ "$FF_BUILD_TRAINING_EXAMPLES" = "OFF" ]; then + SET_EXAMPLES="-DFF_BUILD_TRAINING_EXAMPLES=OFF" else - SET_EXAMPLES="-DFF_BUILD_ALL_EXAMPLES=ON" + SET_EXAMPLES="-DFF_BUILD_TRAINING_EXAMPLES=ON" fi -if [ "$FF_BUILD_ALL_INFERENCE_EXAMPLES" = "ON" ]; then - SET_INFERENCE_EXAMPLES="-DFF_BUILD_ALL_INFERENCE_EXAMPLES=ON" -elif [ "$FF_BUILD_ALL_INFERENCE_EXAMPLES" = "OFF" ]; then - SET_INFERENCE_EXAMPLES="-DFF_BUILD_ALL_INFERENCE_EXAMPLES=OFF" +if [ "$FF_BUILD_INFERENCE" = "ON" ]; then + SET_INFERENCE_EXAMPLES="-DFF_BUILD_INFERENCE=ON" +elif [ "$FF_BUILD_INFERENCE" = "OFF" ]; then + SET_INFERENCE_EXAMPLES="-DFF_BUILD_INFERENCE=OFF" else - SET_INFERENCE_EXAMPLES="-DFF_BUILD_ALL_INFERENCE_EXAMPLES=ON" + SET_INFERENCE_EXAMPLES="-DFF_BUILD_INFERENCE=ON" fi # enable C++ unit tests diff --git a/config/config.linux b/config/config.linux index acffc210f5..09976cfa03 100755 --- a/config/config.linux +++ b/config/config.linux @@ -65,8 +65,8 @@ FF_GASNET_CONDUIT=${FF_GASNET_CONDUIT:-ibv} UCX_DIR=${UCX_DIR:-""} # build C++ examples -FF_BUILD_ALL_EXAMPLES=${FF_BUILD_ALL_EXAMPLES:-OFF} -FF_BUILD_ALL_INFERENCE_EXAMPLES=${FF_BUILD_ALL_INFERENCE_EXAMPLES:-ON} +FF_BUILD_TRAINING_EXAMPLES=${FF_BUILD_TRAINING_EXAMPLES:-OFF} +FF_BUILD_INFERENCE=${FF_BUILD_INFERENCE:-ON} # build C++ unit tests FF_BUILD_UNIT_TESTS=${FF_BUILD_UNIT_TESTS:-OFF} @@ -108,7 +108,7 @@ fi function get_build_configs() { # Create a string with the values of the variables set in this script - BUILD_CONFIGS="FF_CUDA_ARCH=${FF_CUDA_ARCH} FF_HIP_ARCH=${FF_HIP_ARCH} CUDA_DIR=${CUDA_DIR} CUDNN_DIR=${CUDNN_DIR} CUBLAS_DIR=${CUBLAS_DIR} CURAND_DIR=${CURAND_DIR} NCCL_DIR=${NCCL_DIR} FF_USE_PYTHON=${FF_USE_PYTHON} BUILD_LEGION_ONLY=${BUILD_LEGION_ONLY} FF_GASNET_CONDUIT=${FF_GASNET_CONDUIT} UCX_DIR=${UCX_DIR} FF_LEGION_NETWORKS=${FF_LEGION_NETWORKS} FF_BUILD_ALL_EXAMPLES=${FF_BUILD_ALL_EXAMPLES} FF_BUILD_ALL_INFERENCE_EXAMPLES=${FF_BUILD_ALL_INFERENCE_EXAMPLES} FF_BUILD_UNIT_TESTS=${FF_BUILD_UNIT_TESTS} FF_USE_PREBUILT_NCCL=${FF_USE_PREBUILT_NCCL} FF_USE_PREBUILT_LEGION=${FF_USE_PREBUILT_LEGION} FF_USE_ALL_PREBUILT_LIBRARIES=${FF_USE_ALL_PREBUILT_LIBRARIES} FF_USE_AVX2=${FF_USE_AVX2} FF_MAX_DIM=${FF_MAX_DIM} ROCM_PATH=${ROCM_PATH} FF_GPU_BACKEND=${FF_GPU_BACKEND} INSTALL_DIR=${INSTALL_DIR}" + BUILD_CONFIGS="FF_CUDA_ARCH=${FF_CUDA_ARCH} FF_HIP_ARCH=${FF_HIP_ARCH} CUDA_DIR=${CUDA_DIR} CUDNN_DIR=${CUDNN_DIR} CUBLAS_DIR=${CUBLAS_DIR} CURAND_DIR=${CURAND_DIR} NCCL_DIR=${NCCL_DIR} FF_USE_PYTHON=${FF_USE_PYTHON} BUILD_LEGION_ONLY=${BUILD_LEGION_ONLY} FF_GASNET_CONDUIT=${FF_GASNET_CONDUIT} UCX_DIR=${UCX_DIR} FF_LEGION_NETWORKS=${FF_LEGION_NETWORKS} FF_BUILD_TRAINING_EXAMPLES=${FF_BUILD_TRAINING_EXAMPLES} FF_BUILD_INFERENCE=${FF_BUILD_INFERENCE} FF_BUILD_UNIT_TESTS=${FF_BUILD_UNIT_TESTS} 
FF_USE_PREBUILT_NCCL=${FF_USE_PREBUILT_NCCL} FF_USE_PREBUILT_LEGION=${FF_USE_PREBUILT_LEGION} FF_USE_ALL_PREBUILT_LIBRARIES=${FF_USE_ALL_PREBUILT_LIBRARIES} FF_USE_AVX2=${FF_USE_AVX2} FF_MAX_DIM=${FF_MAX_DIM} ROCM_PATH=${ROCM_PATH} FF_GPU_BACKEND=${FF_GPU_BACKEND} INSTALL_DIR=${INSTALL_DIR}" } if [[ -n "$1" && ( "$1" == "CMAKE_FLAGS" || "$1" == "CUDA_PATH" ) ]]; then diff --git a/spack/package.py b/spack/package.py index 273cb30951..12ff294e94 100644 --- a/spack/package.py +++ b/spack/package.py @@ -91,9 +91,9 @@ def cmake_args(self): options.append('-DFF_USE_NCCL=OFF') if '+examples' in spec: - options.append('-DFF_BUILD_ALL_EXAMPLES=ON') + options.append('-DFF_BUILD_TRAINING_EXAMPLES=ON') else: - options.append('-DFF_BUILD_ALL_EXAMPLES=OFF') + options.append('-DFF_BUILD_TRAINING_EXAMPLES=OFF') if '+avx2' in spec: options.append('-DFF_USE_AVX2=ON') diff --git a/src/c/flexflow_c.cc b/src/c/flexflow_c.cc index e39cb29037..532dd00198 100644 --- a/src/c/flexflow_c.cc +++ b/src/c/flexflow_c.cc @@ -16,7 +16,9 @@ #include "flexflow/flexflow_c.h" #include "flexflow/dataloader.h" #include "flexflow/mapper.h" +#ifdef FF_BUILD_INFERENCE #include "flexflow/request_manager.h" +#endif #include "flexflow/utils/file_loader.h" using namespace Legion; @@ -58,6 +60,7 @@ class FFCObjectWrapper { FF_NEW_OPAQUE_WRAPPER(flexflow_dlrm_config_t, DLRMConfig *); FF_NEW_OPAQUE_WRAPPER(flexflow_single_dataloader_t, SingleDataLoader *); // inference +#ifdef FF_BUILD_INFERENCE FF_NEW_OPAQUE_WRAPPER(flexflow_batch_config_t, BatchConfig *); FF_NEW_OPAQUE_WRAPPER(flexflow_tree_verify_batch_config_t, TreeVerifyBatchConfig *); @@ -74,6 +77,7 @@ class FFCObjectWrapper { // LoraAdamOptimizerConfig *); FF_NEW_OPAQUE_WRAPPER(flexflow_lora_linear_config_t, LoraLinearConfig *); FF_NEW_OPAQUE_WRAPPER(flexflow_peft_model_id_t, PEFTModelID *); +#endif }; Logger ffc_log("flexflow_c"); @@ -1549,6 +1553,7 @@ flexflow_tensor_t flexflow_model_add_argmax(flexflow_model_t handle_, return FFCObjectWrapper::wrap(tensor); } +#ifdef FF_BUILD_INFERENCE flexflow_peft_model_id_t flexflow_model_add_lora_layer( flexflow_model_t handle_, const flexflow_lora_linear_config_t peft_config_) { @@ -1563,6 +1568,7 @@ flexflow_peft_model_id_t flexflow_model_add_lora_layer( peft_model_id); return FFCObjectWrapper::wrap(peft_model_id); } +#endif void flexflow_model_set_sgd_optimizer(flexflow_model_t handle_, flexflow_sgd_optimizer_t optimizer_) { @@ -1617,6 +1623,7 @@ void flexflow_model_set_transformer_layer_id(flexflow_model_t handle_, int id) { handle->set_transformer_layer_id(id); } +#ifdef FF_BUILD_INFERENCE void flexflow_model_generate(flexflow_model_t handle_, int num_requests, enum RequestType *request_types, @@ -1697,6 +1704,7 @@ void flexflow_model_generate(flexflow_model_t handle_, } } } +#endif void flexflow_model_set_position_offset(flexflow_model_t handle_, int const offset) { @@ -2584,6 +2592,8 @@ void flexflow_perform_registration(void) { true /*global*/); } +#ifdef FF_BUILD_INFERENCE + // ----------------------------------------------------------------------- // BatchConfig // ----------------------------------------------------------------------- @@ -3052,3 +3062,5 @@ void flexflow_peft_model_id_destroy(flexflow_peft_model_id_t handle_) { DEBUG_PRINT("[PEFTModelID] delete %p", peft_model_id); delete peft_model_id; } + +#endif diff --git a/src/ops/beam_topk.cu b/src/ops/beam_topk.cu index bf4c23cad0..a7aee338e4 100644 --- a/src/ops/beam_topk.cu +++ b/src/ops/beam_topk.cu @@ -15,7 +15,7 @@ #include "flexflow/ffconst_utils.h" #include 
"flexflow/ops/beam_topk.h" -#include "flexflow/request_manager.h" +// #include "flexflow/request_manager.h" #include "flexflow/utils/cuda_helper.h" namespace FlexFlow { diff --git a/src/runtime/model.cc b/src/runtime/model.cc index ceb9277b76..5213633e73 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -71,7 +71,9 @@ #include "flexflow/parallel_ops/partition.h" #include "flexflow/parallel_ops/reduction.h" #include "flexflow/parallel_ops/replicate.h" +#ifdef FF_BUILD_INFERENCE #include "flexflow/request_manager.h" +#endif #include "flexflow/substitution.h" #include "flexflow/utils/random_utils.h" #include "flexflow/utils/test_utils.h" @@ -4684,6 +4686,7 @@ void register_flexflow_internal_tasks(Runtime *runtime, registrar); } } +#ifdef FF_BUILD_INFERENCE // RequestManager load_tokens { TaskVariantRegistrar registrar(RM_LOAD_TOKENS_TASK_ID, @@ -4837,6 +4840,7 @@ void register_flexflow_internal_tasks(Runtime *runtime, registrar); } } +#endif // ElementUnary task { TaskVariantRegistrar registrar(ELEMENTUNARY_INIT_TASK_ID, From ca3dabf7d23cf2173fca830249c4cb9eeb6171bf Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Sat, 5 Oct 2024 11:36:34 -0700 Subject: [PATCH 3/3] [AllReduce] make AllReduce tasks concurrent in FlexFlow (#1517) * minor bug fix * make AllReduce tasks concurrent * set concurrent=true for remaining operators --------- Co-authored-by: Gabriele Oliaro --- src/ops/fused.cc | 6 ++++++ src/ops/lora_linear.cc | 2 ++ src/parallel_ops/allreduce.cc | 5 +++++ src/parallel_ops/parallel_identity.cc | 4 ++++ src/runtime/model.cc | 23 +++++++++++++++++++++++ 5 files changed, 40 insertions(+) diff --git a/src/ops/fused.cc b/src/ops/fused.cc index 121139beb1..720d678a4a 100644 --- a/src/ops/fused.cc +++ b/src/ops/fused.cc @@ -476,6 +476,7 @@ void FusedOp::init(FFModel const &ff) { false /*must*/, 0 /*mapper_id*/, outputs[0]->machine_view.hash()); + launcher.concurrent = true; FutureMap fm = runtime->execute_index_space(ctx, launcher); fm.wait_all_results(); switch (domain.get_dim()) { @@ -570,6 +571,7 @@ void FusedOp::init_inference(FFModel const &ff, false /*must*/, 0 /*mapper_id*/, machine_view_hash); + launcher.concurrent = true; FutureMap fm = runtime->execute_index_space(ctx, launcher); fm.wait_all_results(); switch (domain.get_dim()) { @@ -604,6 +606,7 @@ void FusedOp::forward(FFModel const &ff) { false /*must*/, 0 /*mapper_id*/, outputs[0]->machine_view.hash()); + launcher.concurrent = true; int offset = 0; for (int i = 0; i < numInputs; i++) { assert(inputs[i]->part != LogicalPartition::NO_PART); @@ -659,6 +662,7 @@ FutureMap FusedOp::inference(FFModel const &ff, false /*must*/, 0 /*mapper_id*/, machine_view_hash); + launcher.concurrent = true; launcher.add_future(bc); int offset = 0; for (int i = 0; i < numInputs; i++) { @@ -735,6 +739,7 @@ FutureMap FusedOp::peft_bwd(FFModel const &ff, false /*must*/, 0 /*mapper_id*/, machine_view_hash); + launcher.concurrent = true; launcher.add_future(bc); int offset = 0; for (int i = 0; i < numInputs; i++) { @@ -787,6 +792,7 @@ void FusedOp::backward(FFModel const &ff) { false /*must*/, 0 /*mapper_id*/, outputs[0]->machine_view.hash()); + launcher.concurrent = true; int idx = 0; for (int i = 0; i < numInputs; i++) { launcher.add_region_requirement(RegionRequirement(inputs[i]->part, diff --git a/src/ops/lora_linear.cc b/src/ops/lora_linear.cc index fde6bc2b28..513147f3b7 100644 --- a/src/ops/lora_linear.cc +++ b/src/ops/lora_linear.cc @@ -296,6 +296,7 @@ void LoraLinear::init_inference( false /*must*/, 0 /*mapper_id*/, 
machine_view_hash); + launcher.concurrent = true; launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, 0 /*projection id*/, READ_ONLY, @@ -795,6 +796,7 @@ FutureMap LoraLinear::peft_bwd(FFModel const &ff, false /*must*/, 0 /*mapper_id*/, machine_view_hash); + launcher.concurrent = true; launcher.add_future(bc); launcher.add_region_requirement( RegionRequirement(batch_inputs[0]->part_grad, diff --git a/src/parallel_ops/allreduce.cc b/src/parallel_ops/allreduce.cc index 52c4ec2e28..dc43d80133 100644 --- a/src/parallel_ops/allreduce.cc +++ b/src/parallel_ops/allreduce.cc @@ -131,6 +131,7 @@ void AllReduce::init(FFModel const &ff) { false /*must*/, 0 /*mapper_id*/, outputs[0]->machine_view.hash()); + launcher.concurrent = true; launcher.add_region_requirement(RegionRequirement(inputs[0]->part, 0 /*projection id*/, READ_ONLY, @@ -164,6 +165,7 @@ void AllReduce::forward(FFModel const &ff) { false /*must*/, 0 /*mapper_id*/, outputs[0]->machine_view.hash()); + launcher.concurrent = true; launcher.add_region_requirement(RegionRequirement(inputs[0]->part, 0 /*projection id*/, READ_ONLY, @@ -212,6 +214,7 @@ void AllReduce::backward(FFModel const &ff) { false /*must*/, 0 /*mapper_id*/, inputs[0]->machine_view.hash()); + // launcher.concurrent = true; launcher.add_region_requirement(RegionRequirement(inputs[0]->part_grad, 0 /*projection id*/, READ_WRITE, @@ -265,6 +268,7 @@ void AllReduce::init_inference(FFModel const &ff, false /*must*/, 0 /*mapper_id*/, machine_view_hash); + launcher.concurrent = true; launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, 0 /*projection id*/, READ_ONLY, @@ -306,6 +310,7 @@ FutureMap AllReduce::inference(FFModel const &ff, false /*must*/, 0 /*mapper_id*/, machine_view_hash); + launcher.concurrent = true; launcher.add_future(bc); launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, 0 /*projection id*/, diff --git a/src/parallel_ops/parallel_identity.cc b/src/parallel_ops/parallel_identity.cc index 883910ae09..7d68036709 100644 --- a/src/parallel_ops/parallel_identity.cc +++ b/src/parallel_ops/parallel_identity.cc @@ -133,6 +133,7 @@ void ParallelIdentity::init(FFModel const &ff) { false /*must*/, 0 /*mapper_id*/, outputs[0]->machine_view.hash()); + launcher.concurrent = true; launcher.add_region_requirement(RegionRequirement(inputs[0]->part, 0 /*projection id*/, READ_ONLY, @@ -214,6 +215,7 @@ void ParallelIdentity::backward(FFModel const &ff) { false /*must*/, 0 /*mapper_id*/, inputs[0]->machine_view.hash()); + launcher.concurrent = true; launcher.add_region_requirement(RegionRequirement(inputs[0]->part_grad, 0 /*projection id*/, READ_WRITE, @@ -268,6 +270,7 @@ void ParallelIdentity::init_inference( false /*must*/, 0 /*mapper_id*/, machine_view_hash); + launcher.concurrent = true; launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, 0 /*projection id*/, READ_ONLY, @@ -381,6 +384,7 @@ FutureMap false /*must*/, 0 /*mapper_id*/, machine_view_hash); + launcher.concurrent = true; launcher.add_future(bc); launcher.add_region_requirement( RegionRequirement(batch_inputs[0]->part_grad, diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 5213633e73..52f1dd2220 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -6888,6 +6888,7 @@ void register_flexflow_internal_tasks(Runtime *runtime, TaskVariantRegistrar registrar(LORA_LINEAR_INIT_TASK_ID, "LoraLinear Init"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); + registrar.set_concurrent(); 
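Aside: the `registrar.set_concurrent()` and `launcher.concurrent = true` lines added throughout this series all encode the requirement stated in the comments further down — tasks that issue NCCL collectives such as `ncclAllReduce` must be running on every participating GPU at the same time, otherwise the collective never completes. Below is a minimal standalone sketch of that requirement using only the public NCCL API in a single multi-GPU process; it is illustrative only (buffer size and names are arbitrary, error handling omitted) and is not FlexFlow code.

```cpp
#include <nccl.h>
#include <cuda_runtime.h>
#include <cstdio>
#include <vector>

// Sketch: an all-reduce only finishes once *every* rank has issued it.
// If a scheduler serialized these per-GPU launches (the situation the
// concurrent flags above are meant to rule out), the first ncclAllReduce
// would block forever waiting for its peers.
int main() {
  int ndev = 0;
  cudaGetDeviceCount(&ndev);

  std::vector<ncclComm_t> comms(ndev);
  ncclCommInitAll(comms.data(), ndev, nullptr);  // one communicator per GPU

  const size_t count = 1 << 20;                  // illustrative element count
  std::vector<float *> buf(ndev);
  std::vector<cudaStream_t> streams(ndev);
  for (int i = 0; i < ndev; i++) {
    cudaSetDevice(i);
    cudaMalloc(&buf[i], count * sizeof(float));
    cudaStreamCreate(&streams[i]);
  }

  // All ranks must reach this collective "concurrently".
  ncclGroupStart();
  for (int i = 0; i < ndev; i++) {
    cudaSetDevice(i);
    ncclAllReduce(buf[i], buf[i], count, ncclFloat, ncclSum,
                  comms[i], streams[i]);
  }
  ncclGroupEnd();

  for (int i = 0; i < ndev; i++) {
    cudaSetDevice(i);
    cudaStreamSynchronize(streams[i]);
    cudaFree(buf[i]);
    ncclCommDestroy(comms[i]);
  }
  printf("all-reduce completed on %d devices\n", ndev);
  return 0;
}
```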
if (pre_register) { Runtime::preregister_task_variant( registrar, "LoraLinear Init Task"); @@ -6919,6 +6920,7 @@ void register_flexflow_internal_tasks(Runtime *runtime, "LoraLinear PEFT Backward"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); + registrar.set_concurrent(); if (pre_register) { Runtime::preregister_task_variant( registrar, "LoraLinear PEFT Backward Task"); @@ -6950,6 +6952,7 @@ void register_flexflow_internal_tasks(Runtime *runtime, TaskVariantRegistrar registrar(FUSEDOP_INIT_TASK_ID, "FusedOp Init"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); + registrar.set_concurrent(); if (pre_register) { Runtime::preregister_task_variant( registrar, "FusedOp Init Task"); @@ -6964,6 +6967,7 @@ void register_flexflow_internal_tasks(Runtime *runtime, TaskVariantRegistrar registrar(FUSEDOP_INF_TASK_ID, "FusedOp Inference"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); + registrar.set_concurrent(); if (pre_register) { Runtime::preregister_task_variant( registrar, "FusedOp Inference Task"); @@ -6979,6 +6983,7 @@ void register_flexflow_internal_tasks(Runtime *runtime, "FusedOp PEFT Backward"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); + registrar.set_concurrent(); if (pre_register) { Runtime::preregister_task_variant( registrar, "FusedOp PEFT Backward Task"); @@ -6994,6 +6999,7 @@ void register_flexflow_internal_tasks(Runtime *runtime, TaskVariantRegistrar registrar(FUSEDOP_FWD_TASK_ID, "FusedOp Forward"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); + registrar.set_concurrent(); if (pre_register) { Runtime::preregister_task_variant( registrar, "FusedOp Forward Task"); @@ -7008,6 +7014,7 @@ void register_flexflow_internal_tasks(Runtime *runtime, TaskVariantRegistrar registrar(FUSEDOP_BWD_TASK_ID, "FusedOp Backward"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); + registrar.set_concurrent(); if (pre_register) { Runtime::preregister_task_variant( registrar, "FusedOp Backward Task"); @@ -7244,6 +7251,7 @@ void register_flexflow_internal_tasks(Runtime *runtime, TaskVariantRegistrar registrar(ALLREDUCE_INIT_TASK_ID, "AllReduce Init"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); + registrar.set_concurrent(); if (pre_register) { Runtime::preregister_task_variant( registrar, "AllReduce init Task"); @@ -7258,6 +7266,9 @@ void register_flexflow_internal_tasks(Runtime *runtime, TaskVariantRegistrar registrar(ALLREDUCE_FWD_TASK_ID, "AllReduce Forward"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); + // AllReduce forward and backward must run concurrently since they + // use ncclAllReduce internally + registrar.set_concurrent(); if (pre_register) { Runtime::preregister_task_variant( registrar, "AllReduce Forward Task"); @@ -7272,6 +7283,9 @@ void register_flexflow_internal_tasks(Runtime *runtime, TaskVariantRegistrar registrar(ALLREDUCE_BWD_TASK_ID, "AllReduce Backward"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); + // AllReduce forward and backward must run concurrently since they + // use ncclAllReduce internally + // registrar.set_concurrent(); if (pre_register) { Runtime::preregister_task_variant( registrar, "AllReduce Backward Task"); @@ -7287,6 +7301,9 @@ void 
register_flexflow_internal_tasks(Runtime *runtime, "AllReduce Inference"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); + // AllReduce forward and backward must run concurrently since they + // use ncclAllReduce internally + registrar.set_concurrent(); if (pre_register) { Runtime::preregister_task_variant( registrar, "AllReduce Inference Task"); @@ -7302,6 +7319,9 @@ void register_flexflow_internal_tasks(Runtime *runtime, "AllReduce PEFT Backward"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); + // AllReduce forward and backward must run concurrently since they + // use ncclAllReduce internally + // registrar.set_concurrent(); if (pre_register) { Runtime::preregister_task_variant( registrar, "AllReduce PEFT Backward Task"); @@ -7318,6 +7338,7 @@ void register_flexflow_internal_tasks(Runtime *runtime, "ParallelIdentity Init"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); + registrar.set_concurrent(); if (pre_register) { Runtime::preregister_task_variant( registrar, "ParallelIdentity init Task"); @@ -7349,6 +7370,7 @@ void register_flexflow_internal_tasks(Runtime *runtime, "ParallelIdentity Backward"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); + registrar.set_concurrent(); if (pre_register) { Runtime::preregister_task_variant( registrar, "ParallelIdentity Backward Task"); @@ -7381,6 +7403,7 @@ void register_flexflow_internal_tasks(Runtime *runtime, "ParallelIdentity PEFT Backward"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); + registrar.set_concurrent(); if (pre_register) { Runtime::preregister_task_variant( registrar, "ParallelIdentity PEFT Backward Task");