From c7da17f1e43c111bf4704bad232d0a5d7c55ceaa Mon Sep 17 00:00:00 2001 From: Liqiang Lu Date: Fri, 21 Apr 2023 05:52:08 -0700 Subject: [PATCH 1/4] patch from old repo --- CMakeLists.txt | 1 + csrc/executor_utils.cpp | 25 +- csrc/ir_internal_nodes.h | 4 + csrc/kernel_cache.cpp | 3 +- csrc/kernel_cache.h | 5 + csrc/maxinfo_propagator.h | 20 + csrc/scheduler/normalization.cpp | 713 ++++++++++++- csrc/scheduler/normalization.h | 3 + csrc/scheduler/normalization_utils.cpp | 162 ++- csrc/scheduler/normalization_utils.h | 30 +- csrc/scheduler/reduction.cpp | 11 +- csrc/scheduler/reduction_heuristic.h | 26 +- csrc/scheduler/reduction_utils.cpp | 150 ++- csrc/scheduler/reduction_utils.h | 36 +- csrc/scheduler/registry.cpp | 249 ++++- csrc/scheduler/utils.h | 27 +- csrc/utils.cpp | 45 +- csrc/utils.h | 9 +- ...est_gpu_combined_inner_outer_reduction.cpp | 957 ++++++++++++++++++ 19 files changed, 2300 insertions(+), 176 deletions(-) create mode 100644 test/test_gpu_combined_inner_outer_reduction.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index f3ee1fa8905..53195ef3985 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -357,6 +357,7 @@ if(BUILD_TEST) ${NVFUSER_ROOT}/test/test_gpu_gather_ops.cpp ${NVFUSER_ROOT}/test/test_gpu_multidevice.cpp ${NVFUSER_ROOT}/test/test_multicluster_fusion.cpp + ${NVFUSER_ROOT}/test/test_gpu_combined_inner_outer_reduction.cpp ) list(APPEND JIT_TEST_CU_SRCS ${NVFUSER_ROOT}/test/test_gpu_rng.cu) diff --git a/csrc/executor_utils.cpp b/csrc/executor_utils.cpp index 255daa63c2a..49347f25294 100644 --- a/csrc/executor_utils.cpp +++ b/csrc/executor_utils.cpp @@ -998,26 +998,11 @@ c10::optional getMaxRegCount( // If the block size is known, set the maximum that at least allows // one block to be resident on an SM if (opt_block_size.has_value() && opt_block_size.value() > 0) { - int num_partition = 0; - int reg_allocation_granularity = 0; - const auto prop = at::cuda::getCurrentDeviceProperties(); - cudaOccDeviceProp occ_prop(*prop); - cudaOccSubPartitionsPerMultiprocessor(&num_partition, &occ_prop); - cudaOccRegAllocationGranularity(®_allocation_granularity, &occ_prop); - int warp_size = prop->warpSize; - int64_t num_warps = ceilDiv(opt_block_size.value(), warp_size); - - // warps could be distributed unevenly across partition - int64_t max_warps_per_sm_partition = ceilDiv(num_warps, num_partition); - // registers are evenly distributed across partitions, partition with most - // wraps determins the maximum register available per warp - int max_reg_per_warp = - prop->regsPerBlock / num_partition / (int)max_warps_per_sm_partition; - // clamp down to register allocation granularity at warp level - int effective_max_reg_per_warp = max_reg_per_warp / - reg_allocation_granularity * reg_allocation_granularity; - max_register = - std::min(max_register_limit, effective_max_reg_per_warp / warp_size); + constexpr int block_per_sm = 1; + max_register = std::min( + max_register_limit, + (int)getRegPerThreadGivenThreadsPerSM( + opt_block_size.value() * block_per_sm)); } // If a heuristic value is given, i.e., max_register_heuristic is diff --git a/csrc/ir_internal_nodes.h b/csrc/ir_internal_nodes.h index 32f8c3e0a3e..e69ddc0ede8 100644 --- a/csrc/ir_internal_nodes.h +++ b/csrc/ir_internal_nodes.h @@ -1498,6 +1498,10 @@ class TORCH_CUDA_CU_API IterDomain : public Val { return getIterType() == IterType::Reduction; } + bool isIteration() const { + return getIterType() == IterType::Iteration; + } + bool isRFactorProduct() const { return is_rfactor_domain_; } diff --git 
a/csrc/kernel_cache.cpp b/csrc/kernel_cache.cpp index 4bb7d47797c..c21e4a119a2 100644 --- a/csrc/kernel_cache.cpp +++ b/csrc/kernel_cache.cpp @@ -466,7 +466,8 @@ std::vector FusionKernelRuntime::runKernelWithInput( most_recent_executor_log_.params = scheduler_entry->params()->clone(); } - if (isDebugDumpEnabled(DebugDumpOption::PerfDebugVerbose)) { + if (isDebugDumpEnabled(DebugDumpOption::PerfDebugVerbose) || + measure_kernel_time_) { executor.setMeasureKernelTimeFlag(true); } diff --git a/csrc/kernel_cache.h b/csrc/kernel_cache.h index ee1484d0dd7..045d2ec5874 100644 --- a/csrc/kernel_cache.h +++ b/csrc/kernel_cache.h @@ -99,6 +99,10 @@ class TORCH_CUDA_CU_API FusionKernelRuntime { profiling_ = to_profile; } + void setMeasureKernelTime(bool val = true) { + measure_kernel_time_ = val; + } + //! Internal knob for profiling shape inference void disableLaunchParamCache() { for (auto& executor : executors_) { @@ -230,6 +234,7 @@ class TORCH_CUDA_CU_API FusionKernelRuntime { // States for profiling support bool profiling_ = false; + bool measure_kernel_time_ = false; std::mutex mutex_; diff --git a/csrc/maxinfo_propagator.h b/csrc/maxinfo_propagator.h index 63d6808649c..780b6a7d087 100644 --- a/csrc/maxinfo_propagator.h +++ b/csrc/maxinfo_propagator.h @@ -280,4 +280,24 @@ class TORCH_CUDA_CU_API SetSelector : public MaxInfoSpanningTree::Selector { } }; +// Simple selector to allow different parallel patterns in the fusion. +// The propagation is blocked at boundaryNodesSet. +// For P2C forward propagate, disable propagation to tensorViews in +// boundaryNodesSet. For C2P backward propagate, disable propagation from +// tensorViews in boundaryNodesSet +struct InternalBoundarySelector : public MaxInfoSpanningTree::Selector { + std::unordered_set tvs_; + virtual bool allowC2P(TensorView* from, TensorView* to) override { + return tvs_.count(from) == 0; + }; + virtual bool allowP2C(TensorView* from, TensorView* to) override { + return tvs_.count(to) == 0; + }; + virtual bool allowSibling(TensorView* from, TensorView* to) override { + return true; + } + InternalBoundarySelector(const std::unordered_set& tvs) + : tvs_(tvs) {} +}; + } // namespace nvfuser diff --git a/csrc/scheduler/normalization.cpp b/csrc/scheduler/normalization.cpp index fc8ed02e13f..62092d77465 100644 --- a/csrc/scheduler/normalization.cpp +++ b/csrc/scheduler/normalization.cpp @@ -9,6 +9,7 @@ #include #include +#include #include #include #include @@ -28,17 +29,271 @@ namespace nvfuser { namespace { -// round up to multiple of 8 or pow2 whichever smaller -int64_t roundUpPow2Or8(const int64_t x) { - auto round_up_pow2 = scheduler_utils::lastPow2(x); - if (round_up_pow2 < x) { - round_up_pow2 *= 2; +// The innerOuterPersistentHeuristic is tuned for layer_norm backward on A100 +// ======= Method if hidden_size > 1024 ======= +// (1) Inner reduction is one reduction per block. Reduction domain is +// parallelized by TIDx and TIDy, Iteration domain is parallelized by BIDy. (2) +// Outer reduction is done in two-steps. The first step is partial reduction, +// reduction domain is parallelized by BIDy, iteration domain is parallelized by +// TIDx and TIDy. The partial results are written to gmem followed by a grid +// sync. The second step is block reduction, the reduction domain is +// parallelized by TIDy, the iteration domain is parallelized by TIDx and BIDy. +// ======= Method if hidden_size <= 1024 ======= +// (1) Inner reduction is multi-reductions per blocks. 
Reduction domain is +// parallelized by TIDx, Iteration domain is parallelized by BIDy and TIDy +// (2) Outer reduction is same to cases where hidden_size > 1024 except the +// second step where in this case, the reduction domain is parallelized by TIDx +// and the iteration domain is parallelized by TIDy and BIDy. This switch +// between TIDx and TIDy is because (a) We can do warp reduction with TIDx and +// (b) TIDx*BIDy is usually much larger than hidden_size, e.g. 128*216 = 1024*27 +// this means without switch only 1/27 of the threads is used. +std::shared_ptr innerOuterPersistentHeuristic( + const int64_t outer_dim_numel, + const int64_t inner_dim_numel, + const int64_t max_persistent_buffer_size, + const size_t tmp_gmem_dtype_size, + const size_t vectorize_factor) { + auto rparams = std::make_shared(); + // Parameters for inner reduction: + // Reduction dim: inner_vect, inner_batch, bdimx and bdimy + // Iteration dim: gdimy + + // Parameters for outer reduction: + // Reduction dim: bdimy + // Iteration dim: vectorization_factor_outer, bdimx, gdimy + struct InnerOuterParams { + int64_t inner_vect = -1; + int64_t inner_batch = -1; + int64_t bdimx = -1; + int64_t bdimy = -1; + int64_t gdimy = -1; + int64_t tmp_gmem_write_vect = -1; + int64_t vectorization_factor_outer = -1; + + void verify() { + TORCH_INTERNAL_ASSERT(inner_vect != -1, "inner_vect is not set."); + TORCH_INTERNAL_ASSERT(inner_batch != -1, "inner_batch is not set."); + TORCH_INTERNAL_ASSERT(bdimx != -1, "bdimx is not set."); + TORCH_INTERNAL_ASSERT(bdimy != -1, "bdimy is not set."); + TORCH_INTERNAL_ASSERT(gdimy != -1, "gdimy is not set."); + TORCH_INTERNAL_ASSERT( + tmp_gmem_write_vect != -1, "tmp_gmem_write_vect is not set."); + TORCH_INTERNAL_ASSERT( + vectorization_factor_outer != -1, + "vectorization_factor_outer is not set."); + } + }; + + InnerOuterParams iop; + + // Set a minimum workload for each thread to take advantage of low + // intra-threads communication cost. Tuned for layer_norm backward on A100. + auto getMinimumBatch = [&]() { + int batch_min; + if (inner_dim_numel >= 3072) { + if (outer_dim_numel <= 2048 && inner_dim_numel == 3072) { + batch_min = 3; + } else { + batch_min = 4; + } + } else if (inner_dim_numel >= 2048) { + batch_min = 2; + } else { + batch_min = 1; + } + return batch_min; + }; + + // Estimate register per thread based on buffer size, since inner reduction + // dim is fully parallelized, the buffer size of each thread equals the total + // buffer size divide by inner_dim_numel. 
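// Worked example of the estimate below (numbers are illustrative, not taken
// from a real run): with max_persistent_buffer_size = 32 KiB,
// inner_dim_numel = 4096 and batch_mul_vect = 8, each thread holds
// 32768 / 4096 * 8 = 64 bytes, i.e. 64 / 4 = 16 registers, plus the assumed
// 40-register overhead, giving an estimate of 56 registers per thread
// (capped at 255).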
+ auto getEstimatedRegisterUsage = [&](int64_t batch_mul_vect) { + constexpr int64_t overhead_register = 40; + constexpr int64_t bytes_per_register = 4; + const int64_t persistent_buffer_size = + max_persistent_buffer_size / inner_dim_numel * batch_mul_vect; + const int64_t estimated_register_count = + persistent_buffer_size / bytes_per_register + overhead_register; + return std::min(estimated_register_count, (int64_t)255); + }; + + auto getBlocksPerSM = [&](const int64_t threads_per_sm, + const int64_t threads_per_block, + const int64_t warp_size) { + constexpr int64_t warp_allocation_granularity = 4; + const int64_t allocated_warps_per_block = + ceilDiv( + ceilDiv(threads_per_block, warp_size), + warp_allocation_granularity) * + warp_allocation_granularity; + return scheduler_utils::safeDiv( + threads_per_sm / warp_size, allocated_warps_per_block); + }; + + const auto dev_prop = at::cuda::getCurrentDeviceProperties(); + const int64_t device_multiprocessor_count = + (int64_t)dev_prop->multiProcessorCount; + + // Step-1, set InnerParams reduction dim: inner_vect, inner_batch, + // threads_per_block (bdimx * bdimy). Start threads_per_block from a quarter + // warp, gradually increase it. Runtime checkCombinedReductionShape ensures + // inner_dim_numel is dividable by the multiplication of a quarter warp and + // vectorize_factor. + int64_t threads_per_block = dev_prop->warpSize / 4; + iop.inner_vect = vectorize_factor; + iop.inner_batch = inner_dim_numel / iop.inner_vect / threads_per_block; + TORCH_INTERNAL_ASSERT( + iop.inner_vect * iop.inner_batch * threads_per_block == inner_dim_numel, + " inner_dim_numel must be dividable by the multiplication of a quarter warp and vectorize_factor"); + const int64_t threads_per_block_max = inner_dim_numel >= 20480 ? 512 : 256; + const int64_t batch_min = getMinimumBatch(); + auto tryReduceBatch = [&](auto factor) -> bool { + return iop.inner_batch % factor == 0 && + iop.inner_batch / factor >= batch_min && + threads_per_block * factor <= threads_per_block_max; + }; + while (iop.inner_batch > batch_min && + threads_per_block < threads_per_block_max) { + bool modified = false; + for (auto factor : {2, 3, 5}) { + if (tryReduceBatch(factor)) { + iop.inner_batch /= factor; + threads_per_block *= factor; + modified = true; + break; + } + } + if (!modified) { + break; + } } - constexpr int64_t kEight = 8; // clang tidy - auto round_up_8 = x % kEight == 0 ? x : x + (kEight - x % kEight); - return std::min(round_up_8, round_up_pow2); -} + // Step-2, set InnerParams Iteration dim: gdimy. reg_per_thread is estimated + // from buffer size, then it is used to calculate threads_per_sm and gdimy. + // gdimy_max ensures each block processes at least 8 rows to + // reduce the workload of the final outer reduction. + int64_t reg_per_thread = + getEstimatedRegisterUsage(iop.inner_vect * iop.inner_batch); + int64_t threads_per_sm = getThreadsPerSMGivenRegPerThread(reg_per_thread); + int64_t blocks_per_sm = + getBlocksPerSM(threads_per_sm, threads_per_block, dev_prop->warpSize); + iop.gdimy = blocks_per_sm * device_multiprocessor_count; + const int64_t outer_iter_min = 8; + const int64_t gdimy_max = scheduler_utils::roundUpToN( + ceilDiv(outer_dim_numel, outer_iter_min), device_multiprocessor_count); + while (iop.gdimy > gdimy_max && blocks_per_sm > 1) { + blocks_per_sm -= 1; + iop.gdimy = blocks_per_sm * device_multiprocessor_count; + } + + // set the vectorization factor for the write to tmp gmem, may be different + // from inner_vect due to different data types, e.g. 
input is half and + // tmp_gmem is float + constexpr int64_t max_gmem_vect_access_bytes = 16; + const int64_t max_tmp_gmem_vect_factor = + max_gmem_vect_access_bytes / tmp_gmem_dtype_size; + iop.tmp_gmem_write_vect = std::min(max_tmp_gmem_vect_factor, iop.inner_vect); + + // Step-3, set OuterParams Iteration dim: vectorization_factor_outer, bdimx, + // gdimy (already done) The partial outer reduction result is stored in tmp + // gmem, set the vectorization factor for write and read + const int64_t workload_per_thread = inner_dim_numel >= 4096 ? 4 : 2; + iop.vectorization_factor_outer = + std::min(workload_per_thread, max_tmp_gmem_vect_factor); + iop.bdimx = scheduler_utils::roundUpPow2( + ceilDiv(inner_dim_numel / iop.vectorization_factor_outer, iop.gdimy)); + + // Step-4, set OuterParams Reduction dim: bdimy. + iop.bdimy = ceilDiv(threads_per_block, iop.bdimx); + + // Step-5, special case, when inner_dim_numel <= 1024, bdimx is usually small + // after divide by inner_vect and inner_batch. In this case, bdimy is used to + // parallelize outer_dim instead of inner_dim. This pattern is named multi + // reductions per block (mrpb). + if (inner_dim_numel <= 1024) { + rparams->multiple_reds_per_blk = true; + rparams->tidx_for_outer_reduction = true; + constexpr int64_t threads_per_block_mrpb = 512; + + // Step-1, InnerParams, Reduction dim: inner_vect(reuse), inner_batch, bdimx + iop.inner_batch = 1; + iop.bdimx = inner_dim_numel / iop.inner_vect; + + // Step-2, InnerParams, Iteration dim: gdimy, bdimy (in next step) + reg_per_thread = + getEstimatedRegisterUsage(iop.inner_vect * iop.inner_batch); + threads_per_sm = getThreadsPerSMGivenRegPerThread(reg_per_thread); + blocks_per_sm = getBlocksPerSM( + threads_per_sm, threads_per_block_mrpb, dev_prop->warpSize); + iop.gdimy = blocks_per_sm * device_multiprocessor_count; + + // Step-3, OuterParams, Iteration dim: vectorization_factor_outer(reuse), + // bdimy, gdimy (in previous step). vectorization_factor_outer is set to 2 + // as a small workload per thread is preferred for small sizes and we only + // process vectorized cases. + iop.bdimy = std::min( + ceilDiv(inner_dim_numel / iop.vectorization_factor_outer, iop.gdimy), + scheduler_utils::safeDiv(threads_per_block_mrpb, iop.bdimx)); + iop.bdimy = iop.bdimy; + + // Step-4, OuterParams, Reduction dim: bdimx (already done) + + if (iop.bdimx % dev_prop->warpSize == 0) { + rparams->pad_inner_reduction_to_warp = true; + rparams->pad_outer_reduction_to_warp = true; + } + rparams->block_dim_iter_dom = ParallelType::TIDy; + } else { + rparams->block_dim_inner_reduction_extra = ParallelType::TIDy; + } + + // check all the parameters in InnerOuterParams are set. + iop.verify(); + + rparams->persistent_kernel = true; + rparams->fastest_dim = true; + rparams->combined_inner_outer = true; + // tmp_gmem is the intermediate result of outer reduction, its dtype is float, + // so the maximum vectorization factor is 4. 
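// For example, with max_gmem_vect_access_bytes = 16 the float partial
// results give 16 / 4 = 4, while a half-precision input allows an input
// vectorization of 16 / 2 = 8; tmp_gmem_write_vect is therefore clamped to
// min(max_tmp_gmem_vect_factor, inner_vect) above.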
+ rparams->vectorization_factor_outer = iop.vectorization_factor_outer; + rparams->vectorization_factor_tmp_gmem_write = iop.tmp_gmem_write_vect; + rparams->cparams.maxrregcount = + getRegPerThreadGivenThreadsPerSM(iop.bdimx * iop.bdimy * blocks_per_sm); + rparams->unroll_factor_inner_reduction = iop.inner_vect; + rparams->batches_per_block_inner_reduction = iop.inner_batch; + rparams->block_dim_inner_reduction = ParallelType::TIDx; + rparams->vectorize_inner_reduction = iop.inner_vect > 1; + rparams->split_grid_dim_iter_dom_outer = true; + rparams->grid_dim_iter_dom = ParallelType::BIDy; + rparams->lparams = LaunchParams( + LaunchParams::UNINITIALIZED_VAL, + iop.gdimy, + LaunchParams::UNINITIALIZED_VAL, + iop.bdimx, + iop.bdimy, + LaunchParams::UNINITIALIZED_VAL); + + rparams->tag = "InnerOuter Persistent Heuristic.\n"; + + if (isDebugDumpEnabled(DebugDumpOption::SchedulerDebug)) { + std::cerr << "\n===== Combined InnerOuter Reduction Stats ========\n" + << "outer_dim_numel: " << outer_dim_numel << "\n" + << "inner_dim_numel: " << inner_dim_numel << "\n" + << "vectorize_factor_input: " << iop.inner_vect << "\n" + << "vectorization_factor_tmp_gmem_write: " + << iop.tmp_gmem_write_vect << "\n" + << "vectorization_factor_outer: " + << iop.vectorization_factor_outer << "\n" + << "multiple_reds_per_blk: " << rparams->multiple_reds_per_blk + << "\n" + << "threads_per_sm: " << threads_per_sm << "\n" + << "gdimy: " << iop.gdimy << "\n" + << "block(" << (iop.bdimx) << ", " << iop.bdimy << ", " << 1 + << ")"; + std::cerr << rparams->toString() << std::endl; + } + return rparams; +} // Copied from reduction scheduler, should generalize. Simply needed to take out // grid reductions. std::shared_ptr innerPersistentHeuristic( @@ -297,21 +552,22 @@ std::shared_ptr innerPersistentHeuristic( while (!vectorize && inner_reduction_unroll_factor < max_unroll && batches_per_block_inner_reduction >= 2) { inner_reduction_unroll_factor *= 2; - batches_per_block_inner_reduction = roundUpPow2Or8(ceilDiv( + batches_per_block_inner_reduction = scheduler_utils::roundUpPow2Or8(ceilDiv( inner_most_dimension_numel, bdimx * inner_reduction_unroll_factor)); } // Set size of persistent per thread buffer on outer reduction buffer - int64_t batches_per_block_outer_reduction = roundUpPow2Or8(ceilDiv( - ceilDiv(total_reduction_numel, inner_most_dimension_numel), - bdimz * outer_reduction_unroll_factor)); + int64_t batches_per_block_outer_reduction = + scheduler_utils::roundUpPow2Or8(ceilDiv( + ceilDiv(total_reduction_numel, inner_most_dimension_numel), + bdimz * outer_reduction_unroll_factor)); // Prefer putting iterations into unrolling over having a very large // persistent buffer. 
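// roundUpPow2Or8 (now called from scheduler_utils) rounds x up to the next
// multiple of 8 or the next power of two, whichever is smaller. A rough
// sketch, assuming lastPow2(x) returns the largest power of two <= x:
//   pow2  = lastPow2(x) < x ? 2 * lastPow2(x) : x;
//   mult8 = x % 8 == 0 ? x : x + (8 - x % 8);
//   return std::min(mult8, pow2);  // e.g. x = 17 -> min(24, 32) = 24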
while (outer_reduction_unroll_factor < max_unroll && batches_per_block_outer_reduction >= 2) { outer_reduction_unroll_factor *= 2; - batches_per_block_outer_reduction = roundUpPow2Or8( + batches_per_block_outer_reduction = scheduler_utils::roundUpPow2Or8( ceilDiv(outer_reduction_numel, bdimz * outer_reduction_unroll_factor)); } @@ -372,10 +628,11 @@ std::shared_ptr innerPersistentHeuristic( // reduction if (batches_per_block_outer_reduction >= 2 && batches_per_block_outer_reduction != - roundUpPow2Or8(batches_per_block_outer_reduction / 2) && + scheduler_utils::roundUpPow2Or8( + batches_per_block_outer_reduction / 2) && bdimz * 2 <= scheduler_utils::z_block_limit) { - batches_per_block_outer_reduction = - roundUpPow2Or8(batches_per_block_outer_reduction / 2); + batches_per_block_outer_reduction = scheduler_utils::roundUpPow2Or8( + batches_per_block_outer_reduction / 2); bdimz = ceilDiv( outer_reduction_numel, batches_per_block_outer_reduction * outer_reduction_unroll_factor); @@ -450,9 +707,9 @@ std::shared_ptr innerPersistentHeuristic( estimated_register_count * device_warp_size, reg_allocation_granularity) * reg_allocation_granularity; - const int threadsPerBlock = + const int threads_per_block = (pad_bdimx ? padded_bdimx : bdimx) * bdimy * bdimz; - const int warps_per_block = ceilDiv(threadsPerBlock, dev_prop->warpSize); + const int warps_per_block = ceilDiv(threads_per_block, dev_prop->warpSize); const int estimated_warps_per_sm = dev_prop->regsPerMultiprocessor / (register_per_warp * warps_per_block) * warps_per_block; const int occupancy_warps_per_sm = static_cast( @@ -822,7 +1079,7 @@ std::shared_ptr outerPersistentHeuristic( int64_t batches_per_block = ceilDiv(total_reduction_numel, bdimy * inner_reduction_unroll_factor); - batches_per_block = roundUpPow2Or8(batches_per_block); + batches_per_block = scheduler_utils::roundUpPow2Or8(batches_per_block); // Adjust bdimy based on batches_per_block and unroll factor set bdimy = ceilDiv( @@ -837,8 +1094,9 @@ std::shared_ptr outerPersistentHeuristic( // And batches_per_block can be divided by two batches_per_block >= 2 && // Make sure batches_per_block will be updated - batches_per_block != roundUpPow2Or8(batches_per_block / 2)) { - batches_per_block = roundUpPow2Or8(batches_per_block / 2); + batches_per_block != + scheduler_utils::roundUpPow2Or8(batches_per_block / 2)) { + batches_per_block = scheduler_utils::roundUpPow2Or8(batches_per_block / 2); // Adjust bdimy based on batches_per_block and unroll factor set bdimy = ceilDiv( @@ -936,11 +1194,22 @@ std::shared_ptr persistentHeuristic( const bool fastest_dim_reduction, const size_t n_tensor_inputs, const size_t max_input_dtype_size, + const size_t tmp_gmem_dtype_size, const int64_t max_persistent_buffer_size, size_t vectorize_factor, - bool project_persistent_buffers) { + bool project_persistent_buffers, + const bool combined_inner_outer_reduction) { std::shared_ptr rparams; - if (fastest_dim_reduction) { + if (combined_inner_outer_reduction) { + const int64_t outer_dim_numel = total_iteration_numel; + const int64_t inner_dim_numel = inner_most_dimension_numel; + rparams = innerOuterPersistentHeuristic( + outer_dim_numel, + inner_dim_numel, + max_persistent_buffer_size, + tmp_gmem_dtype_size, + vectorize_factor); + } else if (fastest_dim_reduction) { rparams = innerPersistentHeuristic( total_reduction_numel, total_iteration_numel, @@ -967,7 +1236,6 @@ std::shared_ptr getPersistentHeuristics( SchedulerRuntimeInfo& runtime_info, HeuristicSummary* data_cache) { 
FUSER_PERF_SCOPE("getPersistentHeuristics"); - FusionGuard fg(fusion); auto reduction_tv_entry = @@ -1000,6 +1268,20 @@ std::shared_ptr getPersistentHeuristics( std::distance(tv_inps.begin(), tv_inps.end()) > 0, "Tried to schedule a fusion with no tensor inputs, currently not supported."); + int64_t n_tensor_inner_reduction = 0; + int64_t n_tensor_outer_reduction = 0; + std::vector outer_reduction_tvs; + for (auto tv : reduction_tvs) { + if (scheduler_utils::isFastestDimReduction(tv)) { + n_tensor_inner_reduction++; + } else { + n_tensor_outer_reduction++; + outer_reduction_tvs.emplace_back(tv); + } + } + const bool combined_inner_outer_reduction = + n_tensor_inner_reduction && n_tensor_outer_reduction; + auto persistent_buffer_info_entry = HeuristicSummaryEntry( data_cache, [&fusion]() { @@ -1046,6 +1328,45 @@ std::shared_ptr getPersistentHeuristics( persistent_buffer_size_info.projected_persistent_buffer_size < persistent_buffer_size_info.persistent_buffer_size; + if (combined_inner_outer_reduction) { + // In combined_inner_outer_reduction, we have additional buffers for partial + // results of outer reductions. + int64_t outer_reduction_buffer_size = + normalization_scheduler_utils::partialReductionBufferSize( + outer_reduction_tvs, runtime_info); + + // for layer_norm backward, enable project to input can reuse weight shared + // among different rows. Although it increased register usage and may lead + // to register spills, the overall performance is increased. The following + // code will check if we can do this projection by allowing more registers. + // This is a temporary solution, the issue is tracked by + // https://github.com/csarofeen/pytorch/issues/2525 + if (!project_persistent_buffers) { + int64_t total_projected_buffer_size = + persistent_buffer_size_info.projected_persistent_buffer_size + + outer_reduction_buffer_size; + // allow 10% more to allow project to input, 14K float should do project + // and 16K float should't do. more_register_factor >= 14*1024*5(three + // inputs, two outer reduction results)*sizeof(float) / + // register_file_size_full + constexpr float more_register_factor = 1.1; + const int64_t avilable_register_file_size = + scheduler_utils::register_file_size_full * more_register_factor; + if (avilable_register_file_size >= total_projected_buffer_size) { + project_persistent_buffers = true; + } + } + // now we have the final decision on whether we project to input or not. + if (project_persistent_buffers) { + max_persistent_size = + persistent_buffer_size_info.projected_persistent_buffer_size + + outer_reduction_buffer_size; + } else { + max_persistent_size = persistent_buffer_size_info.persistent_buffer_size + + outer_reduction_buffer_size; + } + } + auto unrollable_inputs_outputs_entry = HeuristicSummaryEntry( data_cache, [&first_red_tv]() { @@ -1081,6 +1402,11 @@ std::shared_ptr getPersistentHeuristics( n_tensor_inputs++; } + // dtype used to store partial outer reduction in combined reduction + const size_t tmp_gmem_dtype_size = combined_inner_outer_reduction + ? 
dataTypeSize(outer_reduction_tvs[0]->getDataType().value()) + : dataTypeSize(first_red_tv->getDataType().value()); + // Protect heuristics div by 0: n_tensor_inputs = std::max(n_tensor_inputs, (size_t)1); @@ -1091,9 +1417,11 @@ std::shared_ptr getPersistentHeuristics( properties.fastest_dim_reduction, n_tensor_inputs, max_dtype_size, + tmp_gmem_dtype_size, max_persistent_size, vectorize_factor, - project_persistent_buffers); + project_persistent_buffers, + combined_inner_outer_reduction); heuristic->cparams.index_type = runtime_info.getIndexType(); return heuristic; } @@ -1107,19 +1435,20 @@ std::shared_ptr getPersistentHeuristics( return getPersistentHeuristics(fusion, runtime_info, data_cache); } -// fusion is the input IR that will be modified by this function -void schedulePersistentKernel(Fusion* fusion, const ReductionParams& rparams) { - FUSER_PERF_SCOPE("schedulePersistentKernel"); - - FusionGuard fg(fusion); - +// common prepare for both inner outer combined and seperated reductions +void beforeSchedule( + Fusion* fusion, + const ReductionParams& rparams, + std::vector& dummy_outputs, + std::vector& cached_inputs, + std::vector& reduction_tvs, + std::vector>& cached_outputs) { // Project the persistent buffers to the inputs. Inputs will be cached in a // later step, this will move them to be in a register buffer as expected. // dummy outputs are helper tensors to make sure persistent buffer projection // does not create trouble for transform propagation. // TODO: Fix projected persistent buffers with view // https://github.com/csarofeen/pytorch/issues/2054 - std::vector dummy_outputs; if (rparams.project_persistent_buffers && ir_utils::getViewOps(fusion).empty()) { dummy_outputs = reduction_scheduler_utils::projectPersistentBuffers(fusion); @@ -1133,19 +1462,26 @@ void schedulePersistentKernel(Fusion* fusion, const ReductionParams& rparams) { // Cache inputs even if not unrolled, as otherwise we may not create a // persistent buffer if that persistent buffer would be the input. - auto cached_inputs = scheduler_utils::cacheInputs(fusion, true); + cached_inputs = scheduler_utils::cacheInputs(fusion, true); // Cache and fork outputs - auto cached_outputs = scheduler_utils::cacheAndForkOutputs(fusion, unroll); + cached_outputs = scheduler_utils::cacheAndForkOutputs(fusion, unroll); // Make sure we don't have global memory set on intermediate tensors from // fusion segmentation scheduler_utils::clearMemorySpace(fusion); - scheduler_utils::prepareForMemoryTypePromotion(fusion); + reduction_tvs = scheduler_utils::getReductionTvs(fusion); +} - auto reduction_tvs = scheduler_utils::getReductionTvs(fusion); - +// If called from schedulePersistentKernel, reduction_tvs are either inner +// reductions or outer reductions. If called from +// schedulePersistentKernelInnerOuter, reduction_tvs are inner reductions, outer +// reductions are handled by scheduleCombinedOuter. +TensorView* scheduleReductionGeneral( + Fusion* fusion, + const ReductionParams& rparams, + std::vector& reduction_tvs) { TORCH_INTERNAL_ASSERT(reduction_tvs.size()); // Registry assumes the reference tv is the first reduction_tv, if this // changes registry needs to change. 
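// For orientation, the kind of fusion the combined inner-outer path targets,
// written as a hedged sketch in the style of the combined-reduction tests
// (tensor names and helpers such as makeContigTensor are illustrative):
//   auto tv0 = makeContigTensor(2, DataType::Half);  // 2D input
//   auto tv1 = castOp(DataType::Float, tv0);
//   auto tv2 = sum(tv1, {1});  // inner (fastest-dim) reduction: [I, R]
//   auto tv3 = sum(tv1, {0});  // outer reduction: [R, I]
// Here tv2 takes the path through this function, while tv3 and its partial
// results are handled by scheduleReductionCombinedOuter.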
@@ -1163,7 +1499,8 @@ void schedulePersistentKernel(Fusion* fusion, const ReductionParams& rparams) { } if (rparams.persistent_kernel && rparams.cross_grid_inner_reduction && - !rparams.fastest_dim && reduction_tvs.size() > 1) { + !rparams.fastest_dim && reduction_tvs.size() > 1 && + !rparams.combined_inner_outer) { groupReductions(reduction_tvs, false); } @@ -1182,24 +1519,55 @@ void schedulePersistentKernel(Fusion* fusion, const ReductionParams& rparams) { "If all dims are reduction, should be sending it to fastest dim scheduler."); } - TensorView* reference_tv = reduction_scheduler_utils::scheduleReductionTV( + return reduction_scheduler_utils::scheduleReductionTV( rparams, reduction_tv, has_iter_axis); +} + +// fusion is the input IR that will be modified by this function +void schedulePersistentKernel(Fusion* fusion, const ReductionParams& rparams) { + FUSER_PERF_SCOPE("schedulePersistentKernel"); + if (rparams.combined_inner_outer) { + return schedulePersistentKernelInnerOuter(fusion, rparams); + } + FusionGuard fg(fusion); + + // Grab the reduction, input, and output tensor views. dummy_outputs are + // helper tensors for persistent buffer projection. + std::vector dummy_outputs, cached_inputs, reduction_tvs; + std::vector> cached_outputs; + beforeSchedule( + fusion, + rparams, + dummy_outputs, + cached_inputs, + reduction_tvs, + cached_outputs); + + TensorView* reference_tv = + scheduleReductionGeneral(fusion, rparams, reduction_tvs); // Reduction tensor views and rfactor tensor views are setup. Let's finish off // the scheduling, particularly inlining and unrolling. TORCH_INTERNAL_ASSERT( - reference_tv != nullptr && reduction_tv != nullptr, + reference_tv != nullptr && reduction_tvs[0] != nullptr, "Need these two tensor views to finish the scheduling."); for (auto output : dummy_outputs) { fusion->addOutput(output); } + const bool unroll = rparams.isUnrolled(); + const bool vectorize = + rparams.vectorize_inner_reduction || rparams.vectorize_iter_dom; + const bool is_outer_grid_persistence = rparams.persistent_kernel && + rparams.cross_grid_inner_reduction && !rparams.fastest_dim; reduction_scheduler_utils::multiReductionInliner( fusion, - rparams, - reduction_tv, + reduction_tvs[0], reference_tv, + unroll, + vectorize, + is_outer_grid_persistence, reduction_tvs, cached_inputs, cached_outputs, @@ -1218,4 +1586,263 @@ void schedulePersistentKernel(Fusion* fusion, const ReductionParams& rparams) { fusion, cached_inputs); } +void scheduleReductionCombinedOuter( + Fusion* fusion, + const ReductionParams& rparams, + const std::vector& outer_reduction_tvs, + std::vector& cached_gmem, + std::vector& cached_gmem_reload, + std::vector& outer_reference_tvs, + std::unordered_set& boundaryNodesSet) { + auto mergeReductionOrIterDomains = [](TensorView* tv, bool mergeReduction) { + int prev_i = -1; + for (int i = static_cast(tv->nDims()) - 1; i >= 0; i--) { + if (mergeReduction == tv->axis(i)->isReduction()) { + if (prev_i == -1) { + prev_i = i; + } else { + tv->merge(i, prev_i); + prev_i = i; + } + } + } + }; + for (auto& outer_reduction_tv : outer_reduction_tvs) { + // merge tensorview to [reduction, iteraiton] domains + mergeReductionOrIterDomains(outer_reduction_tv, true); + mergeReductionOrIterDomains(outer_reduction_tv, false); + if (rparams.multiple_reds_per_blk) { + outer_reduction_tv->split( + 0, NamedScalar::getParallelDim(rparams.block_dim_iter_dom)); + } + outer_reduction_tv->split( + 0, NamedScalar::getParallelDim(rparams.grid_dim_iter_dom), false); + + if 
(rparams.multiple_reds_per_blk) { + outer_reduction_tv->rFactor({1}); + } + TensorView* partialResult = outer_reduction_tv->rFactor({1}); + partialResult->cacheBefore(); + partialResult->setMemoryType(MemoryType::Global); + TensorView* partialResultReload = partialResult->cacheAfter(); + + boundaryNodesSet.insert(partialResultReload); + cached_gmem.emplace_back(partialResult); + cached_gmem_reload.emplace_back(partialResultReload); + + if (rparams.multiple_reds_per_blk) { + if (rparams.tidx_for_outer_reduction) { + outer_reduction_tv->split( + 0, NamedScalar::getParallelDim(ParallelType::TIDx)); + outer_reduction_tv->axis(1)->parallelize(ParallelType::TIDx); + // to use warp reduction + if (rparams.pad_outer_reduction_to_warp) { + outer_reduction_tv->axis(1)->padToMultipleOfWarp(); + } + } else { + outer_reduction_tv->split( + 0, NamedScalar::getParallelDim(ParallelType::TIDy)); + outer_reduction_tv->axis(1)->parallelize(ParallelType::TIDy); + } + // iteration domain + int axisID = -1; + if (rparams.vectorization_factor_outer > 1) { + outer_reduction_tv->split(axisID, rparams.vectorization_factor_outer); + outer_reduction_tv->axis(axisID--)->parallelize( + ParallelType::Vectorize); + } + + if (rparams.tidx_for_outer_reduction) { + outer_reduction_tv->split( + axisID, NamedScalar::getParallelDim(ParallelType::TIDy)); + outer_reduction_tv->axis(axisID--)->parallelize(ParallelType::TIDy); + } else { + outer_reduction_tv->split( + axisID, NamedScalar::getParallelDim(ParallelType::TIDx)); + outer_reduction_tv->axis(axisID--)->parallelize(ParallelType::TIDx); + } + + outer_reduction_tv->split( + axisID, NamedScalar::getParallelDim(ParallelType::BIDy)); + outer_reduction_tv->axis(axisID--)->parallelize(ParallelType::BIDy); + + } else { + // reduction domain + outer_reduction_tv->split( + 0, NamedScalar::getParallelDim(ParallelType::TIDy)); + outer_reduction_tv->axis(1)->parallelize(ParallelType::TIDy); + + // iteration domain + int axisID = -1; + if (rparams.vectorization_factor_outer > 1) { + outer_reduction_tv->split(axisID, rparams.vectorization_factor_outer); + outer_reduction_tv->axis(axisID--)->parallelize( + ParallelType::Vectorize); + } + + if (rparams.lparams.bdimx() > 1) { + outer_reduction_tv->split( + axisID, NamedScalar::getParallelDim(ParallelType::TIDx)); + outer_reduction_tv->axis(axisID--)->parallelize(ParallelType::TIDx); + } + + outer_reduction_tv->split( + axisID, NamedScalar::getParallelDim(ParallelType::BIDy)); + + outer_reduction_tv->axis(axisID--)->parallelize(ParallelType::BIDy); + } + auto outer_reference_tv = + reduction_scheduler_utils::sortAndRFactor(outer_reduction_tv); + outer_reference_tvs.emplace_back(outer_reference_tv); + } +} + +void schedulePersistentKernelInnerOuter( + Fusion* fusion, + const ReductionParams& rparams) { + FUSER_PERF_SCOPE("schedulePersistentKernelInnerOuter"); + + FusionGuard fg(fusion); + + // Grab the reduction, input, and output tensor views. dummy_outputs are + // helper tensors for persistent buffer projection. 
+ std::vector dummy_outputs, cached_inputs, reduction_tvs; + std::vector> cached_outputs; + beforeSchedule( + fusion, + rparams, + dummy_outputs, + cached_inputs, + reduction_tvs, + cached_outputs); + + // split reduction_tvs into inner and outer reduction_tvs + std::vector inner_reduction_tvs, outer_reduction_tvs; + for (auto tv : reduction_tvs) { + if (scheduler_utils::isFastestDimReduction(tv)) { + inner_reduction_tvs.emplace_back(tv); + } else { + outer_reduction_tvs.emplace_back(tv); + } + } + TORCH_INTERNAL_ASSERT( + !inner_reduction_tvs.empty(), + "schedulePersistentKernelInnerOuter is called but no inner reduction is found."); + TORCH_INTERNAL_ASSERT( + !outer_reduction_tvs.empty(), + "schedulePersistentKernelInnerOuter is called but no outer reduction is found."); + + // schedule inner reduction, only schedule the first inner reduction tv, then + // will be propagated to other inner reduction tvs. + TensorView* inner_reference_tv = + scheduleReductionGeneral(fusion, rparams, inner_reduction_tvs); + + // schedule outer reduction, schedule all the outer reduction tvs since we + // need to store the intermediate results. + std::vector cached_gmem; + std::vector cached_gmem_reload; + std::vector outer_reference_tvs; + std::unordered_set boundaryNodesSet; + scheduleReductionCombinedOuter( + fusion, + rparams, + outer_reduction_tvs, + cached_gmem, + cached_gmem_reload, + outer_reference_tvs, + boundaryNodesSet); + + // Propagate inner reduction and outer reductions + for (auto output : dummy_outputs) { + fusion->addOutput(output); + } + + const bool unroll = rparams.isUnrolled(); + const bool vectorize = + rparams.vectorize_inner_reduction || rparams.vectorize_iter_dom; + const bool is_outer_grid_persistence = rparams.persistent_kernel && + rparams.cross_grid_inner_reduction && !rparams.fastest_dim; + + // Propagate inner reduction. There is a cutoff at boundaryNodesSet, so this + // propagation will not propagate to the final outer reduction. + reduction_scheduler_utils::propagateTransformation( + inner_reference_tv, boundaryNodesSet); + reduction_scheduler_utils::propagateRFactor( + inner_reference_tv, inner_reduction_tvs[0], inner_reduction_tvs); + // For parallelization, we need to explicitly skip tvs in boundaryNodesSet, + // otherwise the parallelization will be propagated to the first domain of tvs + // in boundaryNodesSet, because that domain was calculated from the partial + // reduction and these tensors are not scheduled yet. + reduction_scheduler_utils::propagateParallelization( + fusion, + inner_reduction_tvs[0], + inner_reference_tv, + unroll, + vectorize, + is_outer_grid_persistence, + inner_reduction_tvs, + cached_inputs, + cached_outputs, + boundaryNodesSet); + // Propagate outer reduction. Each outer reduction is connected with its + // cached_gmem and output, since we added all the cached_gmem to the + // boundaryNodesSet, the transformation from one outer reduction can't + // propagate to other outer reductions due to the cutoff at boundaryNodesSet. + // Thus, we need a loop to initiate the propagation from each outer reduction. + // Parallelization will not propagate to inner reductions as they are + // transformed differently. 
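// Concretely (tensor names illustrative): with two outer reductions T_w and
// T_b, their partial-result reload tvs are in boundaryNodesSet, so the i = 0
// pass of the loop below only transforms and parallelizes T_w's
// reload/output branch and the i = 1 pass only reaches T_b's; the two outer
// reductions are therefore scheduled independently.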
+ for (long unsigned int i = 0; i < outer_reference_tvs.size(); i++) { + reduction_scheduler_utils::propagateTransformation( + outer_reference_tvs[i], boundaryNodesSet); + reduction_scheduler_utils::propagateParallelization( + fusion, + outer_reduction_tvs[i], + outer_reference_tvs[i], + unroll, + vectorize, + is_outer_grid_persistence, + outer_reduction_tvs, + cached_inputs, + cached_outputs); + } + + // special vectorization of temp gmem, vectorization_factor_tmp_gmem_write is + // guaranteed to be smaller or equal to input vectorization factor. + if (rparams.vectorization_factor_tmp_gmem_write > 1) { + for (auto tv : cached_gmem) { + TORCH_INTERNAL_ASSERT( + rparams.vectorization_factor_tmp_gmem_write <= + rparams.unroll_factor_inner_reduction, + "vectorization factor of temp gmem write should be smaller than that of inner reduction.") + if (rparams.vectorization_factor_tmp_gmem_write < + rparams.unroll_factor_inner_reduction) { + tv->split(-1, rparams.vectorization_factor_tmp_gmem_write); + } + tv->axis(-1)->parallelize(ParallelType::Vectorize); + } + } + // vectorization propagate through propagateParallelization only works for + // input and output tensors. propagate vectorization to cached_gmem_reload + // directly from output tv using parallelizeAllLike. must propagate seperaely + // for different tvs as outer reductions are transformed seperately. + if (rparams.vectorization_factor_outer > 1) { + for (auto tv : cached_gmem_reload) { + auto output_tvs = ir_utils::outputTvsOf(tv); + TORCH_INTERNAL_ASSERT( + !output_tvs.empty(), + "cached_gmem_reload should have at least one output tensor.") + scheduler_utils::parallelizeAllLike( + output_tvs[0], + -1, + {cached_gmem_reload.begin(), cached_gmem_reload.end()}, + {ParallelType::Vectorize}); + } + } + + // Remove dummy outputs as they can inadvertently affect CA positions + for (auto output : dummy_outputs) { + fusion->removeOutput(output); + } + inlineMost(); +} } // namespace nvfuser diff --git a/csrc/scheduler/normalization.h b/csrc/scheduler/normalization.h index 76491548c31..7e80e4502fc 100644 --- a/csrc/scheduler/normalization.h +++ b/csrc/scheduler/normalization.h @@ -36,4 +36,7 @@ TORCH_CUDA_CU_API void schedulePersistentKernel( Fusion* fusion, const ReductionParams& rparams); +TORCH_CUDA_CU_API void schedulePersistentKernelInnerOuter( + Fusion* fusion, + const ReductionParams& rparams); } // namespace nvfuser diff --git a/csrc/scheduler/normalization_utils.cpp b/csrc/scheduler/normalization_utils.cpp index 389ca6de378..0305998fe35 100644 --- a/csrc/scheduler/normalization_utils.cpp +++ b/csrc/scheduler/normalization_utils.cpp @@ -5,12 +5,14 @@ * SPDX-License-Identifier: BSD-3-Clause */ // clang-format on +#include #include #include +#include #include #include - +#include namespace nvfuser { namespace normalization_scheduler_utils { @@ -501,5 +503,163 @@ std::optional getGridOuterNormalizationParams( return std::nullopt; } +bool checkIfReductionsAreInnerOuter( + const std::vector& inner_reduction_tvs, + const std::vector& outer_reduction_tvs) { + bool pass_combined_heck = true; + // inner reduction must be [I,I,...R,R] + auto innerReductionCheck = [](TensorView* tv) { + int ndim = static_cast(tv->nDims()); + int lastIter = -1; + while (lastIter < ndim - 1 && tv->axis(lastIter + 1)->isIteration()) { + lastIter++; + } + int firstRedu = ndim; + while (firstRedu > 0 && tv->axis(firstRedu - 1)->isReduction()) { + firstRedu--; + } + return lastIter >= 0 && firstRedu < ndim && lastIter == firstRedu - 1; + }; + // outer reduction must 
be [R,R,..I,I] + auto outerReductionCheck = [](TensorView* tv) { + int ndim = static_cast(tv->nDims()); + int lastRedu = -1; + while (lastRedu < ndim - 1 && tv->axis(lastRedu + 1)->isReduction()) { + lastRedu++; + } + int firstIter = ndim; + while (firstIter > 0 && tv->axis(firstIter - 1)->isIteration()) { + firstIter--; + } + return lastRedu >= 0 && firstIter < ndim && lastRedu == firstIter - 1; + }; + for (auto itv : inner_reduction_tvs) { + if (!innerReductionCheck(itv)) { + pass_combined_heck = false; + break; + } + } + for (auto otv : outer_reduction_tvs) { + if (!outerReductionCheck(otv)) { + pass_combined_heck = false; + break; + } + } + return pass_combined_heck; +} + +bool hasSharedInput( + const std::vector& inner_reduction_tvs, + const std::vector& outer_reduction_tvs) { + bool has_shared_input = false; + std::unordered_set input_inner_reduction_tvs; + for (auto tv : inner_reduction_tvs) { + for (auto input_tv : ir_utils::inputTvsOf(tv)) { + input_inner_reduction_tvs.emplace(input_tv); + } + } + for (auto tv : outer_reduction_tvs) { + for (auto input_tv : ir_utils::inputTvsOf(tv)) { + if (input_inner_reduction_tvs.find(input_tv) != + input_inner_reduction_tvs.end()) { + has_shared_input = true; + break; + } + } + if (has_shared_input) { + break; + } + } + return has_shared_input; +} + +std::unordered_set getAllTvsFrom( + const std::vector& from_tvs, + const std::unordered_set& cutoff_tv_set) { + std::unordered_set tv_group; + std::queue tensors_to_visit; + auto addIfNotVisited = [&](TensorView* tv) { + if (tv_group.find(tv) == tv_group.end() && + cutoff_tv_set.find(tv) == cutoff_tv_set.end()) { + tv_group.emplace(tv); + tensors_to_visit.push(tv); + } + }; + + for (auto tv : from_tvs) { + tensors_to_visit.push(tv); + } + while (!tensors_to_visit.empty()) { + auto next_tv = tensors_to_visit.front(); + tensors_to_visit.pop(); + // visit consumers + for (auto tv : ir_utils::consumerTvsOf(next_tv)) { + addIfNotVisited(tv); + } + // visit siblings + for (auto tv : ir_utils::siblingTvsOf(next_tv)) { + addIfNotVisited(tv); + } + // visit producer + for (auto tv : ir_utils::producerTvsOf(next_tv)) { + addIfNotVisited(tv); + } + } + return tv_group; +} + +bool isConnectedOnlyThroughReductionProducer( + const std::vector& inner_reduction_tvs, + const std::vector& outer_reduction_tvs) { + const std::unordered_set outer_tv_set{ + outer_reduction_tvs.begin(), outer_reduction_tvs.end()}; + // initialize disjoint sets with tvs connected to inner reduction tvs + std::unordered_set disjoint_tvs = + getAllTvsFrom(inner_reduction_tvs, outer_tv_set); + // get disjoint sets with tvs connected to outer reduction tvs + // check if there is any intersection + for (auto otv : outer_reduction_tvs) { + const auto& producers = ir_utils::producerTvsOf(otv); + // cutoff at producers of outer reduction tvs as they are computed with + // inner reducitons + const auto& connected_tv_set = + getAllTvsFrom({otv}, {producers.begin(), producers.end()}); + for (auto tv : connected_tv_set) { + if (!disjoint_tvs.emplace(tv).second) { + return false; + } + } + } + return true; +} + +int64_t partialReductionBufferSize( + const std::vector& outer_reduction_tvs, + SchedulerRuntimeInfo& runtime_info) { + int64_t partial_reduction_buffer_size = 0; + for (auto buffer : outer_reduction_tvs) { + int64_t buffer_size = -1; + for (auto id : buffer->getMaybeRFactorDomain()) { + if (id->isReduction() || id->isBroadcast()) { + continue; + } + auto id_size = runtime_info.expressionEvaluator().evaluate(id->extent()); + 
TORCH_INTERNAL_ASSERT( + id_size.has_value(), "Could not infer persistent buffer size."); + if (buffer_size == -1) { + buffer_size = id_size->as(); + } else { + buffer_size *= id_size->as(); + } + } + buffer_size = buffer_size == -1 ? 0 + : buffer_size * + dataTypeSize(buffer->getDataType().value(), + runtime_info.getIndexType()); + partial_reduction_buffer_size += buffer_size; + } + return partial_reduction_buffer_size; +} + } // namespace normalization_scheduler_utils } // namespace nvfuser diff --git a/csrc/scheduler/normalization_utils.h b/csrc/scheduler/normalization_utils.h index 65eb0d5d9a6..c900570c8b1 100644 --- a/csrc/scheduler/normalization_utils.h +++ b/csrc/scheduler/normalization_utils.h @@ -8,13 +8,14 @@ #pragma once #include - +#include #include #include #include #include namespace nvfuser { +class SchedulerRuntimeInfo; namespace normalization_scheduler_utils { //! Utility class to iterate candidates of launch configurations in a @@ -152,5 +153,32 @@ std::optional getGridOuterNormalizationParams( int64_t vectorize_factor, int64_t persistent_buffer_size); +//! check iter type of each domain in inner and outer reduction tvs +//! inner reduction must be [I,I,...R,R] +//! outer reduction must be [R,R,...I,I] +bool checkIfReductionsAreInnerOuter( + const std::vector& inner_reduction_tvs, + const std::vector& outer_reduction_tvs); + +//! check if the inner reduction has shared input with outer reduction +bool hasSharedInput( + const std::vector& inner_reduction_tvs, + const std::vector& outer_reduction_tvs); + +//! The first part of outer reduction is computed with inner reduction and the +//! second part is scheduled separately. So, (1) the outer reduction tvs can +//! only be connected with inner reduction tvs through their producers. (2) +//! Outer reduction tvs are also scheduled separately and they can only be +//! connected through their producers. +bool isConnectedOnlyThroughReductionProducer( + const std::vector& inner_reduction_tvs, + const std::vector& outer_reduction_tvs); + +//! in combined_inner_outer_reduction, the partial results of outer reductions +//! must be persistent, calculate the size of these buffers when estimate +//! 
register usage +int64_t partialReductionBufferSize( + const std::vector& outer_reduction_tvs, + SchedulerRuntimeInfo& runtime_info); } // namespace normalization_scheduler_utils } // namespace nvfuser diff --git a/csrc/scheduler/reduction.cpp b/csrc/scheduler/reduction.cpp index 336545cd3aa..6986b2f7628 100644 --- a/csrc/scheduler/reduction.cpp +++ b/csrc/scheduler/reduction.cpp @@ -1039,11 +1039,20 @@ void scheduleReduction(Fusion* fusion, const ReductionParams& rparams) { TORCH_INTERNAL_ASSERT( reference_tv != nullptr && reduction_tv != nullptr, "Need these two tensor views to finish the scheduling."); + const bool vectorize = + rparams.vectorize_inner_reduction || rparams.vectorize_iter_dom; + const bool is_outer_grid_persistence = rparams.persistent_kernel && + rparams.cross_grid_inner_reduction && !rparams.fastest_dim; + TORCH_INTERNAL_ASSERT( + !is_outer_grid_persistence, + "is_outer_grid_persistence should be false in scheduleReduction."); reduction_scheduler_utils::multiReductionInliner( fusion, - rparams, reduction_tv, reference_tv, + unroll, + vectorize, + is_outer_grid_persistence, reduction_tvs, cached_inputs, cached_outputs); diff --git a/csrc/scheduler/reduction_heuristic.h b/csrc/scheduler/reduction_heuristic.h index 247c0a1276f..f8dcdd8372b 100644 --- a/csrc/scheduler/reduction_heuristic.h +++ b/csrc/scheduler/reduction_heuristic.h @@ -13,7 +13,7 @@ namespace nvfuser { -// Parameters of the reduction heuristic to describe the optimial schedule. +// Parameters of the reduction heuristic to describe the optimal schedule. // Warning: equal operator is intended for use in caching the kernel associated // with these reduction parameters. It does not check if the launch parameters // are equivelent! @@ -117,6 +117,22 @@ class ReductionParams : public HeuristicParams { unroll_factor_outer_reduction > 1; } + // specific to combined inner and outer reduction + bool combined_inner_outer = false; + // use TIDx for out reduction axis + bool tidx_for_outer_reduction = false; + // pad outer reduction to warp + bool pad_outer_reduction_to_warp = false; + // partial result of outer reduction is written to gmem then read back in a + // different parallel pattern set the vectorization factor of its read and + // write + int64_t vectorization_factor_outer = 1; + int64_t vectorization_factor_tmp_gmem_write = 1; + // inner reduction axis is parallelized by block_dim_inner_reduction (usually + // TIDx) the remaining part is further parallelized by + // block_dim_inner_reduction_extra (usually TIDy) + ParallelType block_dim_inner_reduction_extra = ParallelType::Serial; + public: using HeuristicParams::HeuristicParams; @@ -155,7 +171,13 @@ class ReductionParams : public HeuristicParams { other.batches_per_block_outer_reduction == batches_per_block_outer_reduction && other.compute_persistent_buffer_with_first_consumer == - compute_persistent_buffer_with_first_consumer; + compute_persistent_buffer_with_first_consumer && + other.combined_inner_outer == combined_inner_outer && + other.tidx_for_outer_reduction == tidx_for_outer_reduction && + other.pad_outer_reduction_to_warp == pad_outer_reduction_to_warp && + other.vectorization_factor_outer == vectorization_factor_outer && + other.vectorization_factor_tmp_gmem_write == + vectorization_factor_tmp_gmem_write; if (other.static_bdimy || static_bdimy) { attr_equal = attr_equal && other.lparams.bdimy() == lparams.bdimy(); diff --git a/csrc/scheduler/reduction_utils.cpp b/csrc/scheduler/reduction_utils.cpp index 869b4c35fe1..1cc1628c4b6 100644 --- 
a/csrc/scheduler/reduction_utils.cpp +++ b/csrc/scheduler/reduction_utils.cpp @@ -132,6 +132,9 @@ TensorView* scheduleReductionTV( if (rparams.vectorize_inner_reduction) { vectorize(inner_reduce_axis, rparams.unroll_factor_inner_reduction); } + if (rparams.combined_inner_outer && !rparams.multiple_reds_per_blk) { + inner_parallel(inner_reduce_axis, rparams.block_dim_inner_reduction); + } auto outer_i = inner_reduce_axis; if (rparams.cross_grid_inner_reduction) { outer_parallel(outer_i++, rparams.grid_dim_inner_reduction); @@ -147,7 +150,13 @@ TensorView* scheduleReductionTV( outer_unroll(outer_i++, rparams.unroll_factor_inner_reduction); } - reduction_tv->axis(outer_i)->parallelize(rparams.block_dim_inner_reduction); + if (rparams.combined_inner_outer && !rparams.multiple_reds_per_blk) { + reduction_tv->axis(outer_i)->parallelize( + rparams.block_dim_inner_reduction_extra); + } else { + reduction_tv->axis(outer_i)->parallelize( + rparams.block_dim_inner_reduction); + } if (rparams.pad_inner_reduction_to_warp) { reduction_tv->axis(outer_i)->padToMultipleOfWarp(); @@ -282,8 +291,6 @@ TensorView* scheduleReductionTV( return reduction_rf_tv; } -namespace { - // Input: a vector of axes in the given tensor ignoring broadcasts. For example, // if you have a tensor T1[b, rS1, rS2, rS3], and you want to specify // axis rS2 and rS3, then your `non_broadcast_axes` should be {1, 2}. @@ -345,67 +352,106 @@ bool isGridAllreduce(TensorView* reduction_tv) { return false; } -} // namespace - void multiReductionInliner( Fusion* fusion, - const ReductionParams& rparams, TensorView* reduction_tv, TensorView* reference_tv, + const bool unroll, + const bool vectorize, + const bool is_outer_grid_persistence, std::vector reduction_tvs, std::vector cached_inputs, std::vector> cached_outputs, std::vector dummy_outputs) { - const bool is_outer_grid_persistence = rparams.persistent_kernel && - rparams.cross_grid_inner_reduction && !rparams.fastest_dim; - // Propagate transformations before we rfactor the other reductions - TransformPropagator propagator(reference_tv); - MaxRootDomainInfoSpanningTree(reference_tv).traverse(&propagator); - + propagateTransformation(reference_tv); // If reduction_tv is rfactored, rfactor all reductions. if (reference_tv != reduction_tv) { - // Apply rfactor to all reductions if applicable - // We use axes ignoring broadcasts because in checkPatternEquivalence, - // broadcast is ignored, we might end up having multiple reductions with - // pattern equivalence but have different number of broadcasts, so the - // position in the reference tensor is not necessary the same as the - // position in other reduction TVs. 
- std::unordered_set non_broadcast_rfactor_axes; - int non_broadcast_pos = 0; - for (const auto i : c10::irange(reference_tv->nDims())) { - if (reference_tv->axis((int)i)->isBroadcast()) { - continue; - } - if (reference_tv->axis((int)i)->isReduction() && - reference_tv->axis((int)i)->isRFactorProduct()) { - non_broadcast_rfactor_axes.insert(non_broadcast_pos); - } - non_broadcast_pos++; - } + propagateRFactor(reference_tv, reduction_tv, reduction_tvs); + } - for (auto reduction_tv_ : reduction_tvs) { - if (reduction_tv_ == reduction_tv || - reduction_tv_->definition()->isA()) { - // This should come in already rfactored - continue; - } else { - ir_utils::rfactorHelper( - reduction_tv_, - addBackBroadcasts(reduction_tv_, non_broadcast_rfactor_axes)); - } - } + reduction_scheduler_utils::propagateParallelization( + fusion, + reduction_tv, + reference_tv, + unroll, + vectorize, + is_outer_grid_persistence, + reduction_tvs, + cached_inputs, + cached_outputs); + + // Remove dummy outputs as they can inadvertently affect CA positions + for (auto output : dummy_outputs) { + fusion->removeOutput(output); } - bool unroll = rparams.isUnrolled(); + // Inline the schedule + inlineMost(); +} - bool vectorize = - rparams.vectorize_inner_reduction || rparams.vectorize_iter_dom; +void propagateTransformation( + TensorView* reference_tv, + const std::unordered_set& boundaryNodesSet) { + InternalBoundarySelector ibSelector(boundaryNodesSet); + TransformPropagator propagator(reference_tv); + MaxRootDomainInfoSpanningTree(reference_tv, &ibSelector) + .traverse(&propagator); +} +void propagateRFactor( + TensorView* reference_tv, + TensorView* reduction_tv, + const std::vector& reduction_tvs) { + // We use axes ignoring broadcasts because in checkPatternEquivalence, + // broadcast is ignored, we might end up having multiple reductions with + // pattern equivalence but have different number of broadcasts, so the + // position in the reference tensor is not necessary the same as the + // position in other reduction TVs. 
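// Worked example (domains illustrative): if the reference is [iS0, rS1(rf)]
// and another reduction tv is [iS2, bS3, rS4], the rfactored reduction axis
// is axis 1 in the reference but axis 2 in the other tv; counting positions
// with broadcasts skipped gives {1} for both, and addBackBroadcasts() maps
// that back to the correct real axis per tv.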
+ std::unordered_set non_broadcast_rfactor_axes_ir; + int non_broadcast_pos_ir = 0; + for (const auto i : c10::irange(reference_tv->nDims())) { + if (reference_tv->axis((int)i)->isBroadcast()) { + continue; + } + if (reference_tv->axis((int)i)->isReduction() && + reference_tv->axis((int)i)->isRFactorProduct()) { + non_broadcast_rfactor_axes_ir.insert(non_broadcast_pos_ir); + } + non_broadcast_pos_ir++; + } + + for (auto reduction_tv_ : reduction_tvs) { + if (reduction_tv_ == reduction_tv || + reduction_tv_->definition()->isA()) { + // This should come in already rfactored + continue; + } else { + ir_utils::rfactorHelper( + reduction_tv_, + reduction_scheduler_utils::addBackBroadcasts( + reduction_tv_, non_broadcast_rfactor_axes_ir)); + } + } +} + +void propagateParallelization( + Fusion* fusion, + TensorView* reduction_tv, + TensorView* reference_tv, + const bool unroll, + const bool vectorize, + const bool is_outer_grid_persistence, + const std::vector& reduction_tvs, + const std::vector& cached_inputs, + const std::vector>& cached_outputs, + const std::unordered_set& unselected_tvs) { // Propagate parallelization except vectorization and unrolling + auto selected_tvs = + ir_utils::allTvsExcept(reference_tv->fusion(), unselected_tvs); scheduler_utils::parallelizeAllLike( reference_tv, - {}, + selected_tvs, allParallelTypesExcept( {ParallelType::Unroll, ParallelType::Vectorize, @@ -467,6 +513,7 @@ void multiReductionInliner( reference_tv, reduction_tv}; // If reference shouldn't be unrolled, clear that parallel type. // In the case of outer grid persistence, replace Vector with Group + for (auto tv : rfactor_and_reduction_tvs) { if (are_unrolled.count(tv) == 0) { for (const auto i : c10::irange(tv->nDims())) { @@ -499,20 +546,15 @@ void multiReductionInliner( reduction_tvs.begin(), reduction_tvs.end(), std::back_inserter(allreduce_tvs), - [&](auto tv) { return reduction_tv != tv && isGridAllreduce(tv); }); + [&](auto tv) { + return reduction_tv != tv && + reduction_scheduler_utils::isGridAllreduce(tv); + }); if (!allreduce_tvs.empty()) { scheduler_utils::parallelizeAllLike( reduction_tv, -1, allreduce_tvs, {ParallelType::Group}); } } - - // Remove dummy outputs as they can inadvertently affect CA positions - for (auto output : dummy_outputs) { - fusion->removeOutput(output); - } - - // Inline the schedule - inlineMost(); } namespace { diff --git a/csrc/scheduler/reduction_utils.h b/csrc/scheduler/reduction_utils.h index 39371b7f719..b290e0d7fbb 100644 --- a/csrc/scheduler/reduction_utils.h +++ b/csrc/scheduler/reduction_utils.h @@ -28,14 +28,48 @@ TensorView* scheduleReductionTV( // Inlining function intended for single or multi reduction fusions. 
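// After this refactor, multiReductionInliner (below) and the combined
// inner-outer scheduler both compose the propagate* helpers declared later
// in this header; a simplified sketch of the call order:
//   propagateTransformation(reference_tv /*, optional boundary set */);
//   propagateRFactor(reference_tv, reduction_tv, reduction_tvs);
//   propagateParallelization(fusion, reduction_tv, reference_tv, unroll,
//       vectorize, is_outer_grid_persistence, reduction_tvs, cached_inputs,
//       cached_outputs /*, optional unselected tvs */);
//   inlineMost();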
TORCH_CUDA_CU_API void multiReductionInliner( Fusion* fusion, - const ReductionParams& rparams, TensorView* reduction_tv, TensorView* reference_tv, + const bool unroll, + const bool vectorize, + const bool is_outer_grid_persistence, std::vector reduction_tvs, std::vector cached_inputs, std::vector> cached_outputs, std::vector dummy_outputs = {}); +// Propagate transformations with internal cutoff boundary at boundaryNodesSet +// in P2C forward propagate, disable propagation to TensorView in +// boundaryNodesSet in C2P backward propagate, disable propagation from +// TensorView in boundaryNodesSet +TORCH_CUDA_CU_API void propagateTransformation( + TensorView* reference_tv, + const std::unordered_set& boundaryNodesSet = + std::unordered_set()); + +// Propagate RFactor from first reduction TensorView to others +TORCH_CUDA_CU_API void propagateRFactor( + TensorView* reference_tv, + TensorView* reduction_tv, + const std::vector& reduction_tvs); + +// Propagate Parallelization from reference TensorView to other TensorViews +// Parallel types Unroll, Vectorize, and MisalignedVectorize are explicitly +// handled for tensorviews in cached_inputs and cached_outputs. +// If reduction_tv and reference_tv shouldn't be unrolled, clear that parallel +// type. unroll and vectorize are members of ReductionParams +TORCH_CUDA_CU_API void propagateParallelization( + Fusion* fusion, + TensorView* reduction_tv, + TensorView* reference_tv, + const bool unroll, + const bool vectorize, + const bool is_outer_grid_persistence, + const std::vector& reduction_tvs, + const std::vector& cached_inputs, + const std::vector>& cached_outputs, + const std::unordered_set& unselected_tvs = {}); + // Sort and rfactor the reference tv in a consistent way for reduction inliner. // Order of the sort is: // diff --git a/csrc/scheduler/registry.cpp b/csrc/scheduler/registry.cpp index a5efb4680f0..f4864317cab 100644 --- a/csrc/scheduler/registry.cpp +++ b/csrc/scheduler/registry.cpp @@ -1796,6 +1796,28 @@ class PersistentKernelScheduler : public SchedulerEntry { return false; } + std::vector inner_reduction_tvs; + std::vector outer_reduction_tvs; + for (auto tv : reduction_tvs) { + if (scheduler_utils::isFastestDimReduction(tv)) { + inner_reduction_tvs.emplace_back(tv); + } else { + outer_reduction_tvs.emplace_back(tv); + } + } + bool combined_inner_outer = + !inner_reduction_tvs.empty() && !outer_reduction_tvs.empty(); + if (combined_inner_outer && + !checkReductionPattern( + fusion, inner_reduction_tvs, outer_reduction_tvs)) { + return false; + } + // If there is both inner and outer reduction, we use the first inner + // reduction tv as reference, otherwise we use the first reduction tv, + // whether it is inner or outer. + TensorView* reference_tv = + combined_inner_outer ? inner_reduction_tvs[0] : reduction_tvs[0]; + if (!ir_utils::getViewOps(fusion).empty()) { ComputeAtMap ca_map(fusion); if (requiresForwardViewReplay(fusion, ca_map)) { @@ -1805,9 +1827,9 @@ class PersistentKernelScheduler : public SchedulerEntry { return false; } - // Persistent scheduler simply uses reduction_tvs[0] as the reference, if + // Persistent scheduler simply uses reference_tv as the reference, if // that changes, this needs to be changed. 
- if (reductionInterferingView(fusion, ca_map, reduction_tvs[0])) { + if (reductionInterferingView(fusion, ca_map, reference_tv)) { scheduler_debug_utils::canScheduleRejectReason( ScheduleHeuristic::Persistent, "View may interfere with normalization scheduling."); @@ -1844,25 +1866,6 @@ class PersistentKernelScheduler : public SchedulerEntry { } } - // Use root domain map to check the reduction ops have the same axes - FusionGuard fg(fusion); - ComputeAtRootDomainMap root_map; - root_map.build(true); - - // red_ops.size()>1 checked before - for (const auto it : c10::irange(1, reduction_tvs.size())) { - if (!checkPatternEquivalence( - reduction_tvs[it - 1], reduction_tvs[it], root_map)) { - scheduler_debug_utils::canScheduleRejectReason( - ScheduleHeuristic::Persistent, - "unmapped reduction ", - reduction_tvs[it - 1], - " and ", - reduction_tvs[it]); - return false; - } - } - // Only accept persistent kernels auto persistent_buffer_info = scheduler_utils::persistentBuffers(fusion); if (persistent_buffer_info.persistent_buffers.empty()) { @@ -1886,7 +1889,6 @@ class PersistentKernelScheduler : public SchedulerEntry { SchedulerRuntimeInfo& runtime_info, HeuristicSummary* data_cache = nullptr) { FUSER_PERF_SCOPE("PersistentKernelScheduler::canSchedule"); - auto reduction_tv_entry = HeuristicSummaryEntry( data_cache, [&fusion]() { @@ -1895,41 +1897,49 @@ class PersistentKernelScheduler : public SchedulerEntry { }); auto& reduction_tvs = reduction_tv_entry.get(); + bool inner_reduction = false; + bool outer_reduction = false; + TensorView* first_inner_reduction_tv; + for (auto tv : reduction_tvs) { + if (scheduler_utils::isFastestDimReduction(tv)) { + first_inner_reduction_tv = tv; + inner_reduction = true; + } else { + outer_reduction = true; + } + } + if (inner_reduction && outer_reduction) { + if (!checkCombinedReductionShape(runtime_info, reduction_tvs)) { + scheduler_debug_utils::canScheduleRejectReason( + ScheduleHeuristic::Persistent, + "Inner dim of combined reduction should be a multiplication of a quarter warp and max vectorization factor!"); + return false; + } + } + // If there is both inner and outer reduction, we use the first inner + // reduction tv to get properties, otherwise we use the first reduction tv, + // whether it is inner or outer. + auto reference_tv = inner_reduction && outer_reduction + ? first_inner_reduction_tv + : reduction_tvs[0]; + auto properties = - scheduler_utils::getProperties(fusion, runtime_info, reduction_tvs[0]); + scheduler_utils::getProperties(fusion, runtime_info, reference_tv); if (!properties.fastest_dim_reduction) { return canScheduleRunTimeOuter( fusion, runtime_info, data_cache, reduction_tvs, properties); } - auto persistent_buffer_info_entry = - HeuristicSummaryEntry( - data_cache, [&fusion]() { - return std::make_unique( - scheduler_utils::persistentBuffers(fusion)); - }); - - auto& persistent_buffer_info = persistent_buffer_info_entry.get(); - - auto persistent_buffer_size_info = scheduler_utils::persistentBufferSize( - fusion, runtime_info, persistent_buffer_info, data_cache); - - // Note that projected buffer size can be zero - auto persistent_buffer_size = - persistent_buffer_size_info.projected_persistent_buffer_size == 0 - ? 
persistent_buffer_size_info.persistent_buffer_size - : std::min( - persistent_buffer_size_info.persistent_buffer_size, - persistent_buffer_size_info.projected_persistent_buffer_size); + // pair of persistent_buffer_size and available_persistent_buffer_size + const std::pair buffer_size = getPersistentBufferSize( + fusion, runtime_info, data_cache, reduction_tvs); + const int64_t persistent_buffer_size = buffer_size.first; + const int64_t available_persistent_buffer_size = buffer_size.second; const int64_t device_multiprocessor_count = (int64_t)at::cuda::getCurrentDeviceProperties()->multiProcessorCount; - // TODO: Enable grid persistence - const auto available_persistent_buffer_size = - scheduler_utils::register_file_size; - if (persistent_buffer_size > available_persistent_buffer_size) { scheduler_debug_utils::canScheduleRejectReason( ScheduleHeuristic::Persistent, "not enough registers for persistence"); @@ -2000,6 +2010,153 @@ class PersistentKernelScheduler : public SchedulerEntry { TORCH_INTERNAL_ASSERT(params_ != nullptr); } + static bool checkReductionPattern( + Fusion* fusion, + const std::vector& inner_reduction_tvs, + const std::vector& outer_reduction_tvs) { + // Use root domain map to check the reduction ops have the same axes + FusionGuard fg(fusion); + ComputeAtRootDomainMap root_map; + root_map.build(true); + + // check inner and outer reductions separately + for (const auto& rtvs : {inner_reduction_tvs, outer_reduction_tvs}) { + for (const auto it : c10::irange(1, rtvs.size())) { + if (!checkPatternEquivalence(rtvs[it - 1], rtvs[it], root_map)) { + scheduler_debug_utils::canScheduleRejectReason( + ScheduleHeuristic::Persistent, + "unmapped reduction ", + rtvs[it - 1], + " and ", + rtvs[it]); + return false; + } + } + } + // combined inner and outer reduction is general purpose but has only been + // tested for layer norm backward + if (!inner_reduction_tvs.empty() && !outer_reduction_tvs.empty()) { + if (!normalization_scheduler_utils::checkIfReductionsAreInnerOuter( + inner_reduction_tvs, outer_reduction_tvs)) { + scheduler_debug_utils::canScheduleRejectReason( + ScheduleHeuristic::Persistent, + "to use combined reduction, inner reduction tensor should be [I,I,...,R,R] and outer reduction tensor should be [R,R,...,I,I]"); + return false; + } + + if (!normalization_scheduler_utils::hasSharedInput( + inner_reduction_tvs, outer_reduction_tvs)) { + scheduler_debug_utils::canScheduleRejectReason( + ScheduleHeuristic::Persistent, + "to use combined reduction, inner reduction and outer reduction should have shared input."); + return false; + } + + if (!normalization_scheduler_utils:: + isConnectedOnlyThroughReductionProducer( + inner_reduction_tvs, outer_reduction_tvs)) { + scheduler_debug_utils::canScheduleRejectReason( + ScheduleHeuristic::Persistent, + "to use combined reduction, inner reduction and outer reduction should not have shared consumer, their consumers should not have shared non-outer-reduction producer."); + return false; + } + } + return true; + } + + static bool checkCombinedReductionShape( + SchedulerRuntimeInfo& runtime_info, + const std::vector& reduction_tvs) { + // In combined_inner_outer_reduction, the inner dim should be a + // multiple of a quarter warp times the vectorization factor. Otherwise, + // the segregated version will be used. Since the inner reduction dim is split by + // bdimx, this ensures the largest possible bdimx can be at least a + // quarter warp.
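+ // (A worked illustration, assuming warpSize = 32: the quarter warp is 8 + // threads; for fp32 the vectorization factor is 16 / 4 = 4, so the inner + // dim must be a multiple of 32, while for fp16 it is 16 / 2 = 8, i.e. a + // multiple of 64.)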
So we have enough bdimx threads to cover the iteration + // domain of the outer reductions to avoid low performance. + const int64_t quarter_warp = + at::cuda::getCurrentDeviceProperties()->warpSize / 4; + for (auto tv : reduction_tvs) { + int64_t n_elements = 1; + const int64_t vectorization_factor = 16 / + dataTypeSize(tv->getDataType().value(), runtime_info.getIndexType()); + const int64_t n_elements_factor = quarter_warp * vectorization_factor; + const bool is_inner_reduction = + scheduler_utils::isFastestDimReduction(tv); + for (auto id : tv->getMaybeRFactorDomain()) { + // check reduction domain for inner reduction and iteration domain for + // outer reduction + if (id->isReduction() == is_inner_reduction) { + auto id_size = + runtime_info.expressionEvaluator().evaluate(id->extent()); + TORCH_INTERNAL_ASSERT( + id_size.has_value(), "Could not infer reduction dim size."); + n_elements *= id_size->as(); + } + } + if (n_elements % n_elements_factor) { + return false; + } + } + return true; + } + + static std::pair getPersistentBufferSize( + Fusion* fusion, + SchedulerRuntimeInfo& runtime_info, + HeuristicSummary* data_cache, + const std::vector& reduction_tvs) { + auto persistent_buffer_info_entry = + HeuristicSummaryEntry( + data_cache, [&fusion]() { + return std::make_unique( + scheduler_utils::persistentBuffers(fusion)); + }); + + auto& persistent_buffer_info = persistent_buffer_info_entry.get(); + + auto persistent_buffer_size_info = scheduler_utils::persistentBufferSize( + fusion, runtime_info, persistent_buffer_info, data_cache); + + // Note that projected buffer size can be zero + auto persistent_buffer_size = + persistent_buffer_size_info.projected_persistent_buffer_size == 0 + ? persistent_buffer_size_info.persistent_buffer_size + : std::min( + persistent_buffer_size_info.persistent_buffer_size, + persistent_buffer_size_info.projected_persistent_buffer_size); + + // in combined_inner_outer_reduction, the partial results of outer + // reductions must be persistent, allow register spill avoid segmentation + int64_t inner_reduction_count = 0; + int64_t outer_reduction_count = 0; + std::vector outer_reduction_tvs; + for (auto tv : reduction_tvs) { + if (scheduler_utils::isFastestDimReduction(tv)) { + inner_reduction_count++; + } else { + outer_reduction_count++; + outer_reduction_tvs.emplace_back(tv); + } + } + const bool combined_inner_outer_reduction = + inner_reduction_count && outer_reduction_count; + if (combined_inner_outer_reduction) { + persistent_buffer_size += + normalization_scheduler_utils::partialReductionBufferSize( + outer_reduction_tvs, runtime_info); + } + // At this point, we use the full register file size only for the + // inner-outer case. It does not mean the full size shouldn't be used + // otherwise, but more detailed tuning of the heuristics would be required. + const int64_t available_persistent_buffer_size = + combined_inner_outer_reduction + ? scheduler_utils::register_file_size_full + : scheduler_utils::register_file_size; + + return std::make_pair( + persistent_buffer_size, available_persistent_buffer_size); + } + static bool canScheduleRunTimeOuter( Fusion* fusion, SchedulerRuntimeInfo& runtime_info, diff --git a/csrc/scheduler/utils.h b/csrc/scheduler/utils.h index a5c926ec1f9..e965dcf8765 100644 --- a/csrc/scheduler/utils.h +++ b/csrc/scheduler/utils.h @@ -28,7 +28,9 @@ namespace scheduler_utils { // with a compile time coonstant index. Unfortunately nvcc seems to be using // many registers for indexing. 
This is a bad estimation of extra register use, // but it's hard to get a better one. -constexpr int64_t register_file_size = 256 * 1024 / 2; +constexpr int64_t register_file_size_full = 256 * 1024; +constexpr int64_t register_file_size = register_file_size_full / 2; + constexpr int64_t x_grid_limit = ((int64_t)1 << (int64_t)31) - (int64_t)1; constexpr int64_t y_grid_limit = 65535; constexpr int64_t z_grid_limit = 65535; @@ -46,6 +48,29 @@ constexpr int64_t lastPow2(int64_t n) { return std::max((int64_t)1, n - (n >> 1)); } +// round up to a multiple of 8 or the next pow2, whichever is smaller +constexpr int64_t roundUpPow2Or8(const int64_t x) { + auto round_up_pow2 = lastPow2(x); + if (round_up_pow2 < x) { + round_up_pow2 *= 2; + } + constexpr int64_t kEight = 8; + auto round_up_8 = x % kEight == 0 ? x : x + (kEight - x % kEight); + return std::min(round_up_8, round_up_pow2); +} + +constexpr int64_t roundUpPow2(const int64_t x) { + auto round_up_pow2 = scheduler_utils::lastPow2(x); + if (round_up_pow2 < x) { + round_up_pow2 *= 2; + } + return round_up_pow2; +} + +constexpr int64_t roundUpToN(const int64_t x, const int64_t n) { + return x % n == 0 ? x : x + (n - x % n); +} + // Div x by y, but min at 1 inline int64_t safeDiv(const int64_t x, const int64_t y) { return std::max(x / y, (int64_t)1); diff --git a/csrc/utils.cpp b/csrc/utils.cpp index 1293a4c580d..3865fa6923f 100644 --- a/csrc/utils.cpp +++ b/csrc/utils.cpp @@ -5,10 +5,10 @@ * SPDX-License-Identifier: BSD-3-Clause */ // clang-format on - -#include - +#include #include +#include +#include #include #include @@ -367,4 +367,43 @@ std::vector getTensorSizes(at::TensorTypePtr const& tensor_type) { return optional_sizes.value(); } +int64_t getRegPerThreadGivenThreadsPerSM(int64_t threads_per_sm) { + int num_partition = 0; + int reg_allocation_granularity = 0; + const auto prop = at::cuda::getCurrentDeviceProperties(); + cudaOccDeviceProp occ_prop(*prop); + cudaOccSubPartitionsPerMultiprocessor(&num_partition, &occ_prop); + cudaOccRegAllocationGranularity(&reg_allocation_granularity, &occ_prop); + int warp_size = prop->warpSize; + int num_warps = ceilDiv(threads_per_sm, warp_size); + + // warps could be distributed unevenly across partitions + int max_warps_per_sm_partition = ceilDiv(num_warps, num_partition); + // registers are evenly distributed across partitions; the partition with the + // most warps determines the maximum registers available per warp + int max_reg_per_warp = + prop->regsPerBlock / num_partition / max_warps_per_sm_partition; + // clamp down to register allocation granularity at warp level + int effective_max_reg_per_warp = max_reg_per_warp / + reg_allocation_granularity * reg_allocation_granularity; + return effective_max_reg_per_warp / warp_size; +} + +int64_t getThreadsPerSMGivenRegPerThread(int64_t reg_per_thread) { + int num_partition = 0; + int reg_allocation_granularity = 0; + const auto prop = at::cuda::getCurrentDeviceProperties(); + cudaOccDeviceProp occ_prop(*prop); + cudaOccSubPartitionsPerMultiprocessor(&num_partition, &occ_prop); + cudaOccRegAllocationGranularity(&reg_allocation_granularity, &occ_prop); + int warp_size = prop->warpSize; + + int reg_per_warp = + ceilDiv(reg_per_thread * warp_size, reg_allocation_granularity) * + reg_allocation_granularity; + int warps_per_sm_partition = + prop->regsPerBlock / reg_per_warp / num_partition; + int num_warps = warps_per_sm_partition * num_partition; + return num_warps * warp_size; +} } // namespace nvfuser diff --git a/csrc/utils.h b/csrc/utils.h index 7cb20167796..758cd2f9846 100644 ---
a/csrc/utils.h +++ b/csrc/utils.h @@ -141,9 +141,14 @@ enum class EnableOption { TORCH_CUDA_CU_API bool isOptionEnabled(EnableOption option); TORCH_CUDA_CU_API const std::vector& getEnableOptionArguments( EnableOption option); +TORCH_CUDA_CU_API int64_t +getRegPerThreadGivenThreadsPerSM(int64_t threads_per_sm); -// Check if fallback path should be used which will dispatch to eagermode if any -// errors are encountered. Helpful for debugging. +TORCH_CUDA_CU_API int64_t +getThreadsPerSMGivenRegPerThread(int64_t reg_per_thread); + +// Check if fallback path should be used which will dispatch to eager mode if +// any errors are encountered. Helpful for debugging. bool useFallback(); //! Ceil integer division diff --git a/test/test_gpu_combined_inner_outer_reduction.cpp b/test/test_gpu_combined_inner_outer_reduction.cpp new file mode 100644 index 00000000000..d2e44287dc2 --- /dev/null +++ b/test/test_gpu_combined_inner_outer_reduction.cpp @@ -0,0 +1,957 @@ +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace nvfuser { + +using namespace at::indexing; + +// mean & var +std::tuple getMeanVar(const std::vector& v) { + const int nele = v.size(); + float mean = std::accumulate(v.begin(), v.end(), 0.0f) / nele; + std::vector sub_mean(nele); + std::transform(v.begin(), v.end(), sub_mean.begin(), [mean](float x) { + return x - mean; + }); + float sq_sum = std::inner_product( + sub_mean.begin(), sub_mean.end(), sub_mean.begin(), 0.0); + float stdev = std::sqrt(sq_sum / nele); + return {mean, stdev}; +} + +// This case is to test the correctness of the combined inner and outer +// scheduler used in layer norm backward. It can also be configured to test the +// performance using different data types. +TEST_F(NVFuserTest, FusionCombinedSchedulerLayerNormBackward_CUDA) { + auto runTest = [](const std::vector& batch_shape, + const std::vector& norm_shape, + DataType dtype, + bool isBenchmark, + int verbose) { + std::unique_ptr fusion_ptr = std::make_unique(); + Fusion& fusion = *fusion_ptr.get(); + FusionGuard fg(&fusion); + + std::vector input_shape(batch_shape); + std::copy( + norm_shape.begin(), norm_shape.end(), std::back_inserter(input_shape)); + + const size_t kM = input_shape.size(); + const size_t kN = norm_shape.size(); + const size_t kOuterNumDims = kM - kN; + std::vector outer_shape; + for (const auto idx : c10::irange(kOuterNumDims)) { + outer_shape.push_back(input_shape[idx]); + } + for (const auto idx : c10::irange(kOuterNumDims, kM)) { + // just to avoid unused variable warning + outer_shape.push_back(1 + idx - idx); + } + + auto grad_out = makeContigTensor(input_shape.size(), dtype); + auto input = makeContigTensor(input_shape.size(), dtype); + auto mean = makeConcreteTensor( + outer_shape, dtype == DataType::Half ? DataType::Float : dtype); + auto rstd = makeConcreteTensor( + outer_shape, dtype == DataType::Half ? 
DataType::Float : dtype); + auto weight = makeContigTensor(norm_shape.size(), dtype); + auto bias = makeContigTensor(norm_shape.size(), dtype); + fusion.addInput(grad_out); + fusion.addInput(input); + fusion.addInput(mean); + fusion.addInput(rstd); + fusion.addInput(weight); + fusion.addInput(bias); + + if (dtype == DataType::Half) { + grad_out = castOp(DataType::Float, grad_out); + input = castOp(DataType::Float, input); + weight = castOp(DataType::Float, weight); + bias = castOp(DataType::Float, bias); + } + + auto layer_norm_results = layer_norm_backward( + grad_out, + input, + norm_shape, + mean, + rstd, + weight, + bias, + {true, true, true}); + + if (dtype == DataType::Half) { + layer_norm_results.grad_input = + castOp(dtype, layer_norm_results.grad_input); + layer_norm_results.grad_bias = + castOp(dtype, layer_norm_results.grad_bias); + layer_norm_results.grad_weight = + castOp(dtype, layer_norm_results.grad_weight); + } + + fusion.addOutput(layer_norm_results.grad_input); + fusion.addOutput(layer_norm_results.grad_weight); + fusion.addOutput(layer_norm_results.grad_bias); + + auto maybe_fp16_options = at::TensorOptions() + .dtype(data_type_to_aten(dtype)) + .device(at::kCUDA, 0); + at::Tensor aten_grad_out = at::randn(input_shape, maybe_fp16_options); + at::Tensor aten_input = at::randn(input_shape, maybe_fp16_options); + at::Tensor aten_weight = at::randn(norm_shape, maybe_fp16_options); + at::Tensor aten_bias = at::randn(norm_shape, maybe_fp16_options); + auto at_weight = c10::optional(aten_weight); + auto at_bias = c10::optional(aten_bias); + + const float kEps = 1e-5; + auto aten_results = + at::native_layer_norm(aten_input, norm_shape, at_weight, at_bias, kEps); + auto aten_output = std::get<0>(aten_results); + auto aten_mean = std::get<1>(aten_results); + auto aten_rstd = std::get<2>(aten_results); + + FusionExecutorCache fec(std::move(fusion_ptr)); + std::vector aten_inputs = { + aten_grad_out, + aten_input, + aten_mean, + aten_rstd, + aten_weight, + aten_bias}; + auto cg_outputs = fec.runFusionWithInputs(aten_inputs); + + auto aten_gradients = at::native_layer_norm_backward( + aten_grad_out, + aten_input, + norm_shape, + aten_mean, + aten_rstd, + c10::optional(aten_weight), + c10::optional(aten_bias), + {true, true, true}); + + testValidate( + &fusion, + cg_outputs, + aten_inputs, + {std::get<0>(aten_gradients), + std::get<1>(aten_gradients), + std::get<2>(aten_gradients)}, + __LINE__, + __FILE__); + + bool is_segmented = fec.getMostRecentKernelRuntime()->isSegmented(); + TORCH_CHECK(!is_segmented, "Fusion is segmented"); + + if (isBenchmark) { + FusionKernelRuntime* fkr = fec.getMostRecentKernelRuntime(); + fkr->setMeasureKernelTime(true); + + constexpr int nwarm = 5; + constexpr int niter = 10; + std::vector bw(niter, 0.f); + std::vector timeus(niter, 0.f); + + size_t read_write_bytes = 0; + const std::vector aten_inputs_tmp = { + aten_grad_out, + aten_input, + aten_mean, + aten_rstd, + aten_weight, + aten_bias}; + const std::vector aten_output_tmp = { + std::get<0>(aten_gradients), + std::get<1>(aten_gradients), + std::get<2>(aten_gradients)}; + for (auto input : aten_inputs_tmp) { + read_write_bytes += input.numel() * input.element_size(); + } + for (auto output : aten_output_tmp) { + read_write_bytes += output.numel() * output.element_size(); + } + + for (int i = 0; i < nwarm + niter; i++) { + clearL2Cache(); + // fe.runFusion(inputs, outputs, launch_constraints); + auto cg_outputs = fec.runFusionWithInputs(aten_inputs); + if (i >= nwarm) { + float runTimeus = 0.0f; + 
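+ // Sum the measured kernel time over all executors in this runtime; kernelTimeMs() appears to report milliseconds, hence the 1e3 factor to get microseconds.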
int num_kernels = fkr->executors().size(); + for (int i = 0; i < num_kernels; i++) { + const FusionExecutor& fe = fkr->executors()[i]; + runTimeus += fe.kernelTimeMs() * 1e3; + } + float bandwidth = read_write_bytes / 1e9 / (runTimeus * 1e-6); + timeus[i - nwarm] = runTimeus; + bw[i - nwarm] = bandwidth; + if (verbose == 2) + std::cout << "iter= " << i << ", bandwidth= " << bandwidth << "GB/s" + << ", time= " << runTimeus << " us" << std::endl; + } + } + return getMeanVar(timeus); + } else { + if (verbose == 1) { + std::stringstream sdim0, sdim1; + std::for_each( + batch_shape.begin(), batch_shape.end(), [&sdim0](int64_t n) { + sdim0 << n << " x "; + }); + std::for_each( + norm_shape.begin(), norm_shape.end(), [&sdim1](int64_t n) { + sdim1 << n << " x "; + }); + std::string str1 = sdim1.str(); + str1.erase(str1.end() - 2); + std::cout << "passed, shape= " << sdim0.str() << str1 << std::endl; + } + return std::make_tuple(-1.0f, -1.0f); + } + }; + + std::vector data_types = {DataType::Half, DataType::Float}; + std::vector> batch_sizes = {{8, 1024}}; + std::vector> hidden_sizes = { + {2048}, {576}, {768}, {1024}, {1280}, {1600}}; + + bool isBenchmark = false; + bool onlyTestFirstCase = false; + int verbose = 0; + for (auto dtype : data_types) { + for (auto batch_shape : batch_sizes) { + for (auto norm_shape : hidden_sizes) { + std::tuple avg_var = + runTest(batch_shape, norm_shape, dtype, isBenchmark, verbose); + if (isBenchmark) { + std::stringstream sdim0, sdim1; + std::for_each( + batch_shape.begin(), batch_shape.end(), [&sdim0](int64_t n) { + sdim0 << n << " x "; + }); + std::for_each( + norm_shape.begin(), norm_shape.end(), [&sdim1](int64_t n) { + sdim1 << n << " x "; + }); + std::cout << "shape= " << sdim0.str() << sdim1.str() + << ", time_us mean(var)= " << std::get<0>(avg_var) << " (" + << std::get<1>(avg_var) << ")" << std::endl; + } + if (onlyTestFirstCase) + break; + } + if (onlyTestFirstCase) + break; + } + if (onlyTestFirstCase) + break; + } +} + +// This case is to test the correctness of the combined inner and outer +// scheduler. If link_inner_outer = true, the inner and outer reductions are +// linked, otherwise the two outer reductions are linked. In either case, the +// fusion should be segmented since the current combined scheduler assumes there +// is no shared consumer between inner reductions and outer reductions and among +// tensors in outer reductions. +TEST_F(NVFuserTest, FusionCombinedSchedulerSharedConsumer_CUDA) { + auto runTest = [](const std::vector& batch_shape, + const std::vector& norm_shape, + DataType dtype, + bool link_inner_outer) { + std::unique_ptr fusion_ptr = std::make_unique(); + Fusion& fusion = *fusion_ptr.get(); + FusionGuard fg(&fusion); + + std::vector input_shape(batch_shape); + std::copy( + norm_shape.begin(), norm_shape.end(), std::back_inserter(input_shape)); + + const size_t kM = input_shape.size(); + const size_t kN = norm_shape.size(); + const size_t kOuterNumDims = kM - kN; + std::vector outer_shape; + for (const auto idx : c10::irange(kOuterNumDims)) { + outer_shape.push_back(input_shape[idx]); + } + for (const auto idx : c10::irange(kOuterNumDims, kM)) { + // just to avoid unused variable warning + outer_shape.push_back(1 + idx - idx); + } + + auto grad_out = makeContigTensor(input_shape.size(), dtype); + auto input = makeContigTensor(input_shape.size(), dtype); + auto mean = makeConcreteTensor( + outer_shape, dtype == DataType::Half ? DataType::Float : dtype); + auto rstd = makeConcreteTensor( + outer_shape, dtype == DataType::Half ?
DataType::Float : dtype); + auto weight = makeContigTensor(norm_shape.size(), dtype); + auto bias = makeContigTensor(norm_shape.size(), dtype); + fusion.addInput(grad_out); + fusion.addInput(input); + fusion.addInput(mean); + fusion.addInput(rstd); + fusion.addInput(weight); + fusion.addInput(bias); + + if (dtype == DataType::Half) { + grad_out = castOp(DataType::Float, grad_out); + input = castOp(DataType::Float, input); + weight = castOp(DataType::Float, weight); + bias = castOp(DataType::Float, bias); + } + + auto layer_norm_results = layer_norm_backward( + grad_out, + input, + norm_shape, + mean, + rstd, + weight, + bias, + {true, true, true}); + + if (dtype == DataType::Half) { + layer_norm_results.grad_input = + castOp(dtype, layer_norm_results.grad_input); + layer_norm_results.grad_bias = + castOp(dtype, layer_norm_results.grad_bias); + layer_norm_results.grad_weight = + castOp(dtype, layer_norm_results.grad_weight); + } + // link inner and outer reduction or outer and outer reduction + auto out_linked = link_inner_outer + ? add(layer_norm_results.grad_input, layer_norm_results.grad_weight) + : add(layer_norm_results.grad_bias, layer_norm_results.grad_weight); + + fusion.addOutput(out_linked); + fusion.addOutput(layer_norm_results.grad_input); + fusion.addOutput(layer_norm_results.grad_weight); + fusion.addOutput(layer_norm_results.grad_bias); + + auto maybe_fp16_options = at::TensorOptions() + .dtype(data_type_to_aten(dtype)) + .device(at::kCUDA, 0); + at::Tensor aten_grad_out = at::randn(input_shape, maybe_fp16_options); + at::Tensor aten_input = at::randn(input_shape, maybe_fp16_options); + at::Tensor aten_weight = at::randn(norm_shape, maybe_fp16_options); + at::Tensor aten_bias = at::randn(norm_shape, maybe_fp16_options); + auto at_weight = c10::optional(aten_weight); + auto at_bias = c10::optional(aten_bias); + + const float kEps = 1e-5; + auto aten_results = + at::native_layer_norm(aten_input, norm_shape, at_weight, at_bias, kEps); + auto aten_output = std::get<0>(aten_results); + auto aten_mean = std::get<1>(aten_results); + auto aten_rstd = std::get<2>(aten_results); + + FusionExecutorCache fec(std::move(fusion_ptr)); + std::vector aten_inputs = { + aten_grad_out, + aten_input, + aten_mean, + aten_rstd, + aten_weight, + aten_bias}; + auto cg_outputs = fec.runFusionWithInputs(aten_inputs); + + auto aten_gradients = at::native_layer_norm_backward( + aten_grad_out, + aten_input, + norm_shape, + aten_mean, + aten_rstd, + c10::optional(aten_weight), + c10::optional(aten_bias), + {true, true, true}); + + auto aten_out_linked = link_inner_outer + ? std::get<0>(aten_gradients) + std::get<1>(aten_gradients) + : std::get<1>(aten_gradients) + std::get<2>(aten_gradients); + + bool is_segmented = fec.getMostRecentKernelRuntime()->isSegmented(); + TORCH_CHECK(is_segmented, "Fusion is not segmented"); + + testValidate( + &fusion, + cg_outputs, + aten_inputs, + {aten_out_linked, + std::get<0>(aten_gradients), + std::get<1>(aten_gradients), + std::get<2>(aten_gradients)}, + __LINE__, + __FILE__); + }; + + DataType dtype = DataType::Float; + std::vector batch_shape = {8192}; + std::vector norm_shape = {2048}; + runTest(batch_shape, norm_shape, dtype, true); + runTest(batch_shape, norm_shape, dtype, false); +} + +// This case is to test the correctness of the combined inner and outer +// scheduler. One tensor is using the inner reduction results and outer +// reduction results. should be segmented. 
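+// More precisely, whether segmentation is expected depends on the case id: cases 0, 1, and 3 below expect a segmented fusion, while case 2 is expected to stay unsegmented.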
+TEST_F(NVFuserTest, FusionCombinedSchedulerSharedProducer_CUDA) { + auto runTest = [](const std::vector& batch_shape, + const std::vector& norm_shape, + DataType dtype, + int case_id) { + std::unique_ptr fusion_ptr = std::make_unique(); + Fusion& fusion = *fusion_ptr.get(); + FusionGuard fg(&fusion); + + std::vector input_shape(batch_shape); + std::copy( + norm_shape.begin(), norm_shape.end(), std::back_inserter(input_shape)); + + const size_t kM = input_shape.size(); + const size_t kN = norm_shape.size(); + const size_t kOuterNumDims = kM - kN; + std::vector outer_shape; + for (const auto idx : c10::irange(kOuterNumDims)) { + outer_shape.push_back(input_shape[idx]); + } + for (const auto idx : c10::irange(kOuterNumDims, kM)) { + // just to avoid unused variable warning + outer_shape.push_back(1 + idx - idx); + } + + auto grad_out = makeContigTensor(input_shape.size(), dtype); + auto input = makeContigTensor(input_shape.size(), dtype); + auto mean = makeConcreteTensor( + outer_shape, dtype == DataType::Half ? DataType::Float : dtype); + auto rstd = makeConcreteTensor( + outer_shape, dtype == DataType::Half ? DataType::Float : dtype); + auto weight = makeContigTensor(norm_shape.size(), dtype); + auto bias = makeContigTensor(norm_shape.size(), dtype); + fusion.addInput(grad_out); + fusion.addInput(input); + fusion.addInput(mean); + fusion.addInput(rstd); + fusion.addInput(weight); + fusion.addInput(bias); + + if (dtype == DataType::Half) { + grad_out = castOp(DataType::Float, grad_out); + input = castOp(DataType::Float, input); + weight = castOp(DataType::Float, weight); + bias = castOp(DataType::Float, bias); + } + + auto layer_norm_results = layer_norm_backward( + grad_out, + input, + norm_shape, + mean, + rstd, + weight, + bias, + {true, true, true}); + + if (dtype == DataType::Half) { + layer_norm_results.grad_input = + castOp(dtype, layer_norm_results.grad_input); + layer_norm_results.grad_bias = + castOp(dtype, layer_norm_results.grad_bias); + layer_norm_results.grad_weight = + castOp(dtype, layer_norm_results.grad_weight); + } + + switch (case_id) { + case 0: { + // tensor input is a producer of a consumer of the inner and outer + // reduction results; this is not allowed, expect segmented + auto use_inner = add(layer_norm_results.grad_input, input); + auto use_outer = add(layer_norm_results.grad_weight, input); + fusion.addOutput(use_inner); + fusion.addOutput(use_outer); + } break; + case 1: { + // tensor bias is a producer of the inner reduction and also a producer + // of a consumer of the outer reduction results; this is not allowed, + // expect segmented + auto bias_broad = add(bias, mean); + auto use_inner = sum(bias_broad, {-1}); + auto use_outer = add(layer_norm_results.grad_weight, bias); + fusion.addOutput(use_inner); + fusion.addOutput(use_outer); + } break; + case 2: { + // tensor bias is a producer of the outer reduction and also a producer + // of a consumer of the inner reduction results; this is allowed, because + // the first part of the outer reduction is computed with the inner reduction.
+ // expect unsegmented + auto bias_broad = add(bias, mean); + auto use_inner = add(layer_norm_results.grad_input, bias_broad); + auto use_outer = sum(bias_broad, {0}); + fusion.addOutput(use_inner); + fusion.addOutput(use_outer); + } break; + case 3: { + // tensor bias is a producer of the two outer reductions' consumers, + // expect segmented + auto outer_1_consumer = + add(layer_norm_results.grad_weight, IrBuilder::create(1)); + auto outer_2_consumer = + add(layer_norm_results.grad_bias, IrBuilder::create(1)); + auto use_producer_1 = add(outer_1_consumer, bias); + auto use_producer_2 = add(outer_2_consumer, bias); + fusion.addOutput(use_producer_1); + fusion.addOutput(use_producer_2); + } break; + default: + TORCH_INTERNAL_ASSERT(false, "Invalid case id"); + } + + fusion.addOutput(layer_norm_results.grad_input); + fusion.addOutput(layer_norm_results.grad_weight); + fusion.addOutput(layer_norm_results.grad_bias); + + auto maybe_fp16_options = at::TensorOptions() + .dtype(data_type_to_aten(dtype)) + .device(at::kCUDA, 0); + at::Tensor aten_grad_out = at::randn(input_shape, maybe_fp16_options); + at::Tensor aten_input = at::randn(input_shape, maybe_fp16_options); + at::Tensor aten_weight = at::randn(norm_shape, maybe_fp16_options); + at::Tensor aten_bias = at::randn(norm_shape, maybe_fp16_options); + auto at_weight = c10::optional(aten_weight); + auto at_bias = c10::optional(aten_bias); + + const float kEps = 1e-5; + auto aten_results = + at::native_layer_norm(aten_input, norm_shape, at_weight, at_bias, kEps); + auto aten_output = std::get<0>(aten_results); + auto aten_mean = std::get<1>(aten_results); + auto aten_rstd = std::get<2>(aten_results); + + FusionExecutorCache fec(std::move(fusion_ptr)); + std::vector aten_inputs = { + aten_grad_out, + aten_input, + aten_mean, + aten_rstd, + aten_weight, + aten_bias}; + auto cg_outputs = fec.runFusionWithInputs(aten_inputs); + + auto aten_gradients = at::native_layer_norm_backward( + aten_grad_out, + aten_input, + norm_shape, + aten_mean, + aten_rstd, + c10::optional(aten_weight), + c10::optional(aten_bias), + {true, true, true}); + + // check the results depending on the case + at::Tensor aten_use_inner, aten_use_outer; + bool expected_segmented; + switch (case_id) { + case 0: { + aten_use_inner = std::get<0>(aten_gradients) + aten_input; + aten_use_outer = std::get<1>(aten_gradients) + aten_input; + expected_segmented = true; + } break; + case 1: { + aten_use_inner = (aten_bias + aten_mean).sum({-1}); + aten_use_outer = std::get<1>(aten_gradients) + aten_bias; + expected_segmented = true; + } break; + case 2: { + aten_use_inner = std::get<0>(aten_gradients) + (aten_bias + aten_mean); + aten_use_outer = (aten_bias + aten_mean).sum({0}); + expected_segmented = false; + } break; + case 3: { + aten_use_inner = std::get<1>(aten_gradients) + (aten_bias + 1.0); + aten_use_outer = std::get<2>(aten_gradients) + (aten_bias + 1.0); + expected_segmented = true; + } break; + default: + TORCH_INTERNAL_ASSERT(false, "Invalid case id"); + } + bool is_segmented = fec.getMostRecentKernelRuntime()->isSegmented(); + TORCH_CHECK( + is_segmented == expected_segmented, + expected_segmented ? "Fusion should be segmented!" 
+ : "Fusion should not be segmented!"); + + testValidate( + &fusion, + cg_outputs, + aten_inputs, + {aten_use_inner, + aten_use_outer, + std::get<0>(aten_gradients), + std::get<1>(aten_gradients), + std::get<2>(aten_gradients)}, + __LINE__, + __FILE__); + }; + + DataType dtype = DataType::Float; + // to test hasSharedConsumerNonReductionProducer, we need to use small sizes, + // otherwise this fusion will be rejected due to register usage. + std::vector batch_shape = {64}; + std::vector norm_shape = {32}; + for (int i = 0; i < 4; i++) { + runTest(batch_shape, norm_shape, dtype, i); + } +} + +// Manual schedule of inner and outer reduction on the same tensor +TEST_F(NVFuserTest, FusionCombinedReduction_CUDA) { + // https://github.com/csarofeen/pytorch/issues/2566 + // this case will fail if using tidx = 8 and tidy = 64 + // for inner reduction, tidy is derived as 10240 / (tidx*vecx*nloadx) = 64 + // for outer reduction, tidy is derived as 216 / nloady = 54 + // the kernel will be launched with bdimy = 64 + // in the generated kernel, all these 64 threads participate in the block + // reduction but only 54 of them have valid initial values, so the result is + // polluted by the other 10 threads and can't pass validation. To avoid this + // issue, we can use one of the following methods: (1) make sure the tidy derived + // from the inner reduction and the outer reduction is the same (when 216 % tidy == 0), or + // (2) instead of splitting the outer reduction tensor by nloady, split it by + // bdimy. The current scheduler uses method (2). + + auto ceilDiv = [](const int a, const int b) { return (a + b - 1) / b; }; + constexpr bool verbose = false; + const auto dev_prop = at::cuda::getCurrentDeviceProperties(); + const int64_t device_multiprocessor_count = + (int64_t)dev_prop->multiProcessorCount; + const int dim0 = 2048; + const int dim1 = 10240; + const int tidx = 64; + const int tidy = 8; + const int bidy = 2 * device_multiprocessor_count; // 216 + const int vecx = 4; + const int nloadx = + ceilDiv(dim1, vecx * tidx * tidy); // 5, simulate persistent buffer + const int nloady = ceilDiv(bidy, tidy); // ceilDiv(216, 8) = 27 + + Fusion fusion; + FusionGuard fg(&fusion); + TensorView* tv0 = makeContigTensor(2); + TensorView* tv1 = sum(tv0, {1}); + TensorView* tv2 = sum(tv0, {0}); + fusion.addInput(tv0); + fusion.addOutput(tv1); + fusion.addOutput(tv2); + + auto cached_inputs = scheduler_utils::cacheInputs(&fusion, true); + auto cached_outputs = scheduler_utils::cacheAndForkOutputs(&fusion, true); + auto reduction_tvs = scheduler_utils::getReductionTvs(&fusion); + scheduler_utils::clearMemorySpace(&fusion); + std::vector inner_reduction_tvs, outer_reduction_tvs; + for (auto tv : reduction_tvs) { + if (scheduler_utils::isFastestDimReduction(tv)) { + inner_reduction_tvs.emplace_back(tv); + } else { + outer_reduction_tvs.emplace_back(tv); + } + if (verbose) + std::cout << "tv= " << tv->toString() << ", fastest_dim_reduction= " + << scheduler_utils::isFastestDimReduction(tv) << std::endl; + } + TensorView* inner_reduction_tv = inner_reduction_tvs[0]; + TensorView* outer_reduction_tv = outer_reduction_tvs[0]; + + inner_reduction_tv->split(-1, vecx); + inner_reduction_tv->split(-2, tidx); + inner_reduction_tv->split(-3, nloadx, false); + inner_reduction_tv->split(0, bidy, false); + inner_reduction_tv->axis(0)->parallelize(ParallelType::BIDy); + inner_reduction_tv->axis(-3)->parallelize(ParallelType::TIDy); + inner_reduction_tv->axis(-2)->parallelize(ParallelType::TIDx); +
inner_reduction_tv->axis(-1)->parallelize(ParallelType::Vectorize); + if (verbose) + std::cout << "inner_reduction_tv " << inner_reduction_tv->toString() + << std::endl; + auto reference_tv_inner = + reduction_scheduler_utils::sortAndRFactor(inner_reduction_tv); + if (verbose) + std::cout << "reference_tv_inner " << reference_tv_inner->toString() + << std::endl; + + outer_reduction_tv->split(0, bidy, false); + auto partialResult = outer_reduction_tv->rFactor({1}); + partialResult->cacheBefore(); + partialResult->setMemoryType(MemoryType::Global); + auto partialResultReload = partialResult->cacheAfter(); + + outer_reduction_tv->split(0, nloady, false); + outer_reduction_tv->split(-1, tidx); + outer_reduction_tv->split(-2, bidy); + outer_reduction_tv->axis(1)->parallelize(ParallelType::TIDy); + outer_reduction_tv->axis(-2)->parallelize(ParallelType::BIDy); + outer_reduction_tv->axis(-1)->parallelize(ParallelType::TIDx); + + if (verbose) + std::cout << "outer_reduction_tv " << outer_reduction_tv->toString() + << std::endl; + auto reference_tv_outer = + reduction_scheduler_utils::sortAndRFactor(outer_reduction_tv); + if (verbose) + std::cout << "reference_tv_outer " << reference_tv_outer->toString() + << std::endl; + + reduction_scheduler_utils::propagateTransformation( + reference_tv_inner, {partialResultReload}); + reduction_scheduler_utils::propagateTransformation( + reference_tv_outer, {partialResultReload}); + + std::vector cached_gmem_temp{partialResult}; + // cached_gmem is float, may use a different vectorization factor + for (auto tv : cached_gmem_temp) { + tv->split(-1, 4); + tv->axis(-1)->parallelize(ParallelType::Vectorize); + } + + reduction_scheduler_utils::propagateParallelization( + &fusion, + inner_reduction_tv, + reference_tv_inner, + true, + true, + false, + inner_reduction_tvs, + cached_inputs, + cached_outputs); + reduction_scheduler_utils::propagateParallelization( + &fusion, + outer_reduction_tv, + reference_tv_outer, + true, + true, + false, + outer_reduction_tvs, + cached_inputs, + cached_outputs); + + inlineMost(); + LaunchParams launch_constraints; + constexpr int64_t maxrregcount = 64; + CompileParams compile_params{DataType::Int, maxrregcount, true}; + if (verbose) + fusion.print(); + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::Tensor tv_input = at::randn({dim0, dim1}, options); + auto tv_aten_output = tv_input.to(at::kFloat).sum({1}); + at::Tensor tv_cg_output = at::empty({dim0}, options); + + at::Tensor qv_cg_output = at::empty({dim1}, options); + auto qv_aten_output = tv_input.to(at::kFloat).sum({0}); + FusionExecutor fe; + fe.compileFusion(&fusion, {tv_input}, launch_constraints, compile_params); + fe.runFusion( + {tv_input}, + {tv_cg_output, qv_cg_output}, + launch_constraints, + compile_params); + + testValidate( + &fusion, + {tv_cg_output, qv_cg_output}, + {tv_input}, + {tv_aten_output, qv_aten_output}, + __LINE__, + __FILE__); +} + +// Manual schedule of inner and outer reduction on the same tensor. Each block +// will do multiple reductions. 
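+// For reference, assuming 108 SMs as in the previous test (bidy = 2 * 108 = 216): +// tidx = dim1 / vecx / nloadx = 1024 / 4 / 8 = 32 and +// tidy = ceilDiv(dim1, bidy) = ceilDiv(1024, 216) = 5.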
+TEST_F(NVFuserTest, FusionCombinedReductionMultiPerBlock_CUDA) { + auto ceilDiv = [](const int a, const int b) { return (a + b - 1) / b; }; + constexpr bool verbose = false; + const auto dev_prop = at::cuda::getCurrentDeviceProperties(); + const int64_t device_multiprocessor_count = + (int64_t)dev_prop->multiProcessorCount; + const int dim0 = 216; + const int dim1 = 1024; + const int bidy = 2 * device_multiprocessor_count; + const int vecx = 4; + const int nloadx = 8; + const int tidx = dim1 / vecx / nloadx; + const int tidy = ceilDiv(dim1, bidy); + // https://github.com/csarofeen/pytorch/issues/2458 + const bool swap_xy = true; + + Fusion fusion; + FusionGuard fg(&fusion); + TensorView* tv0 = makeContigTensor(2); + TensorView* tv1 = sum(tv0, {1}); + TensorView* tv2 = sum(tv0, {0}); + fusion.addInput(tv0); + fusion.addOutput(tv1); + fusion.addOutput(tv2); + + auto cached_inputs = scheduler_utils::cacheInputs(&fusion, true); + auto cached_outputs = scheduler_utils::cacheAndForkOutputs(&fusion, true); + auto reduction_tvs = scheduler_utils::getReductionTvs(&fusion); + scheduler_utils::clearMemorySpace(&fusion); + std::vector inner_reduction_tvs, outer_reduction_tvs; + for (auto tv : reduction_tvs) { + if (scheduler_utils::isFastestDimReduction(tv)) { + inner_reduction_tvs.emplace_back(tv); + } else { + outer_reduction_tvs.emplace_back(tv); + } + if (verbose) + std::cout << "tv= " << tv->toString() << ", fastest_dim_reduction= " + << scheduler_utils::isFastestDimReduction(tv) << std::endl; + } + TensorView* inner_reduction_tv = inner_reduction_tvs[0]; + TensorView* outer_reduction_tv = outer_reduction_tvs[0]; + + inner_reduction_tv->split(-1, vecx); + inner_reduction_tv->split(-2, nloadx, false); + inner_reduction_tv->split(0, tidy); + inner_reduction_tv->split(0, bidy, false); + // bidy, i0/tidy/bidy, tidy + + inner_reduction_tv->axis(0)->parallelize(ParallelType::BIDy); + inner_reduction_tv->axis(1)->parallelize(ParallelType::Serial); + inner_reduction_tv->axis(2)->parallelize(ParallelType::TIDy); + inner_reduction_tv->axis(-2)->parallelize(ParallelType::TIDx); + inner_reduction_tv->axis(-1)->parallelize(ParallelType::Vectorize); + if (verbose) + std::cout << "inner_reduction_tv " << inner_reduction_tv->toString() + << std::endl; + auto reference_tv_inner = + reduction_scheduler_utils::sortAndRFactor(inner_reduction_tv); + if (verbose) + std::cout << "reference_tv_inner " << reference_tv_inner->toString() + << std::endl; + + // outer_reduction_tv->split(0, bidy, false); + // auto partialResult = outer_reduction_tv->rFactor({1}); + std::vector rfactor_axis = {1, 2}; + + outer_reduction_tv->split(0, tidy); + outer_reduction_tv->split(0, bidy, false); + outer_reduction_tv->rFactor({1}); + TensorView* partialResult = outer_reduction_tv->rFactor({1}); + + if (verbose) + std::cout << "outer_reduction_tv " << outer_reduction_tv->toString() + << std::endl; + + partialResult->cacheBefore(); + partialResult->setMemoryType(MemoryType::Global); + auto partialResultReload = partialResult->cacheAfter(); + + if (swap_xy) { + outer_reduction_tv->split(0, tidx); + outer_reduction_tv->split(-1, tidy); + outer_reduction_tv->split(-2, bidy); + outer_reduction_tv->axis(1)->parallelize(ParallelType::TIDx); + outer_reduction_tv->axis(-2)->parallelize(ParallelType::BIDy); + outer_reduction_tv->axis(-1)->parallelize(ParallelType::TIDy); + } else { + outer_reduction_tv->split(0, tidy); + outer_reduction_tv->split(-1, tidx); + outer_reduction_tv->split(-2, bidy); + 
outer_reduction_tv->axis(1)->parallelize(ParallelType::TIDy); + outer_reduction_tv->axis(-2)->parallelize(ParallelType::BIDy); + outer_reduction_tv->axis(-1)->parallelize(ParallelType::TIDx); + } + if (verbose) + std::cout << "outer_reduction_tv " << outer_reduction_tv->toString() + << std::endl; + auto reference_tv_outer = + reduction_scheduler_utils::sortAndRFactor(outer_reduction_tv); + if (verbose) + std::cout << "reference_tv_outer " << reference_tv_outer->toString() + << std::endl; + + reduction_scheduler_utils::propagateTransformation( + reference_tv_inner, {partialResultReload}); + reduction_scheduler_utils::propagateParallelization( + &fusion, + inner_reduction_tv, + reference_tv_inner, + true, + true, + false, + inner_reduction_tvs, + cached_inputs, + cached_outputs); + + reduction_scheduler_utils::propagateTransformation( + reference_tv_outer, {partialResultReload}); + reduction_scheduler_utils::propagateParallelization( + &fusion, + outer_reduction_tv, + reference_tv_outer, + true, + true, + false, + outer_reduction_tvs, + cached_inputs, + cached_outputs); + + std::vector cached_gmem_temp{partialResult}; + for (auto tv : cached_gmem_temp) { + tv->split(-1, 4); + tv->axis(-1)->parallelize(ParallelType::Vectorize); + } + + inlineMost(); + LaunchParams launch_constraints; + constexpr int64_t maxrregcount = 64; + CompileParams compile_params{DataType::Int, maxrregcount, true}; + if (verbose) + fusion.print(); + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::Tensor tv_input = at::ones({dim0, dim1}, options); + auto tv_aten_output = tv_input.to(at::kFloat).sum({1}); + at::Tensor tv_cg_output = at::empty({dim0}, options); + + at::Tensor qv_cg_output = at::empty({dim1}, options); + at::Tensor tv_input2 = at::ones({dim0, dim1}, options); + auto qv_aten_output = tv_input2.to(at::kFloat).sum({0}); + FusionExecutor fe; + fe.compileFusion(&fusion, {tv_input}, launch_constraints, compile_params); + fe.runFusion( + {tv_input}, + {tv_cg_output, qv_cg_output}, + launch_constraints, + compile_params); + + testValidate( + &fusion, + {tv_cg_output, qv_cg_output}, + {tv_input}, + {tv_aten_output, qv_aten_output}, + __LINE__, + __FILE__); +} + +} // namespace nvfuser From 6a889603313b45830ce988f33c1ed53adb184e96 Mon Sep 17 00:00:00 2001 From: Liqiang Lu Date: Sat, 22 Apr 2023 11:47:25 -0700 Subject: [PATCH 2/4] rename test --- CMakeLists.txt | 2 +- ...ion.cpp => test_combined_inner_outer_reduction.cpp} | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) rename test/{test_gpu_combined_inner_outer_reduction.cpp => test_combined_inner_outer_reduction.cpp} (99%) diff --git a/CMakeLists.txt b/CMakeLists.txt index 53195ef3985..2ec3a0dbf5e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -357,7 +357,7 @@ if(BUILD_TEST) ${NVFUSER_ROOT}/test/test_gpu_gather_ops.cpp ${NVFUSER_ROOT}/test/test_gpu_multidevice.cpp ${NVFUSER_ROOT}/test/test_multicluster_fusion.cpp - ${NVFUSER_ROOT}/test/test_gpu_combined_inner_outer_reduction.cpp + ${NVFUSER_ROOT}/test/test_combined_inner_outer_reduction.cpp ) list(APPEND JIT_TEST_CU_SRCS ${NVFUSER_ROOT}/test/test_gpu_rng.cu) diff --git a/test/test_gpu_combined_inner_outer_reduction.cpp b/test/test_combined_inner_outer_reduction.cpp similarity index 99% rename from test/test_gpu_combined_inner_outer_reduction.cpp rename to test/test_combined_inner_outer_reduction.cpp index d2e44287dc2..ca88f35b1d4 100644 --- a/test/test_gpu_combined_inner_outer_reduction.cpp +++ b/test/test_combined_inner_outer_reduction.cpp @@ -42,7 +42,7 @@ 
std::tuple getMeanVar(const std::vector& v) { // This case is to test the correctness of the combined inner and outer // scheduler used in layer norm backward. It can also be configured to test the // performance using different data types. -TEST_F(NVFuserTest, FusionCombinedSchedulerLayerNormBackward_CUDA) { +TEST_F(NVFuserTest, CombinedSchedulerLayerNormBackward_CUDA) { auto runTest = [](const std::vector& batch_shape, const std::vector& norm_shape, DataType dtype, @@ -274,7 +274,7 @@ TEST_F(NVFuserTest, FusionCombinedSchedulerLayerNormBackward_CUDA) { // fusion should be segmented since the current combined scheduler assumes there // is no shared consumer between inter reductions and outer reductions and among // tensors in outer reductions. -TEST_F(NVFuserTest, FusionCombinedSchedulerSharedConsumer_CUDA) { +TEST_F(NVFuserTest, CombinedSchedulerSharedConsumer_CUDA) { auto runTest = [](const std::vector& batch_shape, const std::vector& norm_shape, DataType dtype, @@ -415,7 +415,7 @@ TEST_F(NVFuserTest, FusionCombinedSchedulerSharedConsumer_CUDA) { // This case is to test the correctness of the combined inner and outer // scheduler. One tensor is using the inner reduction results and outer // reduction results. should be segmented. -TEST_F(NVFuserTest, FusionCombinedSchedulerSharedProducer_CUDA) { +TEST_F(NVFuserTest, CombinedSchedulerSharedProducer_CUDA) { auto runTest = [](const std::vector& batch_shape, const std::vector& norm_shape, DataType dtype, @@ -625,7 +625,7 @@ TEST_F(NVFuserTest, FusionCombinedSchedulerSharedProducer_CUDA) { } // Manual schedule of inner and outer reduction on the same tensor -TEST_F(NVFuserTest, FusionCombinedReduction_CUDA) { +TEST_F(NVFuserTest, CombinedReduction_CUDA) { // https://github.com/csarofeen/pytorch/issues/2566 // this case will fail, if using tidx = 8 and tidy = 64 // for inner reduction, tidy is derived as 10240 / (tidx*vecx*nloadx) = 64 @@ -786,7 +786,7 @@ TEST_F(NVFuserTest, FusionCombinedReduction_CUDA) { // Manual schedule of inner and outer reduction on the same tensor. Each block // will do multiple reductions. 
-TEST_F(NVFuserTest, FusionCombinedReductionMultiPerBlock_CUDA) { +TEST_F(NVFuserTest, CombinedReductionMultiPerBlock_CUDA) { auto ceilDiv = [](const int a, const int b) { return (a + b - 1) / b; }; constexpr bool verbose = false; const auto dev_prop = at::cuda::getCurrentDeviceProperties(); From 988b07e22200c49a0be4207b454a451b756dc506 Mon Sep 17 00:00:00 2001 From: Liqiang Lu Date: Tue, 25 Apr 2023 06:55:34 -0700 Subject: [PATCH 3/4] reduce test size --- test/test_combined_inner_outer_reduction.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/test_combined_inner_outer_reduction.cpp b/test/test_combined_inner_outer_reduction.cpp index ca88f35b1d4..0b2a7ea6090 100644 --- a/test/test_combined_inner_outer_reduction.cpp +++ b/test/test_combined_inner_outer_reduction.cpp @@ -231,9 +231,9 @@ TEST_F(NVFuserTest, CombinedSchedulerLayerNormBackward_CUDA) { }; std::vector data_types = {DataType::Half, DataType::Float}; - std::vector> batch_sizes = {{8, 1024}}; + std::vector> batch_sizes = {{216}}; std::vector> hidden_sizes = { - {2048}, {576}, {768}, {1024}, {1280}, {1600}}; + {576}, {768}, {1024}, {1280}, {1600}}; bool isBenchmark = false; bool onlyTestFirstCase = false; From a097f7777494d7e947d81424b5b4dbcd8577180b Mon Sep 17 00:00:00 2001 From: Liqiang Lu Date: Tue, 25 Apr 2023 08:55:33 -0700 Subject: [PATCH 4/4] bump tolerance --- csrc/scheduler/registry.cpp | 2 +- test/test_combined_inner_outer_reduction.cpp | 13 ++++++++++++- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/csrc/scheduler/registry.cpp b/csrc/scheduler/registry.cpp index f4864317cab..cf9665e918d 100644 --- a/csrc/scheduler/registry.cpp +++ b/csrc/scheduler/registry.cpp @@ -1899,7 +1899,7 @@ class PersistentKernelScheduler : public SchedulerEntry { auto& reduction_tvs = reduction_tv_entry.get(); bool inner_reduction = false; bool outer_reduction = false; - TensorView* first_inner_reduction_tv; + TensorView* first_inner_reduction_tv = nullptr; for (auto tv : reduction_tvs) { if (scheduler_utils::isFastestDimReduction(tv)) { first_inner_reduction_tv = tv; diff --git a/test/test_combined_inner_outer_reduction.cpp b/test/test_combined_inner_outer_reduction.cpp index 0b2a7ea6090..325676fcd93 100644 --- a/test/test_combined_inner_outer_reduction.cpp +++ b/test/test_combined_inner_outer_reduction.cpp @@ -601,6 +601,14 @@ TEST_F(NVFuserTest, CombinedSchedulerSharedProducer_CUDA) { expected_segmented ? "Fusion should be segmented!" : "Fusion should not be segmented!"); + auto tolerance_overwrite = ValidationConstants(); + // bump tolerance, CI errors are higher than local + std::array, 20> relaxed_sum_tol; + for (auto& arr : relaxed_sum_tol) { + arr = {128, 2e-5}; + } + tolerance_overwrite.sum_tolerances_float = relaxed_sum_tol; + testValidate( &fusion, cg_outputs, @@ -611,7 +619,10 @@ TEST_F(NVFuserTest, CombinedSchedulerSharedProducer_CUDA) { std::get<1>(aten_gradients), std::get<2>(aten_gradients)}, __LINE__, - __FILE__); + __FILE__, + "", + LaunchParams(), + tolerance_overwrite); }; DataType dtype = DataType::Float;