Llu/ln bwd (#207)
Combines the inner and outer reductions into one kernel in cases where at least one input is used by both the inner and outer reductions, e.g. layer norm backward.
liqiangxl authored Apr 25, 2023
1 parent 8884858 commit 0250132
Showing 19 changed files with 2,311 additions and 176 deletions.
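
For orientation, below is a minimal standalone sketch (illustrative only, not code from this PR) of why layer norm backward naturally mixes the two reduction types this scheduler change fuses: the per-row (inner) sums that feed the input gradient and the per-column (outer) sums that feed the weight and bias gradients both read the same grad_output tensor, so computing them in one kernel avoids a second pass over that tensor.

// Illustrative only: grad_out/x_hat are read once while both reduction
// patterns are accumulated -- the sharing the combined scheduler exploits.
#include <cstdio>
#include <vector>

int main() {
  const int N = 4, D = 8; // N rows (batch), D features per row
  std::vector<float> grad_out(N * D, 1.0f);
  std::vector<float> x_hat(N * D, 0.5f); // normalized input, assumed given

  // Outer reductions: reduce over the batch axis, one value per feature.
  std::vector<float> dgamma(D, 0.0f), dbeta(D, 0.0f);
  // Inner reductions: reduce over the feature axis, one value per row
  // (these per-row sums are what the input-gradient formula consumes).
  std::vector<float> sum_go(N, 0.0f), sum_go_xhat(N, 0.0f);

  for (int n = 0; n < N; ++n) {
    for (int d = 0; d < D; ++d) {
      const float go = grad_out[n * D + d];
      const float xh = x_hat[n * D + d];
      dgamma[d] += go * xh; // outer reduction
      dbeta[d] += go;       // outer reduction
      sum_go[n] += go;           // inner reduction
      sum_go_xhat[n] += go * xh; // inner reduction
    }
  }
  std::printf("dbeta[0]=%f, sum_go[0]=%f\n", dbeta[0], sum_go[0]);
  return 0;
}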
1 change: 1 addition & 0 deletions CMakeLists.txt
@@ -357,6 +357,7 @@ if(BUILD_TEST)
${NVFUSER_ROOT}/test/test_gpu_gather_ops.cpp
${NVFUSER_ROOT}/test/test_gpu_multidevice.cpp
${NVFUSER_ROOT}/test/test_multicluster_fusion.cpp
${NVFUSER_ROOT}/test/test_combined_inner_outer_reduction.cpp
)
list(APPEND JIT_TEST_CU_SRCS ${NVFUSER_ROOT}/test/test_gpu_rng.cu)

25 changes: 5 additions & 20 deletions csrc/executor_utils.cpp
@@ -998,26 +998,11 @@ c10::optional<int> getMaxRegCount(
// If the block size is known, set the maximum that at least allows
// one block to be resident on an SM
if (opt_block_size.has_value() && opt_block_size.value() > 0) {
- int num_partition = 0;
- int reg_allocation_granularity = 0;
- const auto prop = at::cuda::getCurrentDeviceProperties();
- cudaOccDeviceProp occ_prop(*prop);
- cudaOccSubPartitionsPerMultiprocessor(&num_partition, &occ_prop);
- cudaOccRegAllocationGranularity(&reg_allocation_granularity, &occ_prop);
- int warp_size = prop->warpSize;
- int64_t num_warps = ceilDiv(opt_block_size.value(), warp_size);
-
- // warps could be distributed unevenly across partitions
- int64_t max_warps_per_sm_partition = ceilDiv(num_warps, num_partition);
- // registers are evenly distributed across partitions, partition with most
- // warps determines the maximum register available per warp
- int max_reg_per_warp =
-     prop->regsPerBlock / num_partition / (int)max_warps_per_sm_partition;
- // clamp down to register allocation granularity at warp level
- int effective_max_reg_per_warp = max_reg_per_warp /
-     reg_allocation_granularity * reg_allocation_granularity;
- max_register =
-     std::min(max_register_limit, effective_max_reg_per_warp / warp_size);
+ constexpr int block_per_sm = 1;
+ max_register = std::min(
+     max_register_limit,
+     (int)getRegPerThreadGivenThreadsPerSM(
+         opt_block_size.value() * block_per_sm));
}

// If a heuristic value is given, i.e., max_register_heuristic is
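
The removed block computed the per-warp register budget by hand from occupancy properties; the new code delegates that to getRegPerThreadGivenThreadsPerSM, which is defined elsewhere in the codebase and not shown in this diff. As a rough sketch only, assuming the helper performs occupancy arithmetic similar to what it replaces, it might look something like this (the name, parameters, and granularity handling below are assumptions, not the actual implementation):

#include <algorithm>

// Assumed sketch, not the actual nvFuser helper: given the number of threads
// that should be resident on an SM, return the largest per-thread register
// count that still lets them fit, rounded down to the register allocation
// granularity and clamped to the architectural per-thread maximum of 255.
int regPerThreadGivenThreadsPerSMSketch(
    int threads_per_sm,
    int regs_per_sm, // total registers available on one SM
    int reg_allocation_granularity, // registers are allocated in these units
    int warp_size = 32) {
  const int num_warps = (threads_per_sm + warp_size - 1) / warp_size;
  // Registers are handed out per warp in units of the allocation granularity.
  int regs_per_warp = regs_per_sm / num_warps;
  regs_per_warp =
      regs_per_warp / reg_allocation_granularity * reg_allocation_granularity;
  return std::min(regs_per_warp / warp_size, 255);
}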
4 changes: 4 additions & 0 deletions csrc/ir_internal_nodes.h
@@ -1498,6 +1498,10 @@ class TORCH_CUDA_CU_API IterDomain : public Val {
return getIterType() == IterType::Reduction;
}

bool isIteration() const {
return getIterType() == IterType::Iteration;
}

bool isRFactorProduct() const {
return is_rfactor_domain_;
}
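
A small illustrative use of the new predicate (the helper below is hypothetical; only isIteration() and isReduction() come from this header): it reads more directly than spelling out getIterType() == IterType::Iteration at call sites that already use isReduction().

// Hypothetical helper, for illustration only: split a domain into iteration
// and reduction IterDomains using the existing isReduction() predicate and
// the newly added isIteration().
void splitByIterType(
    const std::vector<IterDomain*>& domain,
    std::vector<IterDomain*>& iteration_ids,
    std::vector<IterDomain*>& reduction_ids) {
  for (IterDomain* id : domain) {
    if (id->isReduction()) {
      reduction_ids.push_back(id);
    } else if (id->isIteration()) {
      iteration_ids.push_back(id);
    }
    // Broadcast and other iter types are intentionally skipped here.
  }
}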
3 changes: 2 additions & 1 deletion csrc/kernel_cache.cpp
@@ -466,7 +466,8 @@ std::vector<at::Tensor> FusionKernelRuntime::runKernelWithInput(
most_recent_executor_log_.params = scheduler_entry->params()->clone();
}

- if (isDebugDumpEnabled(DebugDumpOption::PerfDebugVerbose)) {
+ if (isDebugDumpEnabled(DebugDumpOption::PerfDebugVerbose) ||
+     measure_kernel_time_) {
executor.setMeasureKernelTimeFlag(true);
}

5 changes: 5 additions & 0 deletions csrc/kernel_cache.h
@@ -99,6 +99,10 @@ class TORCH_CUDA_CU_API FusionKernelRuntime {
profiling_ = to_profile;
}

void setMeasureKernelTime(bool val = true) {
measure_kernel_time_ = val;
}

//! Internal knob for profiling shape inference
void disableLaunchParamCache() {
for (auto& executor : executors_) {
@@ -230,6 +234,7 @@ class TORCH_CUDA_CU_API FusionKernelRuntime {

// States for profiling support
bool profiling_ = false;
bool measure_kernel_time_ = false;

std::mutex mutex_;

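
Taken together, the kernel_cache changes add an external knob for kernel timing: setMeasureKernelTime() flips measure_kernel_time_, and runKernelWithInput() then enables per-kernel timing on each executor even without the PerfDebugVerbose dump option. A minimal usage sketch (the wrapper function is illustrative; only setMeasureKernelTime() is from this diff):

// Illustrative only: turn on kernel timing for a runtime before running it.
// Afterwards, runKernelWithInput() calls executor.setMeasureKernelTimeFlag(true)
// for every segment, regardless of DebugDumpOption::PerfDebugVerbose.
void enableKernelTiming(nvfuser::FusionKernelRuntime* runtime) {
  runtime->setMeasureKernelTime(true);
}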
20 changes: 20 additions & 0 deletions csrc/maxinfo_propagator.h
@@ -280,4 +280,24 @@ class TORCH_CUDA_CU_API SetSelector : public MaxInfoSpanningTree::Selector {
}
};

// Simple selector to allow different parallel patterns in the fusion.
// The propagation is blocked at boundaryNodesSet.
// For P2C forward propagate, disable propagation to tensorViews in
// boundaryNodesSet. For C2P backward propagate, disable propagation from
// tensorViews in boundaryNodesSet
struct InternalBoundarySelector : public MaxInfoSpanningTree::Selector {
std::unordered_set<TensorView*> tvs_;
virtual bool allowC2P(TensorView* from, TensorView* to) override {
return tvs_.count(from) == 0;
};
virtual bool allowP2C(TensorView* from, TensorView* to) override {
return tvs_.count(to) == 0;
};
virtual bool allowSibling(TensorView* from, TensorView* to) override {
return true;
}
InternalBoundarySelector(const std::unordered_set<TensorView*>& tvs)
: tvs_(tvs) {}
};

} // namespace nvfuser
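
A sketch of how InternalBoundarySelector is presumably intended to be used: pass it to the spanning tree so that transform propagation from a reference tensor stops at the boundary tensors, leaving the region beyond them free to keep a different parallel pattern. TransformPropagator and MaxRootDomainInfoSpanningTree are existing nvFuser classes; the wrapper function itself is illustrative.

// Illustrative only: propagate the reference tensor's transformations to the
// rest of the fusion, but do not cross the boundary tensors.
void propagateUpToBoundary(
    TensorView* reference_tv,
    const std::unordered_set<TensorView*>& boundary_tvs) {
  InternalBoundarySelector selector(boundary_tvs);
  TransformPropagator propagator(reference_tv);
  MaxRootDomainInfoSpanningTree(reference_tv, &selector).traverse(&propagator);
}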
