[PROTOTYPE] Inner heuristics refactor #1695

Open · wants to merge 3 commits into base: devel

Changes from all commits
299 changes: 262 additions & 37 deletions benchmarks/cpp/nvfuser/timm.cpp

Large diffs are not rendered by default.

7 changes: 6 additions & 1 deletion torch/csrc/jit/codegen/cuda/codegen.cpp
@@ -1342,6 +1342,8 @@ class CudaKernelGenerator : private OptOutConstDispatch {
}
// Init val
func_args.arg(genCall(data_type, genInline(grop->init())));
func_args.arg(genInline(grop->entrance_index()));
func_args.arg(genInline(grop->entrances()));

indent() << "reduction::gridReduce<" << template_args << ">(\n";
indent() << kTab << func_args << ");\n";
@@ -1658,7 +1660,10 @@ class CudaKernelGenerator : private OptOutConstDispatch {
indent() << kTab << read_pred << ",\n";
}
// TODO : init value support or remove.
indent() << kTab << data_type << "(0));\n";
indent() << kTab << data_type << "(0),\n";
indent() << kTab << genInline(gwop->entrance_index()) << ",\n";
indent() << kTab << genInline(gwop->entrances());
code_ << ");\n";
}

void generateGridAllreduce(const kir::GridWelford* gwop) {
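The generated gridReduce / gridWelford calls now receive two extra arguments appended after the init value: the entrance index and the total entrance count. As a rough illustration of why the callee wants them, here is a standalone C++ sketch (not the nvfuser runtime; the struct, names, and buffer layout are assumptions) of how a per-entrance slice of the work buffer and a per-entrance semaphore could be selected:

#include <cstdint>
#include <vector>

// Illustrative only: models the offset arithmetic implied by the two new
// arguments. In the real kernel the buffers live in global memory and the
// reduction itself is cooperative across blocks.
template <typename T>
struct GridReduceBuffersSketch {
  std::vector<T> work;        // sized n_entrances * slots_per_entrance
  std::vector<int64_t> flags; // one semaphore per entrance
  int64_t slots_per_entrance = 0;

  // Work-buffer region reserved for this entrance of the reduction.
  T* workFor(int64_t entrance_index) {
    return work.data() + entrance_index * slots_per_entrance;
  }
  // Semaphore reserved for this entrance.
  int64_t* flagFor(int64_t entrance_index) {
    return flags.data() + entrance_index;
  }
};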
14 changes: 7 additions & 7 deletions torch/csrc/jit/codegen/cuda/kernel.cpp
@@ -115,15 +115,17 @@ class KernelIrScanner : private IrVisitor {
void handle(GridWelford* grid_welford) final {
summary_.has_welford = true;
summary_.has_grid_welford = true;
const auto dom =
grid_welford->welford_op()->out()->as<TensorIndex>()->view()->domain();
updateGridReductionInLoop(dom);
summary_.has_grid_reductions = true;
if (grid_welford->welford_op()->isAllreduce()) {
summary_.has_cooperative_grid_reduction = true;
}
}

void handle(GridReduction* grid_reduction) final {
summary_.has_grid_reductions = true;
const auto dom = ir_utils::getTvOutput(grid_reduction)->domain();
updateGridReductionInLoop(dom);
if (grid_reduction->isAllreduce()) {
summary_.has_cooperative_grid_reduction = true;
}
}

void handle(GroupedGridReduction* grid_reduction) final {
@@ -156,8 +158,6 @@ class KernelIrScanner : private IrVisitor {

private:
void updateGridReductionInLoop(TensorDomain* dom) {
summary_.has_grid_reductions = true;

for (const auto i : c10::irange(dom->nDims())) {
const auto id = GpuLower::current()->caMap()->getConcreteMappedID(
dom->domain()[i], IdMappingMode::LOOP);
14 changes: 11 additions & 3 deletions torch/csrc/jit/codegen/cuda/kernel_ir.cpp
@@ -435,6 +435,8 @@ GridReduction::GridReduction(
Val* in,
Allocate* reduction_buffer,
Allocate* sync_buffer,
Val* entrance_index,
Val* entrances,
bool is_allreduce)
: ReductionOp(
passkey,
@@ -445,7 +447,9 @@
is_allreduce,
ExprType::GridReduction),
reduction_buffer_(reduction_buffer),
sync_buffer_(sync_buffer) {
sync_buffer_(sync_buffer),
entrance_index_(entrance_index),
entrances_(entrances) {
TORCH_INTERNAL_ASSERT(
passkey.ir_container_->isA<kir::Kernel>(),
"IR type only valid for Kernel container.");
@@ -495,13 +499,17 @@ GridWelford::GridWelford(
Allocate* var_buffer,
Allocate* avg_buffer,
Allocate* n_buffer,
Allocate* sync_buffer)
Allocate* sync_buffer,
Val* entrance_index,
Val* entrances)
: Expr(passkey, ExprType::GridWelford),
welford_op_(welford_op),
var_buffer_(var_buffer),
avg_buffer_(avg_buffer),
n_buffer_(n_buffer),
sync_buffer_(sync_buffer) {
sync_buffer_(sync_buffer),
entrance_index_(entrance_index),
entrances_(entrances) {
TORCH_INTERNAL_ASSERT(
passkey.ir_container_->isA<kir::Kernel>(),
"IR type only valid for Kernel container.");
30 changes: 29 additions & 1 deletion torch/csrc/jit/codegen/cuda/kernel_ir.h
@@ -513,6 +513,8 @@ class TORCH_CUDA_CU_API GridReduction final : public ReductionOp {
Val* in,
Allocate* reduction_buffer,
Allocate* sync_buffer,
Val* entrance_index,
Val* entrances,
bool is_fused = false);

Allocate* reduction_buffer() const {
@@ -523,6 +525,16 @@ class TORCH_CUDA_CU_API GridReduction final : public ReductionOp {
return sync_buffer_;
}

// Which instance of entering this grid reduction is this iteration?
Val* entrance_index() const {
return entrance_index_;
}

// How many times will this grid reduction be entered
Val* entrances() const {
return entrances_;
}

const ParallelTypeBitmap& threadPredicate() const {
return thread_predicate_;
}
@@ -538,6 +550,8 @@ class TORCH_CUDA_CU_API GridReduction final : public ReductionOp {
// use them, the thread predicate is held here separately from
// Expr::predicate_.
ParallelTypeBitmap thread_predicate_;
Val* entrance_index_ = nullptr;
Val* entrances_ = nullptr;
};

class TORCH_CUDA_CU_API GroupedGridReduction final : public GroupedReductionOp {
@@ -629,7 +643,9 @@ class TORCH_CUDA_CU_API GridWelford final : public Expr {
Allocate* var_buffer,
Allocate* avg_buffer,
Allocate* n_buffer,
Allocate* sync_buffer);
Allocate* sync_buffer,
Val* entrance_index,
Val* entrances);

WelfordOp* welford_op() const {
return welford_op_;
@@ -651,6 +667,16 @@ class TORCH_CUDA_CU_API GridWelford final : public Expr {
return sync_buffer_;
}

// Which instance of entering this grid reduction is this iteration?
Val* entrance_index() const {
return entrance_index_;
}

// How many times will this grid reduction be entered
Val* entrances() const {
return entrances_;
}

const ParallelTypeBitmap& threadPredicate() const {
return thread_predicate_;
}
@@ -665,6 +691,8 @@ class TORCH_CUDA_CU_API GridWelford final : public Expr {
Allocate* avg_buffer_ = nullptr;
Allocate* n_buffer_ = nullptr;
Allocate* sync_buffer_ = nullptr;
Val* entrance_index_ = nullptr;
Val* entrances_ = nullptr;
// gridReduce has template flags for thread predicates. In order to
// use them, the thread predicate is held here separately from
// Expr::predicate_.
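To make the two new fields concrete: a grid reduction nested inside serial (non-trivial, non-thread-parallel) loops runs once per iteration of those loops, and each run is one "entrance". entrances() is the total number of such runs and entrance_index() is the linear index of the current one. A standalone worked example with made-up loop extents (this is just the arithmetic, not the lowering code):

#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  // Hypothetical surrounding serial loops with extents 4 and 3, currently
  // at indices i = 2 and j = 1.
  const std::vector<int64_t> extents = {4, 3};
  const std::vector<int64_t> indices = {2, 1};

  // entrances(): total number of times the grid reduction is entered.
  int64_t entrances = 1;
  for (int64_t e : extents) {
    entrances *= e;
  }

  // entrance_index(): linear index of the current entrance
  // (Horner-style accumulation, outermost loop first).
  int64_t entrance_index = 0;
  for (size_t d = 0; d < extents.size(); ++d) {
    entrance_index = entrance_index * extents[d] + indices[d];
  }

  std::cout << "entrances = " << entrances              // 12
            << ", entrance_index = " << entrance_index  // 2 * 3 + 1 = 7
            << "\n";
  return 0;
}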
126 changes: 119 additions & 7 deletions torch/csrc/jit/codegen/cuda/lower_index.cpp
@@ -143,6 +143,7 @@ namespace {
// size. For example, FusedReduction should double the work buffer size.
Val* getGridCommWorkBufferSize(
const TensorDomain* td,
const std::vector<kir::ForLoop*>& for_loops = {},
int expansion_factor = 1) {
// The buffer size is the number of thread blocks multiplied by the
// number of threads not used for reduction domains.
@@ -172,10 +173,28 @@ Val* getGridCommWorkBufferSize(
}
buffer_size = SimplifyingIrBuilder::mulExpr(buffer_size, pt_dim);
}

// All iteration domains require a separate entry in the buffer for re-entrant
// grid reductions.
for (auto fl : for_loops) {
if (fl->isTrivial()) {
continue;
}
if (fl->iter_domain()->isThread()) {
// already accounted for.
continue;
}

buffer_size =
SimplifyingIrBuilder::mulExpr(buffer_size, fl->iter_domain()->extent());
}

return buffer_size;
}

Val* getGridSyncBufferSize(const TensorDomain* td) {
Val* getGridSyncBufferSize(
const TensorDomain* td,
const std::vector<kir::ForLoop*>& for_loops = {}) {
// See the comment above for getGridCommWorkBufferSize.
Val* buffer_size = GpuLower::current()->kernel()->oneVal();
for (auto pt : kParallelTypeBIDs) {
@@ -191,9 +210,66 @@ Val* getGridSyncBufferSize(const TensorDomain* td) {
}
buffer_size = SimplifyingIrBuilder::mulExpr(buffer_size, pt_dim);
}

// All iteration domains require a separate semaphore for re-entrant grid
// reductions
for (auto fl : for_loops) {
if (fl->isTrivial()) {
continue;
}
if (fl->iter_domain()->isThread()) {
// already accounted for.
continue;
}

buffer_size =
SimplifyingIrBuilder::mulExpr(buffer_size, fl->iter_domain()->extent());
}

return buffer_size;
}

Val* getEntranceCountGridReduce(std::vector<kir::ForLoop*>& for_loops) {
Val* grid_reduction_entrances = GpuLower::current()->kernel()->oneVal();

for (const auto loop : for_loops) {
if (loop->isTrivial()) {
continue;
}
if (loop->iter_domain()->isThread()) {
// already accounted for.
continue;
}
// TODO: Does this work for shift/gather?
grid_reduction_entrances = SimplifyingIrBuilder::mulExpr(
grid_reduction_entrances, loop->iter_domain()->extent());
}
return grid_reduction_entrances;
}

// Linear indexing of for loops for multiple entrances into grid reduce
// TODO: What happens if there's a broadcast that's resolved (not present in the
// grid reduce) but the global buffer isn't expanded?
Val* getEntranceLinIndGridReduce(std::vector<kir::ForLoop*>& for_loops) {
Val* linear_index = GpuLower::current()->kernel()->zeroVal();

for (const auto loop : for_loops) {
if (loop->isTrivial()) {
continue;
}
if (loop->iter_domain()->isThread()) {
// already accounted for.
continue;
}
// TODO: Does this work for shift/gather?
linear_index = SimplifyingIrBuilder::addExpr(
SimplifyingIrBuilder::mulExpr(
linear_index, loop->iter_domain()->extent()),
loop->index());
}
return linear_index;
}

} // namespace

void IndexLowering::handle(const ReductionOp* rop) {
@@ -271,12 +347,25 @@ void IndexLowering::handleGridReduction(

const auto reduce_buffer = ir_utils::allocGlobalBufferForGridComm(
getGridCommWorkBufferSize(
out_domain, rop->isAllreduce() && is_within_a_loop ? 2 : 1),
out_domain,
rop->isAllreduce() ? std::vector<kir::ForLoop*>() : for_loops_,
rop->isAllreduce() && is_within_a_loop ? 2 : 1),
out->dtype(),
false);

const auto sync_buffer = ir_utils::allocGlobalBufferForGridComm(
getGridSyncBufferSize(out_domain), DataType::Int, true);
getGridSyncBufferSize(
out_domain,
rop->isAllreduce() ? std::vector<kir::ForLoop*>() : for_loops_),
DataType::Int,
true);

const auto entrance_ind = rop->isAllreduce()
? GpuLower::current()->kernel()->zeroVal()
: getEntranceLinIndGridReduce(for_loops_);
const auto n_entrances = rop->isAllreduce()
? GpuLower::current()->kernel()->oneVal()
: getEntranceCountGridReduce(for_loops_);

// The thread predicate for GridReduction needs to be set
// separately from the main predicate. Do not combine them like
@@ -291,6 +380,8 @@ void IndexLowering::handleGridReduction(
in,
reduce_buffer,
sync_buffer,
entrance_ind,
n_entrances,
rop->isAllreduce());

grid_reduction->setThreadPredicate(thread_pred);
@@ -412,13 +503,14 @@ void IndexLowering::handleGridReduction(
return ir_utils::allocGlobalBufferForGridComm(
getGridCommWorkBufferSize(
out_domain,
for_loops_,
(grouped_rop->isAllreduce() && is_within_a_loop ? 2 : 1)),
output->dtype(),
false);
});

const auto sync_buffer = ir_utils::allocGlobalBufferForGridComm(
getGridSyncBufferSize(out_domain), DataType::Int, true);
getGridSyncBufferSize(out_domain, for_loops_), DataType::Int, true);

// The thread predicate for GridReduction needs to be set
// separately from the main predicate. Do not combine them like
@@ -547,7 +639,9 @@ void IndexLowering::handleGridWelford(WelfordOp* indexed_wop) {
[](IterDomain* id) { return !isTrivialIterDomain(id); });

const auto work_buffer_size = getGridCommWorkBufferSize(
out_domain, indexed_wop->isAllreduce() && is_within_a_loop ? 2 : 1);
out_domain,
indexed_wop->isAllreduce() ? std::vector<kir::ForLoop*>() : for_loops_,
indexed_wop->isAllreduce() && is_within_a_loop ? 2 : 1);

const auto out_var_buffer = ir_utils::allocGlobalBufferForGridComm(
work_buffer_size, indexed_wop->outVar()->dtype(), false);
Expand All @@ -557,7 +651,19 @@ void IndexLowering::handleGridWelford(WelfordOp* indexed_wop) {
work_buffer_size, indexed_wop->outN()->dtype(), false);

const auto sync_buffer = ir_utils::allocGlobalBufferForGridComm(
getGridSyncBufferSize(out_domain), DataType::Int, true);
getGridSyncBufferSize(
out_domain,
indexed_wop->isAllreduce() ? std::vector<kir::ForLoop*>()
: for_loops_),
DataType::Int,
true);

const auto entrance_ind = indexed_wop->isAllreduce()
? GpuLower::current()->kernel()->zeroVal()
: getEntranceLinIndGridReduce(for_loops_);
const auto n_entrances = indexed_wop->isAllreduce()
? GpuLower::current()->kernel()->oneVal()
: getEntranceCountGridReduce(for_loops_);

// The thread predicate for GridReduction needs to be set
// separately from the main predicate. Do not combine them like
@@ -566,7 +672,13 @@ void IndexLowering::handleGridWelford(WelfordOp* indexed_wop) {
GpuLower::current()->threadPredMap().getPredicatedParallelTypes(out_tv);

auto grid_welford = IrBuilder::create<kir::GridWelford>(
indexed_wop, out_var_buffer, out_avg_buffer, out_N_buffer, sync_buffer);
indexed_wop,
out_var_buffer,
out_avg_buffer,
out_N_buffer,
sync_buffer,
entrance_ind,
n_entrances);

grid_welford->setThreadPredicate(thread_pred);

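The helpers above multiply the per-entrance slot count by the extent of every non-trivial serial loop around the reduction, so each entrance gets its own region of the work buffer and its own semaphore; for allreduce the lowering instead passes zeroVal()/oneVal(), i.e. a single entrance. A small standalone C++ sketch of that sizing arithmetic (the slot counts and loop extents below are made up for illustration):

#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  // Hypothetical per-entrance sizes: work-buffer slots used by a single
  // entrance (blocks times threads not mapped to the reduction) and one
  // sync semaphore per entrance.
  const int64_t slots_per_entrance = 128 * 8;
  const int64_t semaphores_per_entrance = 1;

  // Extents of the non-trivial, non-thread-parallel loops surrounding the
  // reduction; each one scales both buffers, mirroring the loops added to
  // getGridCommWorkBufferSize / getGridSyncBufferSize.
  const std::vector<int64_t> serial_loop_extents = {4, 3};

  int64_t work_buffer_size = slots_per_entrance;
  int64_t sync_buffer_size = semaphores_per_entrance;
  for (int64_t extent : serial_loop_extents) {
    work_buffer_size *= extent;
    sync_buffer_size *= extent;
  }

  std::cout << "work buffer slots = " << work_buffer_size // 1024 * 12 = 12288
            << ", semaphores = " << sync_buffer_size      // 12
            << "\n";
  return 0;
}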