csarofeen · naoyam · May 10, 2022 · May 10, 2022 · May 11, 2022 · May 12, 2022
diff --git a/torch/csrc/jit/codegen/cuda/ir_interface_nodes.h b/torch/csrc/jit/codegen/cuda/ir_interface_nodes.h
@@ -468,7 +468,9 @@ class TORCH_CUDA_CU_API TensorView : public Val {
     domain_ = td;
   }
 
+ public:
   void setComputeAt(unsigned int this_pos, bool decrease = false);
+
 
   void setMaxProducer(unsigned int this_pos, bool decrease = false);
 

diff --git a/torch/csrc/jit/codegen/cuda/runtime/welford.cu b/torch/csrc/jit/codegen/cuda/runtime/welford.cu
@@ -15,7 +15,12 @@ __inline__ __device__ void welfordCombine(
     return;
   }
   TN ab_N = a_N + b_N;
+#if 1
   T b_N_div_ab_N = ((T)(nvfuser_index_t)(b_N)) / ((T)(nvfuser_index_t)(ab_N));
+#else
+  // Special-casing a_N == b_N to skip the division gave no perf change.
+  T b_N_div_ab_N = a_N == b_N ? 0.5f : (((T)(nvfuser_index_t)(b_N)) / ((T)(nvfuser_index_t)(ab_N)));
+#endif
   T delta = b_avg - a_avg;
   a_avg += delta * b_N_div_ab_N;
   a_M2 += b_M2 + delta * delta * ((T)(nvfuser_index_t)(a_N)) * b_N_div_ab_N;
@@ -350,10 +355,17 @@ __device__ void gridWelford(
     grid_sync::sync<X_BLOCK, Y_BLOCK, Z_BLOCK, PERSISTENT_REDUCTION>(
         sync_flags[idx_in_grid_segment], grid_reduction_segment_size);
   } else {
+#if 0
     grid_sync::sync<X_BLOCK, Y_BLOCK, Z_BLOCK>(
         sync_flags[idx_in_grid_segment],
         grid_reduction_segment_size,
         n_entrances_);
+#else
+    // Assumes sync_flags has a separate block of grid_segment_size flags for each call, indexed by entrance_ind_.
+    grid_sync::sync<X_BLOCK, Y_BLOCK, Z_BLOCK, PERSISTENT_REDUCTION>(
+        sync_flags[entrance_ind_ * grid_segment_size + idx_in_grid_segment],
+        grid_reduction_segment_size);
+#endif
   }
 
   bool last_block =
-Original file line number
+Diff line change
@@ Expand Up / @@ -468,7 +468,9 @@ class TORCH_CUDA_CU_API TensorView : public Val { @@
         domain_ = td;
       }
+     public:
       void setComputeAt(unsigned int this_pos, bool decrease = false);
       void setMaxProducer(unsigned int this_pos, bool decrease = false);
@@ Expand Down @@