Llu/ln bwd (#207)
Combines the inner and outer reductions into one kernel in cases where at least one input is used by both the inner and outer reductions, e.g. layer norm backward.
liqiangxl authored Apr 25, 2023
1 parent 8884858 commit 0250132
Showing 19 changed files with 2,311 additions and 176 deletions.
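
For orientation, below is a minimal standalone sketch (illustrative only, not code from this PR) of why layer norm backward naturally mixes the two reduction types this scheduler change fuses: the per-row (inner) sums that feed the input gradient and the per-column (outer) sums that feed the weight and bias gradients both read the same grad_output tensor, so computing them in one kernel avoids a second pass over that tensor.

// Illustrative only: grad_out/x_hat are read once while both reduction
// patterns are accumulated -- the sharing the combined scheduler exploits.
#include <cstdio>
#include <vector>

int main() {
  const int N = 4, D = 8; // N rows (batch), D features per row
  std::vector<float> grad_out(N * D, 1.0f);
  std::vector<float> x_hat(N * D, 0.5f); // normalized input, assumed given

  // Outer reductions: reduce over the batch axis, one value per feature.
  std::vector<float> dgamma(D, 0.0f), dbeta(D, 0.0f);
  // Inner reductions: reduce over the feature axis, one value per row
  // (these per-row sums are what the input-gradient formula consumes).
  std::vector<float> sum_go(N, 0.0f), sum_go_xhat(N, 0.0f);

  for (int n = 0; n < N; ++n) {
    for (int d = 0; d < D; ++d) {
      const float go = grad_out[n * D + d];
      const float xh = x_hat[n * D + d];
      dgamma[d] += go * xh; // outer reduction
      dbeta[d] += go;       // outer reduction
      sum_go[n] += go;           // inner reduction
      sum_go_xhat[n] += go * xh; // inner reduction
    }
  }
  std::printf("dbeta[0]=%f, sum_go[0]=%f\n", dbeta[0], sum_go[0]);
  return 0;
}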
1 change: 1 addition & 0 deletions CMakeLists.txt
@@ -357,6 +357,7 @@ if(BUILD_TEST)
${NVFUSER_ROOT}/test/test_gpu_gather_ops.cpp
${NVFUSER_ROOT}/test/test_gpu_multidevice.cpp
${NVFUSER_ROOT}/test/test_multicluster_fusion.cpp
${NVFUSER_ROOT}/test/test_combined_inner_outer_reduction.cpp
)
list(APPEND JIT_TEST_CU_SRCS ${NVFUSER_ROOT}/test/test_gpu_rng.cu)

25 changes: 5 additions & 20 deletions csrc/executor_utils.cpp
@@ -998,26 +998,11 @@ c10::optional<int> getMaxRegCount(
// If the block size is known, set the maximum that at least allows
// one block to be resident on an SM
if (opt_block_size.has_value() && opt_block_size.value() > 0) {
- int num_partition = 0;
- int reg_allocation_granularity = 0;
- const auto prop = at::cuda::getCurrentDeviceProperties();
- cudaOccDeviceProp occ_prop(*prop);
- cudaOccSubPartitionsPerMultiprocessor(&num_partition, &occ_prop);
- cudaOccRegAllocationGranularity(&reg_allocation_granularity, &occ_prop);
- int warp_size = prop->warpSize;
- int64_t num_warps = ceilDiv(opt_block_size.value(), warp_size);
-
- // warps could be distributed unevenly across partitions
- int64_t max_warps_per_sm_partition = ceilDiv(num_warps, num_partition);
- // registers are evenly distributed across partitions, partition with most
- // warps determines the maximum register available per warp
- int max_reg_per_warp =
-     prop->regsPerBlock / num_partition / (int)max_warps_per_sm_partition;
- // clamp down to register allocation granularity at warp level
- int effective_max_reg_per_warp = max_reg_per_warp /
-     reg_allocation_granularity * reg_allocation_granularity;
- max_register =
-     std::min(max_register_limit, effective_max_reg_per_warp / warp_size);
+ constexpr int block_per_sm = 1;
+ max_register = std::min(
+     max_register_limit,
+     (int)getRegPerThreadGivenThreadsPerSM(
+         opt_block_size.value() * block_per_sm));
}

// If a heuristic value is given, i.e., max_register_heuristic is
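
The removed block computed the per-warp register budget by hand from occupancy properties; the new code delegates that to getRegPerThreadGivenThreadsPerSM, which is defined elsewhere in the codebase and not shown in this diff. As a rough sketch only, assuming the helper performs occupancy arithmetic similar to what it replaces, it might look something like this (the name, parameters, and granularity handling below are assumptions, not the actual implementation):

#include <algorithm>

// Assumed sketch, not the actual nvFuser helper: given the number of threads
// that should be resident on an SM, return the largest per-thread register
// count that still lets them fit, rounded down to the register allocation
// granularity and clamped to the architectural per-thread maximum of 255.
int regPerThreadGivenThreadsPerSMSketch(
    int threads_per_sm,
    int regs_per_sm, // total registers available on one SM
    int reg_allocation_granularity, // registers are allocated in these units
    int warp_size = 32) {
  const int num_warps = (threads_per_sm + warp_size - 1) / warp_size;
  // Registers are handed out per warp in units of the allocation granularity.
  int regs_per_warp = regs_per_sm / num_warps;
  regs_per_warp =
      regs_per_warp / reg_allocation_granularity * reg_allocation_granularity;
  return std::min(regs_per_warp / warp_size, 255);
}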
4 changes: 4 additions & 0 deletions csrc/ir_internal_nodes.h
@@ -1498,6 +1498,10 @@ class TORCH_CUDA_CU_API IterDomain : public Val {
return getIterType() == IterType::Reduction;
}

bool isIteration() const {
return getIterType() == IterType::Iteration;
}

bool isRFactorProduct() const {
return is_rfactor_domain_;
}
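
A small illustrative use of the new predicate (the helper below is hypothetical; only isIteration() and isReduction() come from this header): it reads more directly than spelling out getIterType() == IterType::Iteration at call sites that already use isReduction().

// Hypothetical helper, for illustration only: split a domain into iteration
// and reduction IterDomains using the existing isReduction() predicate and
// the newly added isIteration().
void splitByIterType(
    const std::vector<IterDomain*>& domain,
    std::vector<IterDomain*>& iteration_ids,
    std::vector<IterDomain*>& reduction_ids) {
  for (IterDomain* id : domain) {
    if (id->isReduction()) {
      reduction_ids.push_back(id);
    } else if (id->isIteration()) {
      iteration_ids.push_back(id);
    }
    // Broadcast and other iter types are intentionally skipped here.
  }
}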
3 changes: 2 additions & 1 deletion csrc/kernel_cache.cpp
@@ -466,7 +466,8 @@ std::vector<at::Tensor> FusionKernelRuntime::runKernelWithInput(
most_recent_executor_log_.params = scheduler_entry->params()->clone();
}

- if (isDebugDumpEnabled(DebugDumpOption::PerfDebugVerbose)) {
+ if (isDebugDumpEnabled(DebugDumpOption::PerfDebugVerbose) ||
+     measure_kernel_time_) {
executor.setMeasureKernelTimeFlag(true);
}

5 changes: 5 additions & 0 deletions csrc/kernel_cache.h
@@ -99,6 +99,10 @@ class TORCH_CUDA_CU_API FusionKernelRuntime {
profiling_ = to_profile;
}

void setMeasureKernelTime(bool val = true) {
measure_kernel_time_ = val;
}

//! Internal knob for profiling shape inference
void disableLaunchParamCache() {
for (auto& executor : executors_) {
@@ -230,6 +234,7 @@ class TORCH_CUDA_CU_API FusionKernelRuntime {

// States for profiling support
bool profiling_ = false;
bool measure_kernel_time_ = false;

std::mutex mutex_;

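
Taken together, the kernel_cache changes add an external knob for kernel timing: setMeasureKernelTime() flips measure_kernel_time_, and runKernelWithInput() then enables per-kernel timing on each executor even without the PerfDebugVerbose dump option. A minimal usage sketch (the wrapper function is illustrative; only setMeasureKernelTime() is from this diff):

// Illustrative only: turn on kernel timing for a runtime before running it.
// Afterwards, runKernelWithInput() calls executor.setMeasureKernelTimeFlag(true)
// for every segment, regardless of DebugDumpOption::PerfDebugVerbose.
void enableKernelTiming(nvfuser::FusionKernelRuntime* runtime) {
  runtime->setMeasureKernelTime(true);
}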
20 changes: 20 additions & 0 deletions csrc/maxinfo_propagator.h
@@ -280,4 +280,24 @@ class TORCH_CUDA_CU_API SetSelector : public MaxInfoSpanningTree::Selector {
}
};

// Simple selector to allow different parallel patterns in the fusion.
// The propagation is blocked at boundaryNodesSet.
// For P2C forward propagate, disable propagation to tensorViews in
// boundaryNodesSet. For C2P backward propagate, disable propagation from
// tensorViews in boundaryNodesSet
struct InternalBoundarySelector : public MaxInfoSpanningTree::Selector {
std::unordered_set<TensorView*> tvs_;
virtual bool allowC2P(TensorView* from, TensorView* to) override {
return tvs_.count(from) == 0;
};
virtual bool allowP2C(TensorView* from, TensorView* to) override {
return tvs_.count(to) == 0;
};
virtual bool allowSibling(TensorView* from, TensorView* to) override {
return true;
}
InternalBoundarySelector(const std::unordered_set<TensorView*>& tvs)
: tvs_(tvs) {}
};

} // namespace nvfuser
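
A sketch of how InternalBoundarySelector is presumably intended to be used: pass it to the spanning tree so that transform propagation from a reference tensor stops at the boundary tensors, leaving the region beyond them free to keep a different parallel pattern. TransformPropagator and MaxRootDomainInfoSpanningTree are existing nvFuser classes; the wrapper function itself is illustrative.

// Illustrative only: propagate the reference tensor's transformations to the
// rest of the fusion, but do not cross the boundary tensors.
void propagateUpToBoundary(
    TensorView* reference_tv,
    const std::unordered_set<TensorView*>& boundary_tvs) {
  InternalBoundarySelector selector(boundary_tvs);
  TransformPropagator propagator(reference_tv);
  MaxRootDomainInfoSpanningTree(reference_tv, &selector).traverse(&propagator);
}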
