Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Llu/ln bwd #207

Merged
merged 4 commits into from
Apr 25, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -357,6 +357,7 @@ if(BUILD_TEST)
${NVFUSER_ROOT}/test/test_gpu_gather_ops.cpp
${NVFUSER_ROOT}/test/test_gpu_multidevice.cpp
${NVFUSER_ROOT}/test/test_multicluster_fusion.cpp
${NVFUSER_ROOT}/test/test_combined_inner_outer_reduction.cpp
)
list(APPEND JIT_TEST_CU_SRCS ${NVFUSER_ROOT}/test/test_gpu_rng.cu)

Expand Down
25 changes: 5 additions & 20 deletions csrc/executor_utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -998,26 +998,11 @@ c10::optional<int> getMaxRegCount(
// If the block size is known, set the maximum that at least allows
// one block to be resident on an SM
if (opt_block_size.has_value() && opt_block_size.value() > 0) {
int num_partition = 0;
int reg_allocation_granularity = 0;
const auto prop = at::cuda::getCurrentDeviceProperties();
cudaOccDeviceProp occ_prop(*prop);
cudaOccSubPartitionsPerMultiprocessor(&num_partition, &occ_prop);
cudaOccRegAllocationGranularity(&reg_allocation_granularity, &occ_prop);
int warp_size = prop->warpSize;
int64_t num_warps = ceilDiv(opt_block_size.value(), warp_size);

// warps could be distributed unevenly across partition
int64_t max_warps_per_sm_partition = ceilDiv(num_warps, num_partition);
// registers are evenly distributed across partitions, partition with most
// wraps determins the maximum register available per warp
int max_reg_per_warp =
prop->regsPerBlock / num_partition / (int)max_warps_per_sm_partition;
// clamp down to register allocation granularity at warp level
int effective_max_reg_per_warp = max_reg_per_warp /
reg_allocation_granularity * reg_allocation_granularity;
max_register =
std::min(max_register_limit, effective_max_reg_per_warp / warp_size);
constexpr int block_per_sm = 1;
max_register = std::min(
max_register_limit,
(int)getRegPerThreadGivenThreadsPerSM(
opt_block_size.value() * block_per_sm));
}

// If a heuristic value is given, i.e., max_register_heuristic is
Expand Down
4 changes: 4 additions & 0 deletions csrc/ir_internal_nodes.h
Original file line number Diff line number Diff line change
Expand Up @@ -1498,6 +1498,10 @@ class TORCH_CUDA_CU_API IterDomain : public Val {
return getIterType() == IterType::Reduction;
}

bool isIteration() const {
  // True when this IterDomain is a plain (non-reduction, non-broadcast)
  // iteration axis.
  return IterType::Iteration == getIterType();
}

// Whether this IterDomain was produced by an rfactor transformation.
bool isRFactorProduct() const {
return is_rfactor_domain_;
}
Expand Down
3 changes: 2 additions & 1 deletion csrc/kernel_cache.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -466,7 +466,8 @@ std::vector<at::Tensor> FusionKernelRuntime::runKernelWithInput(
most_recent_executor_log_.params = scheduler_entry->params()->clone();
}

if (isDebugDumpEnabled(DebugDumpOption::PerfDebugVerbose)) {
if (isDebugDumpEnabled(DebugDumpOption::PerfDebugVerbose) ||
measure_kernel_time_) {
executor.setMeasureKernelTimeFlag(true);
}

Expand Down
5 changes: 5 additions & 0 deletions csrc/kernel_cache.h
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,10 @@ class TORCH_CUDA_CU_API FusionKernelRuntime {
profiling_ = to_profile;
}

//! Request that kernel execution time be measured on subsequent runs
//! (stored in measure_kernel_time_; consumed when launching executors).
void setMeasureKernelTime(bool val = true) {
measure_kernel_time_ = val;
}

//! Internal knob for profiling shape inference
void disableLaunchParamCache() {
for (auto& executor : executors_) {
Expand Down Expand Up @@ -230,6 +234,7 @@ class TORCH_CUDA_CU_API FusionKernelRuntime {

// States for profiling support
bool profiling_ = false;
bool measure_kernel_time_ = false;

std::mutex mutex_;

Expand Down
20 changes: 20 additions & 0 deletions csrc/maxinfo_propagator.h
Original file line number Diff line number Diff line change
Expand Up @@ -280,4 +280,24 @@ class TORCH_CUDA_CU_API SetSelector : public MaxInfoSpanningTree::Selector {
}
};

// Simple selector to allow different parallel patterns in the fusion.
// Propagation is blocked at the given boundary set:
// - P2C (forward) propagation is disallowed *into* a boundary TensorView.
// - C2P (backward) propagation is disallowed *from* a boundary TensorView.
// - Sibling propagation is always allowed.
struct InternalBoundarySelector : public MaxInfoSpanningTree::Selector {
  // TensorViews acting as propagation boundaries.
  std::unordered_set<TensorView*> tvs_;

  InternalBoundarySelector(const std::unordered_set<TensorView*>& tvs)
      : tvs_(tvs) {}

  // Allow consumer-to-producer propagation unless it originates at a boundary.
  bool allowC2P(TensorView* from, TensorView* to) override {
    return tvs_.count(from) == 0;
  }

  // Allow producer-to-consumer propagation unless it targets a boundary.
  bool allowP2C(TensorView* from, TensorView* to) override {
    return tvs_.count(to) == 0;
  }

  // Sibling propagation is never blocked.
  bool allowSibling(TensorView* from, TensorView* to) override {
    return true;
  }
};

} // namespace nvfuser
Loading