Commit 68278d2: patch from old repo
liqiangxl committed Apr 22, 2023
1 parent 73f4ae9
Showing 19 changed files with 2,300 additions and 176 deletions.
1 change: 1 addition & 0 deletions CMakeLists.txt
@@ -355,6 +355,7 @@ if(BUILD_TEST)
  ${NVFUSER_ROOT}/test/test_gpu_gather_ops.cpp
  ${NVFUSER_ROOT}/test/test_gpu_multidevice.cpp
  ${NVFUSER_ROOT}/test/test_multicluster_fusion.cpp
+ ${NVFUSER_ROOT}/test/test_gpu_combined_inner_outer_reduction.cpp
)
list(APPEND JIT_TEST_CU_SRCS ${NVFUSER_ROOT}/test/test_gpu_rng.cu)

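The new source file is compiled into the existing JIT test binary. A minimal sketch of what a test registered there might look like, following nvfuser's usual gtest conventions (the NVFuserTest fixture is assumed to be declared in the test utilities; the test name and body are purely illustrative, not the actual contents of the new file):

    #include <gtest/gtest.h>

    namespace nvfuser {

    TEST_F(NVFuserTest, FusionCombinedInnerOuterReduction_CUDA) {
      // build a fusion with an inner and an outer reduction over a shared
      // input, schedule it, run it, and validate against an ATen reference
    }

    } // namespace nvfuser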
25 changes: 5 additions & 20 deletions csrc/executor_utils.cpp
@@ -998,26 +998,11 @@ c10::optional<int> getMaxRegCount(
  // If the block size is known, set the maximum that at least allows
  // one block to be resident on an SM
  if (opt_block_size.has_value() && opt_block_size.value() > 0) {
-    int num_partition = 0;
-    int reg_allocation_granularity = 0;
-    const auto prop = at::cuda::getCurrentDeviceProperties();
-    cudaOccDeviceProp occ_prop(*prop);
-    cudaOccSubPartitionsPerMultiprocessor(&num_partition, &occ_prop);
-    cudaOccRegAllocationGranularity(&reg_allocation_granularity, &occ_prop);
-    int warp_size = prop->warpSize;
-    int64_t num_warps = ceilDiv(opt_block_size.value(), warp_size);
-
-    // warps could be distributed unevenly across partitions
-    int64_t max_warps_per_sm_partition = ceilDiv(num_warps, num_partition);
-    // registers are evenly distributed across partitions; the partition with
-    // the most warps determines the maximum registers available per warp
-    int max_reg_per_warp =
-        prop->regsPerBlock / num_partition / (int)max_warps_per_sm_partition;
-    // clamp down to the register allocation granularity at warp level
-    int effective_max_reg_per_warp = max_reg_per_warp /
-        reg_allocation_granularity * reg_allocation_granularity;
-    max_register =
-        std::min(max_register_limit, effective_max_reg_per_warp / warp_size);
+    constexpr int block_per_sm = 1;
+    max_register = std::min(
+        max_register_limit,
+        (int)getRegPerThreadGivenThreadsPerSM(
+            opt_block_size.value() * block_per_sm));
  }

  // If a heuristic value is given, i.e., max_register_heuristic is
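The deleted block derived a per-warp register budget by hand from the CUDA occupancy API; the replacement delegates that arithmetic to getRegPerThreadGivenThreadsPerSM. A rough sketch of what such a helper computes, assuming a 64K-register SM, a per-thread allocation granularity of 8, and the 255-register hardware cap (the first two values are assumptions, not the function's actual implementation):

    #include <algorithm>
    #include <cstdint>

    int64_t regPerThreadGivenThreadsPerSM(int64_t threads_per_sm) {
      const int64_t regs_per_sm = 64 * 1024; // assumed register file size
      const int64_t granularity = 8;         // assumed allocation granularity
      // split the register file across the requested threads
      int64_t regs = regs_per_sm / threads_per_sm;
      // round down to the allocation granularity
      regs = regs / granularity * granularity;
      // hardware caps a thread at 255 registers
      return std::min<int64_t>(regs, 255);
    }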
4 changes: 4 additions & 0 deletions csrc/ir_internal_nodes.h
@@ -1498,6 +1498,10 @@ class TORCH_CUDA_CU_API IterDomain : public Val {
    return getIterType() == IterType::Reduction;
  }

+  bool isIteration() const {
+    return getIterType() == IterType::Iteration;
+  }
+
  bool isRFactorProduct() const {
    return is_rfactor_domain_;
  }
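The new isIteration() predicate mirrors the existing isReduction(). A minimal hypothetical use, collecting the plain iteration axes of a tensor (the tv->domain()->domain() leaf-domain accessor and the elided includes are assumptions):

    // hypothetical helper: collect the non-reduction, non-broadcast axes
    std::vector<IterDomain*> iterationAxes(TensorView* tv) {
      std::vector<IterDomain*> iter_ids;
      for (IterDomain* id : tv->domain()->domain()) { // leaf domain (assumed accessor)
        if (id->isIteration()) {
          iter_ids.push_back(id);
        }
      }
      return iter_ids;
    }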
3 changes: 2 additions & 1 deletion csrc/kernel_cache.cpp
@@ -466,7 +466,8 @@ std::vector<at::Tensor> FusionKernelRuntime::runKernelWithInput(
    most_recent_executor_log_.params = scheduler_entry->params()->clone();
  }

-  if (isDebugDumpEnabled(DebugDumpOption::PerfDebugVerbose)) {
+  if (isDebugDumpEnabled(DebugDumpOption::PerfDebugVerbose) ||
+      measure_kernel_time_) {
    executor.setMeasureKernelTimeFlag(true);
  }
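With the added condition, setMeasureKernelTimeFlag(true) is reached whenever measure_kernel_time_ is set, not only under the PerfDebugVerbose dump option. As an illustrative sketch only (not the FusionExecutor's actual code), this kind of kernel timing is typically implemented with CUDA events around the launch:

    #include <cuda_runtime.h>

    #include <functional>

    float timeKernelMs(const std::function<void()>& launch_kernel) {
      cudaEvent_t start_event, stop_event;
      cudaEventCreate(&start_event);
      cudaEventCreate(&stop_event);
      cudaEventRecord(start_event);
      launch_kernel(); // the compiled fusion kernel launch
      cudaEventRecord(stop_event);
      cudaEventSynchronize(stop_event);
      float kernel_time_ms = 0.0f;
      cudaEventElapsedTime(&kernel_time_ms, start_event, stop_event);
      cudaEventDestroy(start_event);
      cudaEventDestroy(stop_event);
      return kernel_time_ms;
    }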
5 changes: 5 additions & 0 deletions csrc/kernel_cache.h
@@ -99,6 +99,10 @@ class TORCH_CUDA_CU_API FusionKernelRuntime {
    profiling_ = to_profile;
  }

+  void setMeasureKernelTime(bool val = true) {
+    measure_kernel_time_ = val;
+  }
+
  //! Internal knob for profiling shape inference
  void disableLaunchParamCache() {
    for (auto& executor : executors_) {
@@ -230,6 +234,7 @@

  // States for profiling support
  bool profiling_ = false;
+  bool measure_kernel_time_ = false;

  std::mutex mutex_;
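A hypothetical caller of the new knob (the runtime pointer is assumed to come from the executor cache; lookup elided):

    // enable per-kernel timing on an already-compiled runtime
    void enableKernelTiming(FusionKernelRuntime* runtime) {
      runtime->setMeasureKernelTime(); // val defaults to true
      // subsequent runKernelWithInput calls now turn on the executor's
      // kernel-time measurement even without PerfDebugVerbose
    }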
20 changes: 20 additions & 0 deletions csrc/maxinfo_propagator.h
@@ -280,4 +280,24 @@ class TORCH_CUDA_CU_API SetSelector : public MaxInfoSpanningTree::Selector {
  }
};

+// Simple selector that allows different parallel patterns in the fusion.
+// Propagation is blocked at the TensorViews in boundaryNodesSet: for P2C
+// (forward) propagation, propagation to a TensorView in boundaryNodesSet is
+// disabled; for C2P (backward) propagation, propagation from a TensorView
+// in boundaryNodesSet is disabled.
+struct InternalBoundarySelector : public MaxInfoSpanningTree::Selector {
+  std::unordered_set<TensorView*> tvs_;
+  bool allowC2P(TensorView* from, TensorView* to) override {
+    return tvs_.count(from) == 0;
+  }
+  bool allowP2C(TensorView* from, TensorView* to) override {
+    return tvs_.count(to) == 0;
+  }
+  bool allowSibling(TensorView* from, TensorView* to) override {
+    return true;
+  }
+  InternalBoundarySelector(const std::unordered_set<TensorView*>& tvs)
+      : tvs_(tvs) {}
+};
+
} // namespace nvfuser
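A sketch of how the new selector would plug into the existing propagation utilities (reference_tv and the boundary set are assumed to exist; MaxRootDomainInfoSpanningTree and TransformPropagator are nvfuser's existing traversal helpers):

    void propagateWithinBoundary(
        TensorView* reference_tv,
        const std::unordered_set<TensorView*>& boundary) {
      InternalBoundarySelector selector(boundary);
      TransformPropagator propagator(reference_tv);
      MaxRootDomainInfoSpanningTree(reference_tv, &selector)
          .traverse(&propagator);
      // transforms on reference_tv spread through the fusion but stop at
      // the boundary tensors, which can keep a different parallel pattern
    }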