From 89b9128cfefe017eba9dd770d319109aa8c6b4e4 Mon Sep 17 00:00:00 2001 From: Luca Wehrstedt Date: Tue, 2 Jul 2024 11:51:52 +0000 Subject: [PATCH] Update CUTLASS to v3.5.0 ghstack-source-id: 887e806ca45188df99edb4265efab1aae1714608 Pull Request resolved: https://github.com/fairinternal/xformers/pull/1147 __original_commit__ = fairinternal/xformers@6f0c5d038038f289a6d0a011d0abe62c350ac20e --- .github/workflows/conda.yml | 2 +- .github/workflows/wheels_build.yml | 2 +- setup.py | 6 +++++- third_party/cutlass | 2 +- .../csrc/attention/cuda/fmha/kernel_backward.h | 2 +- xformers/csrc/sparse24/compute_sparse_tile.h | 6 +++--- xformers/csrc/sparse24/sparse24_pack.h | 16 ++++++++-------- 7 files changed, 20 insertions(+), 16 deletions(-) diff --git a/.github/workflows/conda.yml b/.github/workflows/conda.yml index 32f970c268..2d7483998d 100644 --- a/.github/workflows/conda.yml +++ b/.github/workflows/conda.yml @@ -19,7 +19,7 @@ on: env: # you need at least cuda 5.0 for some of the stuff compiled here. - TORCH_CUDA_ARCH_LIST: "5.0+PTX 6.0 6.1 7.0 7.5 8.0+PTX" + TORCH_CUDA_ARCH_LIST: "6.0+PTX 6.1 7.0 7.5 8.0+PTX" MAX_JOBS: 3 # Avoids OOMs XFORMERS_BUILD_TYPE: "Release" XFORMERS_PACKAGE_FROM: "conda-${{ github.ref_name }}" diff --git a/.github/workflows/wheels_build.yml b/.github/workflows/wheels_build.yml index 0b398822b3..68ab3bf444 100644 --- a/.github/workflows/wheels_build.yml +++ b/.github/workflows/wheels_build.yml @@ -26,7 +26,7 @@ on: env: # you need at least cuda 5.0 for some of the stuff compiled here. - TORCH_CUDA_ARCH_LIST: "5.0+PTX 6.0 6.1 7.0 7.5 8.0+PTX" + TORCH_CUDA_ARCH_LIST: "6.0+PTX 6.1 7.0 7.5 8.0+PTX" MAX_JOBS: 4 DISTUTILS_USE_SDK: 1 # otherwise distutils will complain on windows about multiple versions of msvc XFORMERS_BUILD_TYPE: "Release" diff --git a/setup.py b/setup.py index 327e1f7df6..c8b78dd1f7 100644 --- a/setup.py +++ b/setup.py @@ -316,7 +316,9 @@ def get_extensions(): extra_compile_args = {"cxx": ["-O3", "-std=c++17"]} if sys.platform == "win32": define_macros += [("xformers_EXPORTS", None)] - extra_compile_args["cxx"].extend(["/MP", "/Zc:lambda", "/Zc:preprocessor"]) + extra_compile_args["cxx"].extend( + ["/MP", "/Zc:lambda", "/Zc:preprocessor", "/Zc:__cplusplus"] + ) elif "OpenMP not found" not in torch.__config__.parallel_info(): extra_compile_args["cxx"].append("-fopenmp") @@ -360,6 +362,8 @@ def get_extensions(): "/Zc:lambda", "-Xcompiler", "/Zc:preprocessor", + "-Xcompiler", + "/Zc:__cplusplus", ] extra_compile_args["nvcc"] = nvcc_flags diff --git a/third_party/cutlass b/third_party/cutlass index e0aaa3c3b3..7d49e6c7e2 160000 --- a/third_party/cutlass +++ b/third_party/cutlass @@ -1 +1 @@ -Subproject commit e0aaa3c3b38db9a89c31f04fef91e92123ad5e2e +Subproject commit 7d49e6c7e2f8896c47f586706e67e1fb215529dc diff --git a/xformers/csrc/attention/cuda/fmha/kernel_backward.h b/xformers/csrc/attention/cuda/fmha/kernel_backward.h index 9ed0e6ba49..41eb985c01 100644 --- a/xformers/csrc/attention/cuda/fmha/kernel_backward.h +++ b/xformers/csrc/attention/cuda/fmha/kernel_backward.h @@ -1441,7 +1441,7 @@ struct AttentionBackwardKernel { uint8_t lane_id) { cutlass::Array dropout_keep_mask_doivj; - dropout_keep_mask_doivj.fill(1); + dropout_keep_mask_doivj.fill(cutlass::uint1b_t{1}); const float dropout_scale = kApplyDropout ? 1.0 / (1.0 - p.dropout_prob) : 1.0f; diff --git a/xformers/csrc/sparse24/compute_sparse_tile.h b/xformers/csrc/sparse24/compute_sparse_tile.h index 224998ec41..464746321c 100644 --- a/xformers/csrc/sparse24/compute_sparse_tile.h +++ b/xformers/csrc/sparse24/compute_sparse_tile.h @@ -85,8 +85,8 @@ struct LargestValuesGreedy { for (int j = 0; j < 4; ++j) { TileValueOrdered& v = values_ordered[i * 4 + j]; v.parts.value = values.at(i, j).get(); - v.parts.col = j; - v.parts.row = i; + v.parts.col = uint2b_t{j}; + v.parts.row = uint2b_t{i}; } } // Use a sorting network (aka without branches) to avoid @@ -150,7 +150,7 @@ struct Causal1122 { for (int col = 0; col < 4; ++col) { TileValueOrdered& v = values_ordered[col]; v.parts.value = values.at(row, col).get(); - v.parts.col = col; + v.parts.col = uint2b_t{col}; } // Use a sorting network (aka without branches) to avoid // warp divergence diff --git a/xformers/csrc/sparse24/sparse24_pack.h b/xformers/csrc/sparse24/sparse24_pack.h index 2158966b97..7ca874d30e 100644 --- a/xformers/csrc/sparse24/sparse24_pack.h +++ b/xformers/csrc/sparse24/sparse24_pack.h @@ -77,9 +77,9 @@ warp_shuffle_meta(uint32_t meta_ab, bool transposed = false) { uint8b_t(meta_ab >> (8 * (thread_left + 2)))}; // shfl t0-t4 / t1-t5 stage0_data[0] = - __shfl_xor_sync(0xffffffff, stage0_data[0], transposed ? 1 : 4); + uint8b_t{__shfl_xor_sync(0xffffffff, stage0_data[0], transposed ? 1 : 4)}; stage0_data[1] = - __shfl_xor_sync(0xffffffff, stage0_data[1], transposed ? 1 : 4); + uint8b_t{__shfl_xor_sync(0xffffffff, stage0_data[1], transposed ? 1 : 4)}; uint16_t line0 = int(uint8b_t(meta_ab >> (8 * (1 - thread_left)))) << ((1 - thread_left) * 8); @@ -229,24 +229,24 @@ struct KernelTypes { // We know that col0 is always packed to position 0 if it's there // and col1 is packed to pos 0 or 1 (depending if col0 is selected) if (isSelected(1)) { - packValue(0, 1); + packValue(uint2b_t{0}, uint2b_t{1}); } if (isSelected(0)) { - packValue(0, 0); + packValue(uint2b_t{0}, uint2b_t{0}); } if (isSelected(0) && isSelected(1)) { - packValue(1, 1); + packValue(uint2b_t{1}, uint2b_t{1}); } // Process cols 2/3 // same sort of heuristic if (isSelected(2)) { - packValue(1, 2); + packValue(uint2b_t{1}, uint2b_t{2}); } if (isSelected(3)) { - packValue(1, 3); + packValue(uint2b_t{1}, uint2b_t{3}); } if (isSelected(2) && isSelected(3)) { - packValue(0, 2); + packValue(uint2b_t{0}, uint2b_t{2}); } int add_mask = (col0_from | (col1_from << 2)) << (8 * row + meta_pos); meta |= add_mask;