Merge pull request #4 from Flamefire/pr-to-jfgrim-pytorch-cuda-2.1.2
Enhance PyTorch-2.1.2-foss-2023a-CUDA-12.1.1.eb
jfgrimm authored Feb 13, 2024
2 parents bf06a6c + a9efd81 commit c2fb50b
Showing 8 changed files with 278 additions and 25 deletions.
PyTorch-2.1.0_disable-cudnn-tf32-for-too-strict-tests.patch (modified)
@@ -28,21 +28,6 @@ author: Alex Domingo (Vrije Universiteit Brussel)
def test_partial_flat_weights(self):
input_size = 10
hidden_size = 6
--- test/jit/test_freezing.py.orig 2024-01-15 14:38:11.054125484 +0100
+++ test/jit/test_freezing.py 2024-01-15 14:49:41.689011617 +0100
@@ -2733,7 +2733,11 @@
else:
FileCheck().check("aten::cudnn_convolution_relu").run(frozen_mod.graph)

- self.assertEqual(mod_eager(inp), frozen_mod(inp))
+ if not TEST_WITH_ROCM:
+ with torch.backends.cudnn.flags(enabled=True, allow_tf32=False):
+ self.assertEqual(mod_eager(inp), frozen_mod(inp))
+ else:
+ self.assertEqual(mod_eager(inp), frozen_mod(inp))

@unittest.skipIf(not (TEST_CUDNN or TEST_WITH_ROCM), "requires CUDNN")
def test_freeze_conv_relu_fusion_not_forward(self):
--- ../PyTorch/2.1.2/foss-2023a-CUDA-12.1.1/pytorch-v2.1.2/test/nn/test_convolution.py 2023-12-15 03:03:27.000000000 +0100
+++ test/nn/test_convolution.py 2024-01-15 15:03:15.606208376 +0100
@@ -518,7 +518,7 @@
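As background for the patch above: on Ampere-class GPUs cuDNN may execute convolutions with TF32 tensor cores, which is faster but less precise than full FP32, so strict elementwise comparisons can exceed the default tolerances. A hedged, standalone illustration (not part of the patch; the observed difference depends on the GPU):

# Standalone sketch: the same convolution run with and without cuDNN TF32.
# On pre-Ampere hardware the difference is typically exactly zero.
import torch

if torch.cuda.is_available():
    conv = torch.nn.Conv2d(3, 8, 3).cuda()
    x = torch.randn(4, 3, 32, 32, device="cuda")
    with torch.backends.cudnn.flags(enabled=True, allow_tf32=False):
        y_fp32 = conv(x)   # full FP32 accumulation in cuDNN
    with torch.backends.cudnn.flags(enabled=True, allow_tf32=True):
        y_tf32 = conv(x)   # may use TF32 tensor cores
    print((y_fp32 - y_tf32).abs().max())  # can exceed 1e-5 on Ampere GPUs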
PyTorch-2.1.2-foss-2023a-CUDA-12.1.1.eb (modified)
@@ -31,6 +31,7 @@ patches = [
'PyTorch-2.0.1_skip-test_shuffle_reproducibility.patch',
'PyTorch-2.0.1_skip-tests-skipped-in-subprocess.patch',
'PyTorch-2.1.0_disable-gcc12-warning.patch',
'PyTorch-2.1.0_disable-cudnn-tf32-for-too-strict-tests.patch',
'PyTorch-2.1.0_fix-bufferoverflow-in-oneDNN.patch',
'PyTorch-2.1.0_fix-test_numpy_torch_operators.patch',
'PyTorch-2.1.0_fix-validationError-output-test.patch',
@@ -43,15 +44,19 @@ patches = [
'PyTorch-2.1.0_skip-test_jvp_linalg_det_singular.patch',
'PyTorch-2.1.0_skip-test_linear_fp32-without-MKL.patch',
'PyTorch-2.1.0_skip-test_wrap_bad.patch',
'PyTorch-2.1.2_add-cuda-skip-markers.patch',
'PyTorch-2.1.2_fix-conj-mismatch-test-failures.patch',
'PyTorch-2.1.2_fix-device-mesh-check.patch',
'PyTorch-2.1.2_fix-test_extension_backend-without-vectorization.patch',
'PyTorch-2.1.2_fix-test_memory_profiler.patch',
'PyTorch-2.1.2_fix-test_torchinductor-rounding.patch',
'PyTorch-2.1.2_fix-vsx-vector-abs.patch',
'PyTorch-2.1.2_fix-vsx-vector-div.patch',
'PyTorch-2.1.2_relax-cuda-tolerances.patch',
'PyTorch-2.1.2_remove-nccl-backend-default-without-gpus.patch',
'PyTorch-2.1.2_skip-cpu_repro-test-without-vectorization.patch',
'PyTorch-2.1.2_skip-failing-test_dtensor_ops-subtests.patch',
'PyTorch-2.1.2_workaround_dynamo_failure_without_nnpack.patch',
'PyTorch-2.1.0_skip-test-linalg-svd-complex.patch',
'PyTorch-2.1.0_disable-cudnn-tf32-for-too-strict-tests.patch',
]
checksums = [
{'pytorch-v2.1.2.tar.gz': '85effbcce037bffa290aea775c9a4bad5f769cb229583450c40055501ee1acd7'},
@@ -87,6 +92,8 @@ checksums = [
{'PyTorch-2.0.1_skip-tests-skipped-in-subprocess.patch':
'166c134573a95230e39b9ea09ece3ad8072f39d370c9a88fb2a1e24f6aaac2b5'},
{'PyTorch-2.1.0_disable-gcc12-warning.patch': 'c858b8db0010f41005dc06f9a50768d0d3dc2d2d499ccbdd5faf8a518869a421'},
{'PyTorch-2.1.0_disable-cudnn-tf32-for-too-strict-tests.patch':
'd895018ebdfd46e65d9f7645444a3b4c5bbfe3d533a08db559a04be34e01e478'},
{'PyTorch-2.1.0_fix-bufferoverflow-in-oneDNN.patch':
'b15b1291a3c37bf6a4982cfbb3483f693acb46a67bc0912b383fd98baf540ccf'},
{'PyTorch-2.1.0_fix-test_numpy_torch_operators.patch':
@@ -109,6 +116,10 @@ checksums = [
{'PyTorch-2.1.0_skip-test_linear_fp32-without-MKL.patch':
'5dcc79883b6e3ec0a281a8e110db5e0a5880de843bb05653589891f16473ead5'},
{'PyTorch-2.1.0_skip-test_wrap_bad.patch': 'b8583125ee94e553b6f77c4ab4bfa812b89416175dc7e9b7390919f3b485cb63'},
{'PyTorch-2.1.2_add-cuda-skip-markers.patch': 'd007d6d0cdb533e7d01f503e9055218760123a67c1841c57585385144be18c9a'},
{'PyTorch-2.1.2_fix-conj-mismatch-test-failures.patch':
'c164357efa4ce88095376e590ba508fc1daa87161e1e59544eda56daac7f2847'},
{'PyTorch-2.1.2_fix-device-mesh-check.patch': 'c0efc288bf3d9a9a3c8bbd2691348a589a2677ea43880a8c987db91c8de4806b'},
{'PyTorch-2.1.2_fix-test_extension_backend-without-vectorization.patch':
'cd1455495886a7d6b2d30d48736eb0103fded21e2e36de6baac719b9c52a1c92'},
{'PyTorch-2.1.2_fix-test_memory_profiler.patch':
@@ -117,14 +128,15 @@ checksums = [
'a0ef99192ee2ad1509c78a8377023d5be2b5fddb16f84063b7c9a0b53d979090'},
{'PyTorch-2.1.2_fix-vsx-vector-abs.patch': 'd67d32407faed7dc1dbab4bba0e2f7de36c3db04560ced35c94caf8d84ade886'},
{'PyTorch-2.1.2_fix-vsx-vector-div.patch': '11f497a6892eb49b249a15320e4218e0d7ac8ae4ce67de39e4a018a064ca1acc'},
{'PyTorch-2.1.2_relax-cuda-tolerances.patch': '554ad09787f61080fafdb84216e711e32327aa357e2a9c40bb428eb6503dee6e'},
{'PyTorch-2.1.2_remove-nccl-backend-default-without-gpus.patch':
'e6a1efe3d127fcbf4723476a7a1c01cfcf2ccb16d1fb250f478192623e8b6a15'},
{'PyTorch-2.1.2_skip-cpu_repro-test-without-vectorization.patch':
'7ace835af60c58d9e0754a34c19d4b9a0c3a531f19e5d0eba8e2e49206eaa7eb'},
{'PyTorch-2.1.2_skip-failing-test_dtensor_ops-subtests.patch':
'6cf711bf26518550903b09ed4431de9319791e79d61aab065785d6608fd5cc88'},
{'PyTorch-2.1.2_workaround_dynamo_failure_without_nnpack.patch':
'fb96eefabf394617bbb3fbd3a7a7c1aa5991b3836edc2e5d2a30e708bfe49ba1'},
{'PyTorch-2.1.0_skip-test-linalg-svd-complex.patch':
'5ba7e0b4203ea8c27b55b5231de024004697aca7bbae30aa248524babb451dc7'},
{'PyTorch-2.1.0_disable-cudnn-tf32-for-too-strict-tests.patch':
'7abccc94f0ae6c317d5d08d4db4e3724eedde8d1d00707e78cf57d8cbf858be5'},
]

osdependencies = [OS_PKG_IBVERBS_DEV]
@@ -140,6 +152,9 @@ builddependencies = [

dependencies = [
('CUDA', '12.1.1', '', SYSTEM),
('cuDNN', '8.9.2.26', '-CUDA-%(cudaver)s', SYSTEM),
('magma', '2.7.2', '-CUDA-%(cudaver)s'),
('NCCL', '2.18.3', '-CUDA-%(cudaver)s'),
('Ninja', '1.11.1'), # Required for JIT compilation of C++ extensions
('Python', '3.11.3'),
('Python-bundle-PyPI', '2023.06'),
@@ -157,11 +172,11 @@ dependencies = [
('networkx', '3.1'),
('sympy', '1.12'),
('Z3', '4.12.2', '-Python-%(pyver)s'),
('cuDNN', '8.9.2.26', versionsuffix, SYSTEM),
('magma', '2.7.2', versionsuffix),
('NCCL', '2.18.3', versionsuffix),
]

use_pip = True
buildcmd = '%(python)s setup.py build' # Run the (long) build in the build step

excluded_tests = {
'': [
# This test seems to take too long on NVIDIA Ampere at least.
@@ -192,7 +207,14 @@ runtest = 'cd test && PYTHONUNBUFFERED=1 %(python)s run_test.py --continue-throu
# Especially test_quantization has a few corner cases that are triggered by the random input values,
# those cannot be easily avoided, see https://github.com/pytorch/pytorch/issues/107030
# So allow a low number of tests to fail as the tests "usually" succeed
max_failed_tests = 30
max_failed_tests = 4

# The readelf sanity check command can be taken out once the TestRPATH test from
# https://github.com/pytorch/pytorch/pull/109493 is accepted, since it is then checked as part of the PyTorch test suite
local_libcaffe2 = "$EBROOTPYTORCH/lib/python%%(pyshortver)s/site-packages/torch/lib/libcaffe2_nvrtc.%s" % SHLIB_EXT
sanity_check_commands = [
"readelf -d %s | egrep 'RPATH|RUNPATH' | grep -v stubs" % local_libcaffe2,
]

tests = ['PyTorch-check-cpp-extension.py']

PyTorch-2.1.2_add-cuda-skip-markers.patch (new file)
@@ -0,0 +1,59 @@
distributed/test_inductor_collectives & distributed/test_dynamo_distributed fail when run without GPUs
with "RuntimeError: ProcessGroupNCCL is only supported with GPUs, no GPUs found!"
Skip those in that case.
See https://github.com/pytorch/pytorch/pull/117741

Author: Alexander Grund (TU Dresden)

diff --git a/test/distributed/test_dynamo_distributed.py b/test/distributed/test_dynamo_distributed.py
index c8fae62bd62..6220b62a9a9 100644
--- a/test/distributed/test_dynamo_distributed.py
+++ b/test/distributed/test_dynamo_distributed.py
@@ -30,6 +30,7 @@ from torch.testing._internal.common_distributed import (
requires_nccl,
_dynamo_dist_per_rank_init,
)
+from torch.testing._internal.common_utils import requires_cuda
import torch._dynamo.logging
from torch._dynamo.comptime import comptime

@@ -452,6 +453,7 @@ class TestMultiProc(DynamoDistributedMultiProcTestCase):


@requires_nccl()
+@requires_cuda
class TestSingleProc(DynamoDistributedSingleProcTestCase):
"""
Test harness initializes dist process group.
diff --git a/test/distributed/test_inductor_collectives.py b/test/distributed/test_inductor_collectives.py
index 9183e2e9ce4..37149865fd9 100644
--- a/test/distributed/test_inductor_collectives.py
+++ b/test/distributed/test_inductor_collectives.py
@@ -19,6 +19,7 @@ from torch.testing._internal.common_distributed import (
requires_nccl,
skip_if_lt_x_gpu,
)
+from torch.testing._internal.common_utils import requires_cuda
from torch._inductor.compile_fx import compile_fx as inductor_compile_fx
from torch._inductor.utils import has_triton, run_and_get_triton_code
import torch._dynamo.logging
@@ -216,6 +217,7 @@ class TestCollectivesMultiProc(DynamoDistributedMultiProcTestCase):


@requires_nccl()
+@requires_cuda
class TestCollectivesInductor(DynamoDistributedSingleProcTestCase):
"""
Prefer single-proc test runner for basic tests as it is easier to work with.
diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py
index 1e18ca2afec..bad5af5212f 100644
--- a/torch/testing/_internal/common_utils.py
+++ b/torch/testing/_internal/common_utils.py
@@ -1111,6 +1111,7 @@ if TEST_CUDA and 'NUM_PARALLEL_PROCS' in os.environ:
# other libraries take up about 11% of space per process
torch.cuda.set_per_process_memory_fraction(round(1 / num_procs - .11, 2))

+requires_cuda = unittest.skipUnless(torch.cuda.is_available(), "Requires CUDA")

def skipIfCrossRef(fn):
@wraps(fn)
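The decorator added above is plain unittest machinery; a minimal, self-contained sketch (not part of the PyTorch test suite) of how a class-level skipUnless keeps GPU-only test classes from running, and hence from constructing a ProcessGroupNCCL, on CPU-only nodes:

# Minimal sketch: skipUnless applied at class level skips every test method
# when no CUDA device is visible.
import unittest
import torch

requires_cuda = unittest.skipUnless(torch.cuda.is_available(), "Requires CUDA")

@requires_cuda
class GpuOnlyTests(unittest.TestCase):
    def test_uses_process_group(self):
        # Only runs when a CUDA device is present.
        self.assertTrue(torch.cuda.is_available())

if __name__ == "__main__":
    unittest.main()  # on a CPU-only node the whole class is reported as skipped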
PyTorch-2.1.2_fix-conj-mismatch-test-failures.patch (new file)
@@ -0,0 +1,66 @@
In test_ops.py the tests test_python_ref_meta__refs_linalg_svd_cpu_complex* fail
when PyTorch is compiled with CUDA support with
> RuntimeError: Conj mismatch! is_conj is set to False and True

This is a known issue (https://github.com/pytorch/pytorch/issues/105068)
so don't check the flag in the test.

diff --git a/test/test_ops.py b/test/test_ops.py
index 08a8185d346..dc98e7c7439 100644
--- a/test/test_ops.py
+++ b/test/test_ops.py
@@ -302,6 +302,10 @@ class TestCommon(TestCase):
@ops(python_ref_db)
@skipIfTorchInductor("Takes too long for inductor")
def test_python_ref_meta(self, device, dtype, op):
+ CHECK_CONJ_SKIPS = {
+ torch._refs.linalg.svd,
+ }
+
with FakeTensorMode() as mode:
pass

@@ -328,12 +332,12 @@ class TestCommon(TestCase):

if isinstance(result, torch.Tensor):
self.assertTrue(isinstance(meta_result, FakeTensor))
- prims.utils.compare_tensor_meta(result, meta_result)
+ prims.utils.compare_tensor_meta(result, meta_result, check_conj=op.op not in CHECK_CONJ_SKIPS)
elif isinstance(result, Sequence):
for a, b in zip(result, meta_result):
if isinstance(a, torch.Tensor) or isinstance(b, torch.Tensor):
self.assertTrue(isinstance(b, FakeTensor))
- prims.utils.compare_tensor_meta(a, b)
+ prims.utils.compare_tensor_meta(a, b, check_conj=op.op not in CHECK_CONJ_SKIPS)

def _ref_test_helper(
self,
diff --git a/torch/_prims_common/__init__.py b/torch/_prims_common/__init__.py
index d60931162da..da217470930 100644
--- a/torch/_prims_common/__init__.py
+++ b/torch/_prims_common/__init__.py
@@ -90,7 +90,7 @@ def same_shape(a: ShapeType, b: ShapeType) -> bool:

# TODO: look at using torch.testing.assert_close instead with an option
# to just compare metadata
-def compare_tensor_meta(a: TensorLikeType, b: TensorLikeType, check_strides=False):
+def compare_tensor_meta(a: TensorLikeType, b: TensorLikeType, check_strides=False, check_conj=True):
"""
Checks that two tensor likes have the same shape,
dtype and device.
@@ -131,10 +131,11 @@ def compare_tensor_meta(a: TensorLikeType, b: TensorLikeType, check_strides=Fals
msg = f"Storage offset mismatch! Storage offsets are {a.storage_offset()} and {b.storage_offset()}!"
raise RuntimeError(msg)

- if a.is_conj() != b.is_conj():
- raise RuntimeError(
- f"Conj mismatch! is_conj is set to {a.is_conj()} and {b.is_conj()}"
- )
+ if check_conj:
+ if a.is_conj() != b.is_conj():
+ raise RuntimeError(
+ f"Conj mismatch! is_conj is set to {a.is_conj()} and {b.is_conj()}"
+ )

if a.is_neg() != b.is_neg():
raise RuntimeError(
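For context on the flag this patch stops comparing: torch.conj on a complex tensor returns a lazy conjugate view whose is_conj() bit is set, and per the upstream issue the reference/meta path for linalg.svd does not always propagate that bit. A small illustration of the bit itself (not taken from the test suite):

# The is_conj() bit marks a lazy conjugate view; the values are unaffected.
import torch

t = torch.randn(2, 2, dtype=torch.complex64)
v = torch.conj(t)                  # lazy conjugate view, no data copied
print(t.is_conj(), v.is_conj())    # False True
# Materialising the conjugation clears the bit but yields the same values:
print(torch.equal(v.resolve_conj(), t.conj_physical()))  # True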
PyTorch-2.1.2_fix-device-mesh-check.patch (new file)
@@ -0,0 +1,22 @@
Fix error when there are more GPUs than ranks:
> RuntimeError: DeviceMesh only support homogeneous hardware, but found 4 ranks and 8 cuda devices!

See https://github.com/pytorch/pytorch/pull/111091

diff --git a/torch/distributed/_tensor/device_mesh.py b/torch/distributed/_tensor/device_mesh.py
index b5e30eeca82..21ba82503a8 100644
--- a/torch/distributed/_tensor/device_mesh.py
+++ b/torch/distributed/_tensor/device_mesh.py
@@ -165,7 +165,10 @@ class DeviceMesh:
# automatically set the current cuda/cuda-like device base on num of gpu devices available in each host
# NOTE: This device selection would only work for homogeneous hardware.
num_devices_per_host = device_handle.device_count()
- if world_size % num_devices_per_host != 0:
+ if (
+ world_size > num_devices_per_host
+ and world_size % num_devices_per_host != 0
+ ):
raise RuntimeError(
f"DeviceMesh only support homogeneous hardware, but found "
f"{world_size} ranks and {num_devices_per_host} {self.device_type} devices!"

PyTorch-2.1.2_relax-cuda-tolerances.patch (new file)
@@ -0,0 +1,22 @@
test_jit fails in test_freeze_conv_relu_fusion with
Mismatched elements: 7 / 30 (23.3%)
Greatest absolute difference: 3.053247928619385e-05 at index (1, 1, 0, 0, 0) (up to 1e-05 allowed)
Greatest relative difference: 0.0004548609140329063 at index (3, 1, 0, 0, 0) (up to 1.3e-06 allowed)

Increase the tolerance to allow this to pass.

Author: Alexander Grund (TU Dresden)

diff --git a/test/jit/test_freezing.py b/test/jit/test_freezing.py
index c8c1441adbf..e0feffd6bb5 100644
--- a/test/jit/test_freezing.py
+++ b/test/jit/test_freezing.py
@@ -2733,7 +2733,7 @@ class TestFrozenOptimizations(JitTestCase):
else:
FileCheck().check("aten::cudnn_convolution_relu").run(frozen_mod.graph)

- self.assertEqual(mod_eager(inp), frozen_mod(inp))
+ self.assertEqual(mod_eager(inp), frozen_mod(inp), atol=1e-4, rtol=4e-3)

@unittest.skipIf(not (TEST_CUDNN or TEST_WITH_ROCM), "requires CUDNN")
def test_freeze_conv_relu_fusion_not_forward(self):
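For reference, assertEqual here essentially follows the allclose-style criterion |actual - expected| <= atol + rtol * |expected|. A quick check with the reported worst-case absolute difference (the reference magnitude below is an assumption, used only for illustration):

# The reported worst absolute difference (~3.05e-05) fails the float32 defaults
# (atol=1e-5, rtol=1.3e-6) but passes the patched tolerances (atol=1e-4, rtol=4e-3).
abs_diff, ref_magnitude = 3.053e-05, 1.0   # ref_magnitude is illustrative only

defaults_pass = abs_diff <= 1e-5 + 1.3e-6 * ref_magnitude   # False -> original failure
patched_pass  = abs_diff <= 1e-4 + 4e-3  * ref_magnitude    # True  -> passes with the patch
print(defaults_pass, patched_pass)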
PyTorch-2.1.2_remove-nccl-backend-default-without-gpus.patch (new file)
@@ -0,0 +1,41 @@
In some code paths a ProcessGroupNCCL is created when PyTorch was compiled with NCCL.
However, without any GPUs present at runtime, that creation fails with
> RuntimeError: ProcessGroupNCCL is only supported with GPUs, no GPUs found!

Remove NCCL as an available default backend if CUDA isn't available.
See https://github.com/pytorch/pytorch/issues/117746

Author: Alexander Grund (TU Dresden)

diff --git a/torch/distributed/distributed_c10d.py b/torch/distributed/distributed_c10d.py
index 098e209264c..80962466bff 100644
--- a/torch/distributed/distributed_c10d.py
+++ b/torch/distributed/distributed_c10d.py
@@ -271,9 +271,11 @@ class BackendConfig:
if backend == Backend.UNDEFINED:
# default config when backend is not specified
# supported since PyTorch 2.0
- for device in Backend.default_device_backend_map:
- if is_backend_available(Backend.default_device_backend_map[device]):
- self.device_backend_map[device] = Backend.default_device_backend_map[device]
+ for device, default_backend in Backend.default_device_backend_map.items():
+ if is_backend_available(default_backend):
+ if default_backend == Backend.NCCL and not torch.cuda.is_available():
+ continue
+ self.device_backend_map[device] = default_backend
elif backend.lower() in Backend.backend_list:
# Cases for when backend is a single string (without device types)
# e.g. "nccl", "gloo", "ucc", "mpi"
diff --git a/test/distributed/test_c10d_common.py b/test/distributed/test_c10d_common.py
index a717c875e76..b382ba760f4 100644
--- a/test/distributed/test_c10d_common.py
+++ b/test/distributed/test_c10d_common.py
@@ -1775,7 +1775,7 @@ class ProcessGroupWithDispatchedCollectivesTests(MultiProcessTestCase):
if not dist.is_mpi_available():
continue
elif backend == dist.Backend.NCCL:
- if not dist.is_nccl_available():
+ if not dist.is_nccl_available() or not torch.cuda.is_available():
continue
elif backend == dist.Backend.GLOO:
if not dist.is_gloo_available():
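A hedged, single-process sketch of what the changed code path allows on a CPU-only node after this patch (assumes a build with the gloo backend; the MASTER_ADDR/MASTER_PORT values are arbitrary):

# With backend=None, BackendConfig fills in per-device defaults (cpu -> gloo, cuda -> nccl).
# After the patch the cuda -> nccl entry is dropped when torch.cuda.is_available() is False,
# so initialisation succeeds and CPU collectives go through gloo.
import os
import torch
import torch.distributed as dist

os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
os.environ.setdefault("MASTER_PORT", "29500")

dist.init_process_group(rank=0, world_size=1)  # no explicit backend
t = torch.ones(1)
dist.all_reduce(t)       # served by gloo on a machine without GPUs
print(t)                 # tensor([1.])
dist.destroy_process_group()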
PyTorch-2.1.2_skip-failing-test_dtensor_ops-subtests.patch (new file)
@@ -0,0 +1,36 @@
test_dtensor_op_db_nn_functional_pad_circular_cpu_float32 may unexpectedly succeed, so just skip it.
Failure is expected until https://github.com/pytorch/pytorch/commit/9378a2ceda8
test_dtensor_op_db_nn_functional_multi_head_attention_forward_cpu_float32 fails with
> NotImplementedError: Operator aten.constant_pad_nd.default does not have a sharding strategy registered.
Marked xfail in https://github.com/pytorch/pytorch/commit/49d826bcd3de952eb84a33c89ed399a1a2821c15
test_dtensor_op_db_empty_strided_cpu_float32 doesn't make sense to run in the first place,
see https://github.com/pytorch/pytorch/issues/118094

Author: Alexander Grund (TU Dresden)

diff --git a/test/distributed/_tensor/test_dtensor_ops.py b/test/distributed/_tensor/test_dtensor_ops.py
index b7d453e56be..5a27c7f84da 100644
--- a/test/distributed/_tensor/test_dtensor_ops.py
+++ b/test/distributed/_tensor/test_dtensor_ops.py
@@ -147,6 +147,7 @@ dtensor_fails = {
xfail("dot"),
xfail("einsum"),
xfail("empty"),
+ skip("empty_strided"),
xfail("empty_like"),
xfail("empty_permuted"),
xfail("exponential"),
@@ -359,11 +360,12 @@ dtensor_fails = {
xfail("nn.functional.mish"),
xfail("nn.functional.mse_loss"),
xfail("nn.functional.multi_margin_loss"),
+ skip("nn.functional.multi_head_attention_forward"),
xfail("nn.functional.multilabel_margin_loss"),
xfail("nn.functional.multilabel_soft_margin_loss"),
xfail("nn.functional.nll_loss"),
xfail("nn.functional.normalize"),
- xfail("nn.functional.pad", "circular"),
+ skip("nn.functional.pad", "circular"),
xfail("nn.functional.pad", "constant"),
xfail("nn.functional.pad", "reflect"),
xfail("nn.functional.pad", "replicate"),
