forked from easybuilders/easybuild-easyconfigs
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #4 from Flamefire/pr-to-jfgrim-pytorch-cuda-2.1.2
Enhance PyTorch-2.1.2-foss-2023a-CUDA-12.1.1.eb
- Loading branch information
Showing
8 changed files
with
278 additions
and
25 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
59 changes: 59 additions & 0 deletions
59
easybuild/easyconfigs/p/PyTorch/PyTorch-2.1.2_add-cuda-skip-markers.patch
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,59 @@ | ||
distributed/test_inductor_collectives & distributed/test_dynamo_distributed fail when run without GPUs | ||
with "RuntimeError: ProcessGroupNCCL is only supported with GPUs, no GPUs found!" | ||
Skip those in that case. | ||
See https://github.com/pytorch/pytorch/pull/117741 | ||
|
||
Author: Alexander Grund (TU Dresden) | ||
|
||
diff --git a/test/distributed/test_dynamo_distributed.py b/test/distributed/test_dynamo_distributed.py | ||
index c8fae62bd62..6220b62a9a9 100644 | ||
--- a/test/distributed/test_dynamo_distributed.py | ||
+++ b/test/distributed/test_dynamo_distributed.py | ||
@@ -30,6 +30,7 @@ from torch.testing._internal.common_distributed import ( | ||
requires_nccl, | ||
_dynamo_dist_per_rank_init, | ||
) | ||
+from torch.testing._internal.common_utils import requires_cuda | ||
import torch._dynamo.logging | ||
from torch._dynamo.comptime import comptime | ||
|
||
@@ -452,6 +453,7 @@ class TestMultiProc(DynamoDistributedMultiProcTestCase): | ||
|
||
|
||
@requires_nccl() | ||
+@requires_cuda | ||
class TestSingleProc(DynamoDistributedSingleProcTestCase): | ||
""" | ||
Test harness initializes dist process group. | ||
diff --git a/test/distributed/test_inductor_collectives.py b/test/distributed/test_inductor_collectives.py | ||
index 9183e2e9ce4..37149865fd9 100644 | ||
--- a/test/distributed/test_inductor_collectives.py | ||
+++ b/test/distributed/test_inductor_collectives.py | ||
@@ -19,6 +19,7 @@ from torch.testing._internal.common_distributed import ( | ||
requires_nccl, | ||
skip_if_lt_x_gpu, | ||
) | ||
+from torch.testing._internal.common_utils import requires_cuda | ||
from torch._inductor.compile_fx import compile_fx as inductor_compile_fx | ||
from torch._inductor.utils import has_triton, run_and_get_triton_code | ||
import torch._dynamo.logging | ||
@@ -216,6 +217,7 @@ class TestCollectivesMultiProc(DynamoDistributedMultiProcTestCase): | ||
|
||
|
||
@requires_nccl() | ||
+@requires_cuda | ||
class TestCollectivesInductor(DynamoDistributedSingleProcTestCase): | ||
""" | ||
Prefer single-proc test runner for basic tests as it is easier to work with. | ||
diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py | ||
index 1e18ca2afec..bad5af5212f 100644 | ||
--- a/torch/testing/_internal/common_utils.py | ||
+++ b/torch/testing/_internal/common_utils.py | ||
@@ -1111,6 +1111,7 @@ if TEST_CUDA and 'NUM_PARALLEL_PROCS' in os.environ: | ||
# other libraries take up about 11% of space per process | ||
torch.cuda.set_per_process_memory_fraction(round(1 / num_procs - .11, 2)) | ||
|
||
+requires_cuda = unittest.skipUnless(torch.cuda.is_available(), "Requires CUDA") | ||
|
||
def skipIfCrossRef(fn): | ||
@wraps(fn) |
66 changes: 66 additions & 0 deletions
66
easybuild/easyconfigs/p/PyTorch/PyTorch-2.1.2_fix-conj-mismatch-test-failures.patch
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,66 @@ | ||
In test_ops.py the tests test_python_ref_meta__refs_linalg_svd_cpu_complex* fail | ||
when PyTorch is compiled with CUDA support, raising: | ||
> RuntimeError: Conj mismatch! is_conj is set to False and True | ||
|
||
This is a known issue (https://github.com/pytorch/pytorch/issues/105068) | ||
so don't check the flag in the test. | ||
|
||
diff --git a/test/test_ops.py b/test/test_ops.py | ||
index 08a8185d346..dc98e7c7439 100644 | ||
--- a/test/test_ops.py | ||
+++ b/test/test_ops.py | ||
@@ -302,6 +302,10 @@ class TestCommon(TestCase): | ||
@ops(python_ref_db) | ||
@skipIfTorchInductor("Takes too long for inductor") | ||
def test_python_ref_meta(self, device, dtype, op): | ||
+ CHECK_CONJ_SKIPS = { | ||
+ torch._refs.linalg.svd, | ||
+ } | ||
+ | ||
with FakeTensorMode() as mode: | ||
pass | ||
|
||
@@ -328,12 +332,12 @@ class TestCommon(TestCase): | ||
|
||
if isinstance(result, torch.Tensor): | ||
self.assertTrue(isinstance(meta_result, FakeTensor)) | ||
- prims.utils.compare_tensor_meta(result, meta_result) | ||
+ prims.utils.compare_tensor_meta(result, meta_result, check_conj=op.op not in CHECK_CONJ_SKIPS) | ||
elif isinstance(result, Sequence): | ||
for a, b in zip(result, meta_result): | ||
if isinstance(a, torch.Tensor) or isinstance(b, torch.Tensor): | ||
self.assertTrue(isinstance(b, FakeTensor)) | ||
- prims.utils.compare_tensor_meta(a, b) | ||
+ prims.utils.compare_tensor_meta(a, b, check_conj=op.op not in CHECK_CONJ_SKIPS) | ||
|
||
def _ref_test_helper( | ||
self, | ||
diff --git a/torch/_prims_common/__init__.py b/torch/_prims_common/__init__.py | ||
index d60931162da..da217470930 100644 | ||
--- a/torch/_prims_common/__init__.py | ||
+++ b/torch/_prims_common/__init__.py | ||
@@ -90,7 +90,7 @@ def same_shape(a: ShapeType, b: ShapeType) -> bool: | ||
|
||
# TODO: look at using torch.testing.assert_close instead with an option | ||
# to just compare metadata | ||
-def compare_tensor_meta(a: TensorLikeType, b: TensorLikeType, check_strides=False): | ||
+def compare_tensor_meta(a: TensorLikeType, b: TensorLikeType, check_strides=False, check_conj=True): | ||
""" | ||
Checks that two tensor likes have the same shape, | ||
dtype and device. | ||
@@ -131,10 +131,11 @@ def compare_tensor_meta(a: TensorLikeType, b: TensorLikeType, check_strides=Fals | ||
msg = f"Storage offset mismatch! Storage offsets are {a.storage_offset()} and {b.storage_offset()}!" | ||
raise RuntimeError(msg) | ||
|
||
- if a.is_conj() != b.is_conj(): | ||
- raise RuntimeError( | ||
- f"Conj mismatch! is_conj is set to {a.is_conj()} and {b.is_conj()}" | ||
- ) | ||
+ if check_conj: | ||
+ if a.is_conj() != b.is_conj(): | ||
+ raise RuntimeError( | ||
+ f"Conj mismatch! is_conj is set to {a.is_conj()} and {b.is_conj()}" | ||
+ ) | ||
|
||
if a.is_neg() != b.is_neg(): | ||
raise RuntimeError( |
22 changes: 22 additions & 0 deletions
22
easybuild/easyconfigs/p/PyTorch/PyTorch-2.1.2_fix-device-mesh-check.patch
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
Fix error when there are more GPUs than ranks: | ||
> RuntimeError: DeviceMesh only support homogeneous hardware, but found 4 ranks and 8 cuda devices! | ||
|
||
See https://github.com/pytorch/pytorch/pull/111091 | ||
|
||
diff --git a/torch/distributed/_tensor/device_mesh.py b/torch/distributed/_tensor/device_mesh.py | ||
index b5e30eeca82..21ba82503a8 100644 | ||
--- a/torch/distributed/_tensor/device_mesh.py | ||
+++ b/torch/distributed/_tensor/device_mesh.py | ||
@@ -165,7 +165,10 @@ class DeviceMesh: | ||
# automatically set the current cuda/cuda-like device base on num of gpu devices available in each host | ||
# NOTE: This device selection would only work for homogeneous hardware. | ||
num_devices_per_host = device_handle.device_count() | ||
- if world_size % num_devices_per_host != 0: | ||
+ if ( | ||
+ world_size > num_devices_per_host | ||
+ and world_size % num_devices_per_host != 0 | ||
+ ): | ||
raise RuntimeError( | ||
f"DeviceMesh only support homogeneous hardware, but found " | ||
f"{world_size} ranks and {num_devices_per_host} {self.device_type} devices!" | ||
|
22 changes: 22 additions & 0 deletions
22
easybuild/easyconfigs/p/PyTorch/PyTorch-2.1.2_relax-cuda-tolerances.patch
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
test_jit fails in test_freeze_conv_relu_fusion with | ||
Mismatched elements: 7 / 30 (23.3%) | ||
Greatest absolute difference: 3.053247928619385e-05 at index (1, 1, 0, 0, 0) (up to 1e-05 allowed) | ||
Greatest relative difference: 0.0004548609140329063 at index (3, 1, 0, 0, 0) (up to 1.3e-06 allowed) | ||
|
||
Increase the tolerance to allow this to pass. | ||
|
||
Author: Alexander Grund (TU Dresden) | ||
|
||
diff --git a/test/jit/test_freezing.py b/test/jit/test_freezing.py | ||
index c8c1441adbf..e0feffd6bb5 100644 | ||
--- a/test/jit/test_freezing.py | ||
+++ b/test/jit/test_freezing.py | ||
@@ -2733,7 +2733,7 @@ class TestFrozenOptimizations(JitTestCase): | ||
else: | ||
FileCheck().check("aten::cudnn_convolution_relu").run(frozen_mod.graph) | ||
|
||
- self.assertEqual(mod_eager(inp), frozen_mod(inp)) | ||
+ self.assertEqual(mod_eager(inp), frozen_mod(inp), atol=1e-4, rtol=4e-3) | ||
|
||
@unittest.skipIf(not (TEST_CUDNN or TEST_WITH_ROCM), "requires CUDNN") | ||
def test_freeze_conv_relu_fusion_not_forward(self): |
41 changes: 41 additions & 0 deletions
41
easybuild/easyconfigs/p/PyTorch/PyTorch-2.1.2_remove-nccl-backend-default-without-gpus.patch
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
In some code paths the ProcessGroupNCCL is created when PyTorch was compiled with NCCL. | ||
However, without any GPUs present at runtime the creation will fail with | ||
> RuntimeError: ProcessGroupNCCL is only supported with GPUs, no GPUs found! | ||
|
||
Remove NCCL as an available default backend if CUDA isn't available. | ||
See https://github.com/pytorch/pytorch/issues/117746 | ||
|
||
Author: Alexander Grund (TU Dresden) | ||
|
||
diff --git a/torch/distributed/distributed_c10d.py b/torch/distributed/distributed_c10d.py | ||
index 098e209264c..80962466bff 100644 | ||
--- a/torch/distributed/distributed_c10d.py | ||
+++ b/torch/distributed/distributed_c10d.py | ||
@@ -271,9 +271,11 @@ class BackendConfig: | ||
if backend == Backend.UNDEFINED: | ||
# default config when backend is not specified | ||
# supported since PyTorch 2.0 | ||
- for device in Backend.default_device_backend_map: | ||
- if is_backend_available(Backend.default_device_backend_map[device]): | ||
- self.device_backend_map[device] = Backend.default_device_backend_map[device] | ||
+ for device, default_backend in Backend.default_device_backend_map.items(): | ||
+ if is_backend_available(default_backend): | ||
+ if default_backend == Backend.NCCL and not torch.cuda.is_available(): | ||
+ continue | ||
+ self.device_backend_map[device] = default_backend | ||
elif backend.lower() in Backend.backend_list: | ||
# Cases for when backend is a single string (without device types) | ||
# e.g. "nccl", "gloo", "ucc", "mpi" | ||
diff --git a/test/distributed/test_c10d_common.py b/test/distributed/test_c10d_common.py | ||
index a717c875e76..b382ba760f4 100644 | ||
--- a/test/distributed/test_c10d_common.py | ||
+++ b/test/distributed/test_c10d_common.py | ||
@@ -1775,7 +1775,7 @@ class ProcessGroupWithDispatchedCollectivesTests(MultiProcessTestCase): | ||
if not dist.is_mpi_available(): | ||
continue | ||
elif backend == dist.Backend.NCCL: | ||
- if not dist.is_nccl_available(): | ||
+ if not dist.is_nccl_available() or not torch.cuda.is_available(): | ||
continue | ||
elif backend == dist.Backend.GLOO: | ||
if not dist.is_gloo_available(): |
36 changes: 36 additions & 0 deletions
36
easybuild/easyconfigs/p/PyTorch/PyTorch-2.1.2_skip-failing-test_dtensor_ops-subtests.patch
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
test_dtensor_op_db_nn_functional_pad_circular_cpu_float32 may unexpectedly succeed, so just skip it. | ||
Failure is expected until https://github.com/pytorch/pytorch/commit/9378a2ceda8 | ||
test_dtensor_op_db_nn_functional_multi_head_attention_forward_cpu_float32 fails with | ||
> NotImplementedError: Operator aten.constant_pad_nd.default does not have a sharding strategy registered. | ||
Marked xfail in https://github.com/pytorch/pytorch/commit/49d826bcd3de952eb84a33c89ed399a1a2821c15 | ||
test_dtensor_op_db_empty_strided_cpu_float32 doesn't make sense to run in the first place, | ||
see https://github.com/pytorch/pytorch/issues/118094 | ||
|
||
Author: Alexander Grund (TU Dresden) | ||
|
||
diff --git a/test/distributed/_tensor/test_dtensor_ops.py b/test/distributed/_tensor/test_dtensor_ops.py | ||
index b7d453e56be..5a27c7f84da 100644 | ||
--- a/test/distributed/_tensor/test_dtensor_ops.py | ||
+++ b/test/distributed/_tensor/test_dtensor_ops.py | ||
@@ -147,6 +147,7 @@ dtensor_fails = { | ||
xfail("dot"), | ||
xfail("einsum"), | ||
xfail("empty"), | ||
+ skip("empty_strided"), | ||
xfail("empty_like"), | ||
xfail("empty_permuted"), | ||
xfail("exponential"), | ||
@@ -359,11 +360,12 @@ dtensor_fails = { | ||
xfail("nn.functional.mish"), | ||
xfail("nn.functional.mse_loss"), | ||
xfail("nn.functional.multi_margin_loss"), | ||
+ skip("nn.functional.multi_head_attention_forward"), | ||
xfail("nn.functional.multilabel_margin_loss"), | ||
xfail("nn.functional.multilabel_soft_margin_loss"), | ||
xfail("nn.functional.nll_loss"), | ||
xfail("nn.functional.normalize"), | ||
- xfail("nn.functional.pad", "circular"), | ||
+ skip("nn.functional.pad", "circular"), | ||
xfail("nn.functional.pad", "constant"), | ||
xfail("nn.functional.pad", "reflect"), | ||
xfail("nn.functional.pad", "replicate"), |