Add compile fixture to test integration ops with compile mode (#387)
parthchadha authored Nov 21, 2024
1 parent 4f8fd90 commit 259ebf3
Showing 33 changed files with 317 additions and 195 deletions.
61 changes: 61 additions & 0 deletions tripy/tests/integration/conftest.py
@@ -0,0 +1,61 @@
#
# SPDX-FileCopyrightText: Copyright (c) 2024-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import pytest

import tripy as tp


@pytest.fixture(params=["compile", "eager"])
def eager_or_compiled(request):
    def wrapper(func, *args, **kwargs):
        def get_input_info(x: tp.Tensor):
            return tp.InputInfo(list(map(int, x.shape)), dtype=x.dtype)

        if request.param == "eager":
            return func(*args, **kwargs)

        assert request.param == "compile"

        compile_args = []
        for arg in args:
            # We don't want to feed DimensionSize as a dynamic input to the compiler (https://github.com/NVIDIA/TensorRT-Incubator/issues/65).
            if isinstance(arg, tp.Tensor) and not isinstance(arg, tp.DimensionSize):
                compile_args.append(get_input_info(arg))
            else:
                compile_args.append(arg)
        compile_args = tuple(compile_args)

        compile_kwargs = dict(
            (
                k,
                (get_input_info(v) if isinstance(v, tp.Tensor) and not isinstance(v, tp.DimensionSize) else v),
            )
            for k, v in kwargs.items()
        )

        compiled_func = tp.compile(func, args=compile_args, kwargs=compile_kwargs)

        tensor_args = tuple(x for x in args if isinstance(x, tp.Tensor) and not isinstance(x, tp.DimensionSize))

        tensor_kwargs = {
            k: v for k, v in kwargs.items() if isinstance(v, tp.Tensor) and not isinstance(v, tp.DimensionSize)
        }

        return compiled_func(*tensor_args, **tensor_kwargs)

    return wrapper
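
This fixture runs every integration test twice: the "eager" parametrization calls the op directly, while the "compile" parametrization compiles it with tp.compile, turning each tp.Tensor argument (except tp.DimensionSize) into a tp.InputInfo and then invoking the compiled executable with only the Tensor arguments. A minimal usage sketch (this test is illustrative, not part of this commit; it mirrors the updated tests below):

def test_cumsum_example(eager_or_compiled):
    inp = tp.Tensor([[1.0, 2.0], [3.0, 4.0]], dtype=tp.float32)
    # "eager" run: tp.cumsum(inp, dim=0).
    # "compile" run: tp.compile(tp.cumsum,
    #     args=(tp.InputInfo([2, 2], dtype=tp.float32),), kwargs={"dim": 0}),
    # then the compiled executable is called with just `inp`.
    out = eager_or_compiled(tp.cumsum, inp, dim=0)
    assert tp.allclose(out, tp.Tensor([[1.0, 2.0], [4.0, 6.0]], dtype=tp.float32))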
4 changes: 2 additions & 2 deletions tripy/tests/integration/test_batchnorm.py
@@ -26,7 +26,7 @@ class TestBatchNorm:

@pytest.mark.parametrize("torch_dtype, tp_dtype", DTYPES)
@pytest.mark.parametrize("input_shape", [(2, 2, 2, 2)])
-    def test_batchnorm_accuracy(self, torch_dtype, tp_dtype, input_shape):
+    def test_batchnorm_accuracy(self, torch_dtype, tp_dtype, input_shape, eager_or_compiled):
eps = 1e-5
num_features = input_shape[1] # Number of channels in the input tensor
batchnorm = torch.nn.BatchNorm2d(num_features=num_features, eps=eps, dtype=torch_dtype)
@@ -45,7 +45,7 @@ def test_batchnorm_accuracy(self, torch_dtype, tp_dtype, input_shape):
input = torch.randn(input_shape, dtype=torch_dtype).to("cuda")
tp_input = tp.Tensor(input, dtype=tp_dtype)

-        output = tp_batchnorm(tp_input)
+        output = eager_or_compiled(tp_batchnorm, tp_input)

batchnorm.to("cuda").eval()
with torch.no_grad():
27 changes: 13 additions & 14 deletions tripy/tests/integration/test_cast.py
@@ -30,54 +30,53 @@ class TestCast:
[
(np.int32, np.float32),
(np.float32, np.int32),
-            (np.int64, np.float32),
-            (np.float32, np.int64),
-            (np.int64, np.int32),
-            (np.int64, np.int8),
(np.int32, np.int8),
(np.float32, np.int8),
-            (np.int8, np.int64),
(np.int8, np.int32),
(np.int8, np.float32),
# important to test conversion into bool because default StableHLO semantics
# are simply to truncate to i1, which is not desirable
(np.float32, bool),
(np.int32, bool),
-            (np.int64, bool),
# requires a dequantization first
# TODO(#219): Dequantize fails with dynamic shapes
# (np.int8, bool),
],
)
-    def test_cast(self, input_dtype, target_dtype):
+    def test_cast(self, input_dtype, target_dtype, eager_or_compiled):
tp_input_dtype = NUMPY_TO_TRIPY[input_dtype]
tp_target_dtype = NUMPY_TO_TRIPY[target_dtype]

# TODO(#222): Integer casts with negative numbers fail in many cases
input_tensor = tp.Tensor([0, 1, 2], dtype=tp_input_dtype)
np_input = cp.from_dlpack(input_tensor).get()
-        output = tp.cast(input_tensor, tp_target_dtype)
+        output = eager_or_compiled(tp.cast, input_tensor, tp_target_dtype)

assert np.array_equal(cp.from_dlpack(output).get(), np_input.astype(target_dtype))

# these dtypes don't have analogues in numpy
@pytest.mark.parametrize("source_dtype", [pytest.param(tp.float8, marks=skip_if_older_than_sm89), tp.int4])
-    def test_cast_quantized_dtypes_into_bool(self, source_dtype):
+    def test_cast_quantized_dtypes_into_bool(self, source_dtype, eager_or_compiled):
# TODO(#223): Using an odd size leads to a strange crash, so can't just use [-1.0, 0.0, 1.0]
input_tensor = tp.Tensor([-1.0, 0.0, 0.0, 1.0], dtype=tp.float32)
-        q = tp.quantize(input_tensor, scale=1.0, dtype=source_dtype)
-        output = tp.cast(q, tp.bool)
+
+        def func(input):
+            q = tp.quantize(input, scale=1.0, dtype=source_dtype)
+            output = tp.cast(q, tp.bool)
+            return output
+
+        output = eager_or_compiled(func, input_tensor)
assert cp.from_dlpack(output).get().tolist() == [True, False, False, True]

@pytest.mark.parametrize("target_dtype", [np.float32, np.int32, np.int64, np.int8])
def test_cast_from_bool(self, target_dtype):
@pytest.mark.parametrize("target_dtype", [np.float32, np.int32, np.int8])
def test_cast_from_bool(self, target_dtype, eager_or_compiled):
tp_target_dtype = NUMPY_TO_TRIPY[target_dtype]

# in principle, it is not important what *specific* values we convert to,
# so long as false is mapped to 0 and true to nonzero
input_tensor = tp.Tensor([False, True], dtype=tp.bool)
np_input = cp.from_dlpack(input_tensor).get()
-        output = tp.cast(input_tensor, tp_target_dtype)
+        output = eager_or_compiled(tp.cast, input_tensor, tp_target_dtype)

tp_compare_to_zero = cp.from_dlpack(output).get() == 0
np_compare_to_zero = np_input.astype(target_dtype) == 0
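
For an op like tp.cast, the non-Tensor dtype argument passes through the fixture unchanged, so in compile mode only the input tensor becomes a runtime input. A sketch of what eager_or_compiled(tp.cast, input_tensor, tp_target_dtype) effectively does under the "compile" parametrization (assuming the conftest.py logic above; names reused from the test):

compiled_cast = tp.compile(
    tp.cast,
    args=(
        tp.InputInfo(list(map(int, input_tensor.shape)), dtype=input_tensor.dtype),
        tp_target_dtype,  # non-Tensor argument: baked into the compiled executable
    ),
)
output = compiled_cast(input_tensor)  # only Tensor arguments are passed at runtime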
8 changes: 4 additions & 4 deletions tripy/tests/integration/test_concatenate.py
@@ -33,9 +33,9 @@ class TestConcatenate:
([(2, 3, 4)], 0),
],
)
-    def test_concat(self, tensor_shapes, dim):
+    def test_concat(self, tensor_shapes, dim, eager_or_compiled):
tensors = [tp.ones(shape) for shape in tensor_shapes]
-        out = tp.concatenate(tensors, dim=dim)
+        out = eager_or_compiled(tp.concatenate, tensors, dim=dim)
assert np.array_equal(
cp.from_dlpack(out).get(), np.concatenate([np.ones(shape) for shape in tensor_shapes], axis=dim)
)
@@ -44,8 +44,8 @@ def test_concat(self, tensor_shapes, dim):
"tensor_shapes, dim",
[([(2, 3, 4), (2, 4, 4)], 0), ([(4, 5, 6), (4, 1, 6)], -1)],
)
-    def test_negative_concat(self, tensor_shapes, dim):
+    def test_negative_concat(self, tensor_shapes, dim, eager_or_compiled):
tensors = [tp.ones(shape) for shape in tensor_shapes]
with helper.raises(tp.TripyException, match=f"not compatible at non-concat index"):
-            out = tp.concatenate(tensors, dim=dim)
+            out = eager_or_compiled(tp.concatenate, tensors, dim=dim)
print(out)
16 changes: 8 additions & 8 deletions tripy/tests/integration/test_conv.py
@@ -75,7 +75,7 @@ class ConvTestCase:
@pytest.mark.parametrize("torch_dtype,tp_dtype", DTYPES)
class TestConvolution:
@pytest.mark.parametrize("test_case", test_cases_1d)
-    def test_convolution_1d(self, torch_dtype, tp_dtype, test_case):
+    def test_convolution_1d(self, torch_dtype, tp_dtype, test_case, eager_or_compiled):
if not test_case.torch_pad:
test_case.torch_pad = 0
if not test_case.stride:
@@ -122,7 +122,7 @@ def test_convolution_1d(self, torch_dtype, tp_dtype, test_case):
conv_layer.bias = tp.cast(tp.Tensor(conv_layer_torch.bias.data), tp_dtype)

expected = conv_layer_torch(input_torch).to(torch_dtype)
-        output = conv_layer(input)
+        output = eager_or_compiled(conv_layer, input)

# FP32 kernel seems to lose some precision, and FP16 needs to be run in FP32 on torch
rtol_ = 4e-5 if tp_dtype == tp.float32 else 1e-3
Expand All @@ -131,7 +131,7 @@ def test_convolution_1d(self, torch_dtype, tp_dtype, test_case):
assert list(output_torch.shape) == list(expected.shape)

@pytest.mark.parametrize("test_case", test_cases_2d)
-    def test_convolution_2d(self, torch_dtype, tp_dtype, test_case):
+    def test_convolution_2d(self, torch_dtype, tp_dtype, test_case, eager_or_compiled):
if not test_case.torch_pad:
test_case.torch_pad = 0
if not test_case.stride:
@@ -178,15 +178,15 @@ def test_convolution_2d(self, torch_dtype, tp_dtype, test_case):
conv_layer.bias = tp.cast(tp.Tensor(conv_layer_torch.bias.data), tp_dtype)

expected = conv_layer_torch(input_torch).to(torch_dtype)
-        output = conv_layer(input)
+        output = eager_or_compiled(conv_layer, input)

rtol_ = 2e-7 if tp_dtype == tp.float32 else 1.5e-3
output_torch = torch.from_dlpack(output)
assert torch.allclose(output_torch, expected, rtol=rtol_)
assert list(output_torch.shape) == list(expected.shape)

@pytest.mark.parametrize("test_case", test_cases_3d)
-    def test_convolution_3d(self, torch_dtype, tp_dtype, test_case):
+    def test_convolution_3d(self, torch_dtype, tp_dtype, test_case, eager_or_compiled):
pytest.skip("TODO (#260): Fix accuracy bugs in 3D conv")
if not test_case.torch_pad:
test_case.torch_pad = 0
@@ -245,14 +245,14 @@ def test_convolution_3d(self, torch_dtype, tp_dtype, test_case):
return

expected = conv_layer_torch(input_torch).to(torch_dtype)
-        output = conv_layer(input)
+        output = eager_or_compiled(conv_layer, input)

rtol_ = 2e-4 if tp_dtype == tp.float32 else 1.4e-3 # 3d conv has greater accumulation error
output_torch = torch.from_dlpack(output)
assert torch.allclose(output_torch, expected, rtol=rtol_)
assert list(output_torch.shape) == list(expected.shape)

-    def test_uneven_padding(self, torch_dtype, tp_dtype):
+    def test_uneven_padding(self, torch_dtype, tp_dtype, eager_or_compiled):
input_torch = torch.arange(200, dtype=torch.float32, device=torch.device("cuda")).reshape(*(2, 4, 5, 5))
input = tp.cast(tp.Tensor(input_torch), tp_dtype)

@@ -282,7 +282,7 @@ def test_uneven_padding(self, torch_dtype, tp_dtype):

input_torch = torch_pad(input_torch)
expected = conv_layer_torch(input_torch).to(torch_dtype)
-        output = conv_layer(input)
+        output = eager_or_compiled(conv_layer, input)

rtol_ = 2e-7 if tp_dtype == tp.float32 else 2e-3
output_torch = torch.from_dlpack(output)
24 changes: 12 additions & 12 deletions tripy/tests/integration/test_conv_transpose.py
@@ -81,7 +81,7 @@ class ConvTestCase:
@pytest.mark.parametrize("torch_dtype,tp_dtype", DTYPES)
class TestConvolution:
@pytest.mark.parametrize("test_case", test_cases_transpose_1d)
-    def test_transposed_convolution_1d(self, torch_dtype, tp_dtype, test_case):
+    def test_transposed_convolution_1d(self, torch_dtype, tp_dtype, test_case, eager_or_compiled):
if not test_case.torch_pad:
test_case.torch_pad = 0
if not test_case.stride:
@@ -129,14 +129,14 @@ def test_transposed_convolution_1d(self, torch_dtype, tp_dtype, test_case):
conv_layer.bias = tp.cast(tp.Tensor(conv_layer_torch.bias.data), tp_dtype)

expected = conv_layer_torch(input_torch).to(torch_dtype)
-        output = conv_layer(input)
+        output = eager_or_compiled(conv_layer, input)

-        rtol_ = 1e-3
+        rtol_ = 3e-3
assert tp.allclose(output, tp.Tensor(expected), rtol=rtol_)
assert output.shape == list(expected.shape)

@pytest.mark.parametrize("test_case", test_cases_transpose_2d)
-    def test_transposed_convolution_2d(self, torch_dtype, tp_dtype, test_case):
+    def test_transposed_convolution_2d(self, torch_dtype, tp_dtype, test_case, eager_or_compiled):
if not test_case.torch_pad:
test_case.torch_pad = 0
if not test_case.stride:
@@ -184,14 +184,14 @@ def test_transposed_convolution_2d(self, torch_dtype, tp_dtype, test_case):
conv_layer.bias = tp.cast(tp.Tensor(conv_layer_torch.bias.data), tp_dtype)

expected = conv_layer_torch(input_torch).to(torch_dtype)
-        output = conv_layer(input)
+        output = eager_or_compiled(conv_layer, input)

rtol_ = 1e-2
assert tp.allclose(output, tp.Tensor(expected), rtol=rtol_)
assert output.shape == list(expected.shape)

@pytest.mark.parametrize("test_case", test_cases_transpose_3d)
-    def test_transposed_convolution_3d(self, torch_dtype, tp_dtype, test_case):
+    def test_transposed_convolution_3d(self, torch_dtype, tp_dtype, test_case, eager_or_compiled):
if not test_case.torch_pad:
test_case.torch_pad = 0
if not test_case.stride:
@@ -239,12 +239,12 @@ def test_transposed_convolution_3d(self, torch_dtype, tp_dtype, test_case):
conv_layer.bias = tp.cast(tp.Tensor(conv_layer_torch.bias.data), tp_dtype)

expected = conv_layer_torch(input_torch).to(torch_dtype)
-        output = conv_layer(input)
+        output = eager_or_compiled(conv_layer, input)
rtol_ = 1.3e-6 if tp_dtype == tp.float32 else 1.6e-3
assert tp.allclose(output, tp.Tensor(expected), rtol=rtol_)
assert output.shape == list(expected.shape)

-    def test_transposed_equivalency(self, torch_dtype, tp_dtype):
+    def test_transposed_equivalency(self, torch_dtype, tp_dtype, eager_or_compiled):
input_torch = torch.arange(9, dtype=torch.float32, device=torch.device("cuda")).reshape(*(1, 1, 3, 3))
input = tp.cast(tp.Tensor(input_torch), tp_dtype)

@@ -277,8 +277,8 @@ def test_transposed_equivalency(self, torch_dtype, tp_dtype):

expected = conv_layer_torch(input_torch).to(torch_dtype)
expected_transpose = conv_transpose_layer_torch(input_torch).to(torch_dtype)
-        output = conv_layer(input)
-        output_transpose = conv_transpose_layer(input)
+        output = eager_or_compiled(conv_layer, input)
+        output_transpose = eager_or_compiled(conv_transpose_layer, input)

rtol_ = 2e-7 if tp_dtype == tp.float32 else 9e-4
assert tp.allclose(output, tp.Tensor(expected), rtol=rtol_)
@@ -291,7 +291,7 @@ def test_transposed_equivalency(self, torch_dtype, tp_dtype):
assert list(expected.shape) == list(expected_transpose.shape)

@pytest.mark.parametrize("test_case", test_cases_transpose_downscale)
-    def test_transposed_downscale(self, torch_dtype, tp_dtype, test_case):
+    def test_transposed_downscale(self, torch_dtype, tp_dtype, test_case, eager_or_compiled):
input_torch = torch.arange(9, dtype=torch.float32, device=torch.device("cuda")).reshape(*(1, 1, 3, 3))
input = tp.cast(tp.Tensor(input_torch), tp_dtype)

Expand Down Expand Up @@ -320,7 +320,7 @@ def test_transposed_downscale(self, torch_dtype, tp_dtype, test_case):
conv_layer.weight = tp.cast(tp.Tensor(conv_layer_torch.weight.data), tp_dtype)

expected = conv_layer_torch(input_torch).to(torch_dtype)
-        output = conv_layer(input)
+        output = eager_or_compiled(conv_layer, input)

rtol_ = 1e-15 if tp_dtype == tp.float32 else 1e-10
assert tp.allclose(output, tp.Tensor(expected), rtol=rtol_)
5 changes: 2 additions & 3 deletions tripy/tests/integration/test_cumsum.py
@@ -30,11 +30,10 @@ class TestCumsum:
([[[1, 2], [3, 4]], [[5, 6], [7, 8]]], 0, [[[1, 2], [3, 4]], [[6, 8], [10, 12]]]),
],
)
-    def test_cumsum(self, data, dim, expected):
+    def test_cumsum(self, data, dim, expected, eager_or_compiled):
inp = tp.Tensor(data, dtype=tp.float32)

-        out = tp.cumsum(inp, dim=dim)
-
+        out = eager_or_compiled(tp.cumsum, inp, dim=dim)
expected = tp.Tensor(expected, dtype=tp.float32)
assert tp.allclose(out, expected)
assert out.shape == expected.shape
16 changes: 12 additions & 4 deletions tripy/tests/integration/test_dequantize.py
@@ -29,28 +29,36 @@ class TestDequantize:
@pytest.mark.parametrize(
"dtype", [tp.float32, tp.float16, pytest.param(tp.bfloat16, marks=skip_if_older_than_sm80)]
)
-    def test_dequantize_int8_per_tensor(self, dtype):
+    def test_dequantize_int8_per_tensor(self, dtype, eager_or_compiled):
data = [4, 8]
input_tp = tp.Tensor(data, dtype=tp.int8)
scale = torch.tensor(0.5, dtype=TORCH_DTYPES[dtype])
scale_tp = tp.Tensor(scale, dtype=dtype)
-        dequantized = tp.dequantize(input_tp, scale_tp, dtype)
+
+        def func(input):
+            return tp.dequantize(input, scale_tp, dtype)
+
+        dequantized = eager_or_compiled(func, input_tp)
expected = torch.tensor(data) * scale
output = torch.from_dlpack(dequantized)
assert torch.allclose(expected, output.to("cpu"))

@pytest.mark.parametrize(
"dtype", [tp.float32, tp.float16, pytest.param(tp.bfloat16, marks=skip_if_older_than_sm80)]
)
-    def test_dequantize_int8_per_channel(self, dtype):
+    def test_dequantize_int8_per_channel(self, dtype, eager_or_compiled):
# TODO: Fix in #153
if dtype == tp.float16:
pytest.skip("TRT does not support fp16->int8 per-channel dequant.")
data = [[4, 8], [4, 8]]
input_tp = tp.Tensor(data, dtype=tp.int8)
scale = torch.tensor([0.8, 0.9], dtype=TORCH_DTYPES[dtype])
scale_tp = tp.Tensor(scale, dtype=dtype)
-        dequantized = tp.dequantize(input_tp, scale_tp, dtype, dim=0)
+
+        def func(input):
+            return tp.dequantize(input, scale_tp, dtype, dim=0)
+
+        dequantized = eager_or_compiled(func, input_tp)
expected = torch.tensor(data) * scale.reshape((2, 1))
output = torch.from_dlpack(dequantized)
assert torch.allclose(expected, output.to("cpu"))
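
Note the wrapper pattern in both dequantize tests: scale_tp and dtype are closed over rather than passed through the fixture. Because scale_tp is a tp.Tensor, passing it as an argument would turn it into a dynamic compile-time input, and quantization scales generally must be known constants when the engine is built. A generic sketch of the same pattern (the make_dequant helper is hypothetical, not part of this commit):

def make_dequant(scale_tp, dtype):
    # scale and dtype are captured by the closure; only `input` becomes a
    # tp.InputInfo when the fixture compiles the function.
    def func(input):
        return tp.dequantize(input, scale_tp, dtype)

    return func

dequantized = eager_or_compiled(make_dequant(scale_tp, dtype), input_tp)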
(Diff truncated: the remaining 25 changed files are not shown.)
