From b5920b7ef47846e4cb37094a89bf49d127b4d802 Mon Sep 17 00:00:00 2001 From: Paul Balanca Date: Fri, 17 Nov 2023 16:03:10 +0000 Subject: [PATCH] Implement `set_scaling` and `stop_scaling` JAX primitives. (#20) * `set_scaling`: Set the scaling of a tensor, transforming into `ScaledArray` in `autoscale` mode. * `stop_scaling`: Stop scale propagation of a tensor, transforming back into a JAX array. Both operations are no-op identity operations in normal JAX mode. Note: as pointed out by @DouglasOrr, these primitives could also be formalized as casting operations, where ScaledDtypes are properly defined. It remains to be clarified whether that would be a better setting for the JAX implementation! --- jax_scaled_arithmetics/core/interpreters.py | 4 +- jax_scaled_arithmetics/lax/__init__.py | 1 + .../lax/base_scaling_primitives.py | 115 ++++++++++++++++++ tests/lax/test_base_scaling_primitives.py | 68 +++++++++++ tests/lax/test_scaled_ops.py | 21 ++-- 5 files changed, 200 insertions(+), 9 deletions(-) create mode 100644 jax_scaled_arithmetics/lax/base_scaling_primitives.py create mode 100644 tests/lax/test_base_scaling_primitives.py diff --git a/jax_scaled_arithmetics/core/interpreters.py b/jax_scaled_arithmetics/core/interpreters.py index 91ef749..6e46c13 100644 --- a/jax_scaled_arithmetics/core/interpreters.py +++ b/jax_scaled_arithmetics/core/interpreters.py @@ -113,7 +113,9 @@ def to_scaled_array(val): # Primitive is supported by `autoscale`? if eqn.primitive not in _scaled_ops_registry: - raise NotImplementedError(f"{eqn.primitive} does not have an implementation for ScaledArray inputs yet") + raise NotImplementedError( + f"'{eqn.primitive}' JAX primitive does not have an implementation for ScaledArray inputs yet." 
+ ) outvals = _scaled_ops_registry[eqn.primitive](*invals, **eqn.params) if not eqn.primitive.multiple_results: outvals = [outvals] diff --git a/jax_scaled_arithmetics/lax/__init__.py b/jax_scaled_arithmetics/lax/__init__.py index 65b52cc..148fe69 100644 --- a/jax_scaled_arithmetics/lax/__init__.py +++ b/jax_scaled_arithmetics/lax/__init__.py @@ -1,2 +1,3 @@ # Copyright (c) 2023 Graphcore Ltd. All rights reserved. +from .base_scaling_primitives import set_scaling, set_scaling_p, stop_scaling, stop_scaling_p # noqa: F401 from .scaled_ops import * # noqa: F401, F403 diff --git a/jax_scaled_arithmetics/lax/base_scaling_primitives.py b/jax_scaled_arithmetics/lax/base_scaling_primitives.py new file mode 100644 index 0000000..81f51d8 --- /dev/null +++ b/jax_scaled_arithmetics/lax/base_scaling_primitives.py @@ -0,0 +1,115 @@ +# Copyright (c) 2023 Graphcore Ltd. All rights reserved. +from typing import Optional, Sequence, Union + +import jax +from jax import core +from jax.interpreters import mlir +from jax.interpreters.mlir import LoweringRuleContext, ir + +from jax_scaled_arithmetics.core import DTypeLike, ScaledArray, register_scaled_op + +set_scaling_p = core.Primitive("set_scaling_p") +"""`set_scaling` JAX primitive. + +In standard JAX, this is just an identity operation, ignoring the `scale` +input, just returning unchanged the `data` component. + +In JAX Scaled Arithmetics/AutoScale mode, it will rebalance the data term to +return a ScaledArray semantically equivalent. 
+""" + + +def set_scaling(values: jax.Array, scale: jax.Array) -> jax.Array: + """`set_scaling` primitive call method.""" + return set_scaling_p.bind(values, scale) + + +def set_scaling_impl(values: jax.Array, scale: jax.Array) -> jax.Array: + return values + + +def set_scaling_abstract_eval(values: core.ShapedArray, scale: core.ShapedArray) -> core.ShapedArray: + return values + + +def set_scaling_mlir_lowering( + ctx: LoweringRuleContext, *args: Union[ir.Value, Sequence[ir.Value]] +) -> Sequence[Union[ir.Value, Sequence[ir.Value]]]: + # Just forwarding `values` term, ignoring the `scale`. + return (args[0],) + + +def scaled_set_scaling(values: ScaledArray, scale: ScaledArray) -> ScaledArray: + """Scaled `set_scaling` implementation: rebalancing the data using the new scale value.""" + assert isinstance(values, ScaledArray) + assert isinstance(scale, ScaledArray) + assert scale.shape == () + # TODO/FIXME: handle not scaled inputs!!! + scale_value = scale.to_array() + # Rebalancing data tensor using the new scale. + data = values.data * (values.scale / scale_value) + return ScaledArray(data, scale_value) + + +# Register as standard JAX primitive +set_scaling_p.multiple_results = False +set_scaling_p.def_abstract_eval(set_scaling_abstract_eval) +set_scaling_p.def_impl(set_scaling_impl) +mlir.register_lowering(set_scaling_p, set_scaling_mlir_lowering) +# Register "scaled" translation. +register_scaled_op(set_scaling_p, scaled_set_scaling) + + +stop_scaling_p = core.Primitive("stop_scaling_p") +"""`stop_scaling` JAX primitive. + +In standard JAX, this is just an identity operation (with optional casting). + +In JAX Scaled Arithmetics/AutoScale mode, it will return the value tensor, +with optional casting. 
+ +Similar in principle to `jax.lax.stop_gradient` +""" + + +def stop_scaling(values: jax.Array, dtype: Optional[DTypeLike] = None) -> jax.Array: + """`stop_scaling` primitive call method.""" + return stop_scaling_p.bind(values, dtype=dtype) + + +def stop_scaling_impl(values: jax.Array, dtype: Optional[DTypeLike]) -> jax.Array: + if dtype is not None: + values = values.astype(dtype) + return values + + +def stop_scaling_abstract_eval(values: core.ShapedArray, dtype: Optional[DTypeLike]) -> core.ShapedArray: + return values.update(dtype=dtype) + + +def stop_scaling_mlir_lowering( + ctx: LoweringRuleContext, *args: Union[ir.Value, Sequence[ir.Value]], **params +) -> Sequence[Union[ir.Value, Sequence[ir.Value]]]: + dtype = params.get("dtype", None) + if dtype is not None: + # TODO: caching of the MLIR lowered function? + stop_scaling_mlir_fn = mlir.lower_fun(lambda x: x.astype(dtype), multiple_results=False) + return stop_scaling_mlir_fn(ctx, *args) + # By default: forward tensor. + return (args[0],) + + +def scaled_stop_scaling(values: ScaledArray, dtype: Optional[DTypeLike] = None) -> jax.Array: + """Scaled `stop_scaling` implementation: returning tensor values (with optional cast).""" + assert isinstance(values, ScaledArray) + # TODO/FIXME: how to handle not scaled input. + return values.to_array(dtype=dtype) + + +# Register as standard JAX primitive +stop_scaling_p.multiple_results = False +stop_scaling_p.def_abstract_eval(stop_scaling_abstract_eval) +stop_scaling_p.def_impl(stop_scaling_impl) +mlir.register_lowering(stop_scaling_p, stop_scaling_mlir_lowering) +# Register "scaled" translation. +register_scaled_op(stop_scaling_p, scaled_stop_scaling) diff --git a/tests/lax/test_base_scaling_primitives.py b/tests/lax/test_base_scaling_primitives.py new file mode 100644 index 0000000..fa5f188 --- /dev/null +++ b/tests/lax/test_base_scaling_primitives.py @@ -0,0 +1,68 @@ +# Copyright (c) 2023 Graphcore Ltd. All rights reserved. 
+import chex +import jax +import jax.numpy as jnp +import numpy as np +import numpy.testing as npt + +from jax_scaled_arithmetics.core import ScaledArray, autoscale, scaled_array +from jax_scaled_arithmetics.lax import set_scaling, stop_scaling + + +class SetScalingPrimitiveTests(chex.TestCase): + @chex.variants(with_jit=True, without_jit=True) + def test__set_scaling_primitive__proper_result_without_autoscale(self): + def fn(arr, scale): + return set_scaling(arr, scale) + + fn = self.variant(fn) + arr = jnp.array([2, 3], dtype=np.float32) + scale = jnp.array(4, dtype=np.float32) + out = fn(arr, scale) + npt.assert_array_equal(out, arr) + + @chex.variants(with_jit=True, without_jit=True) + def test__set_scaling_primitive__proper_result_with_autoscale(self): + def fn(arr, scale): + return set_scaling(arr, scale) + + fn = self.variant(autoscale(fn)) + arr = scaled_array([-1.0, 2.0], 1.0, dtype=np.float32) + # TODO: support scalar here! + scale = scaled_array(1.0, 4.0, dtype=np.float32) + out = fn(arr, scale) + # Unchanged output tensor! + assert isinstance(out, ScaledArray) + npt.assert_array_equal(out.scale, scale) + npt.assert_array_equal(out, arr) + + +class StopScalingPrimitiveTests(chex.TestCase): + @chex.variants(with_jit=True, without_jit=True) + def test__stop_scaling_primitive__proper_result_without_autoscale(self): + def fn(arr): + # Testing both variants. + return stop_scaling(arr), stop_scaling(arr, dtype=np.float16) + + arr = jnp.array([2, 3], dtype=np.float32) + out0, out1 = self.variant(fn)(arr) + assert out0.dtype == arr.dtype + assert out1.dtype == np.float16 + npt.assert_array_equal(out0, arr) + npt.assert_array_almost_equal(out1, arr) + + @chex.variants(with_jit=True, without_jit=True) + def test__stop_scaling_primitive__proper_result_with_autoscale(self): + def fn(arr): + # Testing both variants. 
+ return stop_scaling(arr), stop_scaling(arr, dtype=np.float16) + + fn = self.variant(autoscale(fn)) + arr = scaled_array([-1.0, 2.0], 3.0, dtype=np.float32) + out0, out1 = fn(arr) + assert isinstance(out0, jax.Array) + assert isinstance(out1, jax.Array) + assert out0.dtype == arr.dtype + assert out1.dtype == np.float16 + npt.assert_array_equal(out0, arr) + npt.assert_array_almost_equal(out1, arr) diff --git a/tests/lax/test_scaled_ops.py b/tests/lax/test_scaled_ops.py index e2ad2f8..f733f05 100644 --- a/tests/lax/test_scaled_ops.py +++ b/tests/lax/test_scaled_ops.py @@ -18,37 +18,42 @@ class ScaledTranslationPrimitivesTests(chex.TestCase): + def setUp(self): + super().setUp() + # Use random state for reproducibility! + self.rs = np.random.RandomState(42) + def test__scaled_broadcast_in_dim__proper_scaling(self): - x = scaled_array(np.random.rand(5), 2, dtype=np.float32) + x = scaled_array(self.rs.rand(5), 2, dtype=np.float32) z = scaled_broadcast_in_dim(x, shape=(5, 1), broadcast_dimensions=(0,)) assert isinstance(z, ScaledArray) npt.assert_array_equal(z.scale, x.scale) npt.assert_array_almost_equal(z.data, x.data.reshape((5, 1))) def test__scaled_concatenate__proper_scaling(self): - x = scaled_array(np.random.rand(2, 3), 0.5, dtype=np.float32) - y = scaled_array(np.random.rand(5, 3), 2, dtype=np.float32) + x = scaled_array(self.rs.rand(2, 3), 0.5, dtype=np.float32) + y = scaled_array(self.rs.rand(5, 3), 2, dtype=np.float32) z = scaled_concatenate([x, y], dimension=0) assert isinstance(z, ScaledArray) npt.assert_array_equal(z.scale, y.scale) npt.assert_array_almost_equal(z, np.concatenate([x, y], axis=0)) def test__scaled_convert_element_type__proper_scaling(self): - x = scaled_array(np.random.rand(5), 2, dtype=np.float32) + x = scaled_array(self.rs.rand(5), 2, dtype=np.float32) z = scaled_convert_element_type(x, new_dtype=np.float16) assert isinstance(z, ScaledArray) npt.assert_array_equal(z.scale, x.scale) npt.assert_array_almost_equal(z.data, 
x.data.astype(z.dtype)) def test__scaled_transpose__proper_scaling(self): - x = scaled_array(np.random.rand(3, 5), 2, dtype=np.float32) + x = scaled_array(self.rs.rand(3, 5), 2, dtype=np.float32) z = scaled_transpose(x, (1, 0)) assert isinstance(z, ScaledArray) assert z.scale == x.scale npt.assert_array_almost_equal(z.data, x.data.T) def test__scaled_slice__proper_scaling(self): - x = scaled_array(np.random.rand(5), 2, dtype=np.float32) + x = scaled_array(self.rs.rand(5), 2, dtype=np.float32) z = scaled_slice(x, (1,), (4,), (2,)) assert isinstance(z, ScaledArray) assert z.scale == x.scale @@ -81,8 +86,8 @@ def test__scaled_sub__proper_scaling(self): npt.assert_array_almost_equal(z, np.asarray(x) - np.asarray(y)) def test__scaled_dot_general__proper_scaling(self): - lhs = scaled_array(np.random.rand(3, 5), 2.0, dtype=np.float32) - rhs = scaled_array(np.random.rand(5, 2), 3.0, dtype=np.float32) + lhs = scaled_array(self.rs.rand(3, 5), 2.0, dtype=np.float32) + rhs = scaled_array(self.rs.rand(5, 2), 3.0, dtype=np.float32) out = scaled_dot_general(lhs, rhs, (((1,), (0,)), ((), ()))) assert isinstance(out, ScaledArray) assert out.dtype == lhs.dtype