Add softmin, hardshrink, hardtanh (#1180)

---------

Co-authored-by: Nikhil Mehta <nikmehta@tesla.com>
Authored by Nikhil Mehta on 2024-06-04 15:48:18 -07:00, committed by GitHub
parent 83b11bc58d
commit 0b7d71fd2f
14 changed files with 110 additions and 20 deletions


@@ -9,7 +9,6 @@ from time_utils import time_fn
def bench_gelu():
    def gelu(x):
        return x * (1 + mx.erf(x / math.sqrt(2))) / 2
@@ -51,7 +50,6 @@ def bench_gelu():
def bench_layernorm():
    weight = mx.random.uniform(shape=(4096,)).astype(mx.float16)
    bias = mx.random.uniform(shape=(4096,)).astype(mx.float16)
    mx.eval(weight, bias)


@@ -54,7 +54,6 @@ def make_pt_conv_2D(strides=(1, 1), padding=(0, 0), groups=1):
def bench_shape(N, H, W, C, kH, kW, O, strides, padding, groups, np_dtype):
    scale = 1.0 / math.sqrt(kH * kH * C)
    a_np = np.random.uniform(0, 0.5, (N, H, W, C)).astype(np_dtype)
    b_np = np.random.uniform(-scale, scale, (O, kH, kW, int(C / groups))).astype(


@@ -35,7 +35,6 @@ def run_bench(system_size):
def time_fft():
    with mx.stream(mx.cpu):
        cpu_bandwidths = run_bench(system_size=int(2**22))


@@ -10,7 +10,6 @@ SEQ_INCREMENT = 50
def time_self_attention_primitives():
    mx.random.seed(3)
    B = 2
    H = 38
@@ -32,7 +31,6 @@ def time_self_attention_primitives():
def time_self_attention_sdpa():
    mx.random.seed(3)
    B = 2
    H = 38


@@ -17,6 +17,8 @@ simple functions.
   gelu_approx
   gelu_fast_approx
   glu
   hard_shrink
   hard_tanh
   hardswish
   leaky_relu
   log_sigmoid
@@ -29,6 +31,7 @@ simple functions.
   sigmoid
   silu
   softmax
   softmin
   softplus
   softshrink
   step


@@ -21,10 +21,15 @@ Layers
   Dropout3d
   Embedding
   GELU
   GLU
   GroupNorm
   GRU
   HardShrink
   HardTanh
   Hardswish
   InstanceNorm
   LayerNorm
   LeakyReLU
   Linear
   LSTM
   MaxPool1d
@@ -36,13 +41,19 @@ Layers
   QuantizedLinear
   RMSNorm
   ReLU
   ReLU6
   RNN
   RoPE
   SELU
   Sequential
   SiLU
   SinusoidalPositionalEncoding
   Softmin
   Softshrink
   Softsign
   Softmax
   Softplus
   Step
   Tanh
   Transformer
   Upsample


@@ -6,7 +6,9 @@ from mlx.nn.layers.activations import (
    GELU,
    GLU,
    SELU,
    HardShrink,
    Hardswish,
    HardTanh,
    LeakyReLU,
    LogSigmoid,
    LogSoftmax,
@@ -17,6 +19,7 @@ from mlx.nn.layers.activations import (
    Sigmoid,
    SiLU,
    Softmax,
    Softmin,
    Softplus,
    Softshrink,
    Softsign,
@@ -28,6 +31,8 @@ from mlx.nn.layers.activations import (
    gelu_approx,
    gelu_fast_approx,
    glu,
    hard_shrink,
    hard_tanh,
    hardswish,
    leaky_relu,
    log_sigmoid,
@@ -40,6 +45,7 @@ from mlx.nn.layers.activations import (
    sigmoid,
    silu,
    softmax,
    softmin,
    softplus,
    softshrink,
    softsign,


@@ -286,6 +286,38 @@ def hardswish(x):
    return x * mx.minimum(max_x_3, 6) / 6


@partial(mx.compile, shapeless=True)
def hard_tanh(x, min_val=-1.0, max_val=1.0):
    r"""Applies the HardTanh function.

    Applies :math:`\max(\min(x, \text{max\_val}), \text{min\_val})` element-wise.
    """
    return mx.minimum(mx.maximum(x, min_val), max_val)


@partial(mx.compile, shapeless=True)
def hard_shrink(x, lambd=0.5):
    r"""Applies the HardShrink activation function.

    .. math::
        \text{hardshrink}(x) = \begin{cases}
        x & \text{if } x > \lambda \\
        x & \text{if } x < -\lambda \\
        0 & \text{otherwise}
        \end{cases}
    """
    return mx.where(mx.abs(x) > lambd, x, 0)


@partial(mx.compile, shapeless=True)
def softmin(x, axis=-1):
    r"""Applies the Softmin function.

    Applies :math:`\frac{e^{-x_i}}{\sum_j e^{-x_j}}` element-wise.
    """
    return mx.softmax(-x, axis=axis)


def tanh(x):
    """Applies the hyperbolic tangent function.
@@ -579,3 +611,30 @@ class SELU(Module):
    See :func:`selu` for the functional equivalent.
    """


@_make_activation_module(hard_tanh)
class HardTanh(Module):
    r"""Applies the HardTanh function.

    See :func:`hard_tanh` for the functional equivalent.
    """


@_make_activation_module(hard_shrink)
class HardShrink(Module):
    r"""Applies the HardShrink function.

    See :func:`hard_shrink` for the functional equivalent.

    Args:
        lambd: the :math:`\lambda` value for Hardshrink. Default: ``0.5``
    """


@_make_activation_module(softmin)
class Softmin(Module):
    r"""Applies the Softmin function.

    See :func:`softmin` for the functional equivalent.
    """


@@ -950,7 +950,6 @@ class TestBlas(mlx_tests.MLXTestCase):
            lhs_indices=lhs_indices,
            rhs_indices=rhs_indices,
        ):
            a_np = np.random.normal(size=batch_A + (M, K)).astype(np_dtype)
            b_np = np.random.normal(size=batch_B + (K, N)).astype(np_dtype)
@@ -1066,7 +1065,6 @@ class TestBlas(mlx_tests.MLXTestCase):
        self.assertTrue(np.allclose(out_np, out_mx, atol=1e-5))

    def test_gather_matmul_grad(self):
        lhs_indices = mx.array([[7, 6], [4, 1], [0, 2]], dtype=mx.uint32)
        rhs_indices = mx.array([[2], [0], [1]], dtype=mx.uint32)


@@ -382,7 +382,6 @@ class TestCompile(mlx_tests.MLXTestCase):
        self.assertFalse(mx.allclose(fun(), fun(), 1e-2, 1e-2))

    def test_compile_kwargs(self):
        @mx.compile
        def fun(x, y, z):
            return x + y + z
@@ -479,7 +478,6 @@ class TestCompile(mlx_tests.MLXTestCase):
        self.assertTrue(mx.array_equal(fun(x2), cfun(x2)))

    def test_compile_with_constant(self):
        # Test float
        @partial(mx.compile)
        def fun(x, y):
@@ -582,7 +580,6 @@ class TestCompile(mlx_tests.MLXTestCase):
        self.assertEqual(counter[0], 2)

    def test_compile_inf(self):
        @mx.compile
        def fun(x):
            return mx.isinf(x + 2)
@@ -591,7 +588,6 @@ class TestCompile(mlx_tests.MLXTestCase):
        self.assertEqual(out.item(), False)

    def test_unsupported_input_types(self):
        class MyClass:
            value = 1


@@ -672,7 +672,6 @@ class TestConv(mlx_tests.MLXTestCase):
            np_dtype=np.float32,
            atol=1e-5,
        ):
            with self.subTest(
                in_shape=in_shape,
                wt_shape=wt_shape,
@@ -684,7 +683,6 @@ class TestConv(mlx_tests.MLXTestCase):
                flip=flip,
                np_dtype=np_dtype,
            ):
                scale = 1.0 / math.sqrt(np.prod(wt_shape[1:]))
                in_np = np.random.normal(0.0, scale, in_shape).astype(np_dtype)
                wt_np = np.random.normal(0.0, scale, wt_shape).astype(np_dtype)
@@ -710,7 +708,6 @@ class TestConv(mlx_tests.MLXTestCase):
        def conv_general_pt(
            inp, wt, stride, padding, kernel_dilation, input_dilation, groups, flip
        ):
            C = inp.size()[1]
            ndim = inp.ndim - 2
            map_ints = lambda x: [x] * ndim if isinstance(x, int) else x


@@ -15,7 +15,6 @@ def mlx_primitives_sdpa(q, k, v, scale):
# SDPA for GQA (n_heads > n_kv_heads, n_kv_heads > 1, n_heads % n_kv_heads == 0)
def mlx_primitives_sdpa_with_gqa(q, k, v, scale):
    n_repeats = q.shape[1] // k.shape[1]
    # borrowing kv cache tiling from mlx-examples/llms/mistral/mistral.py
@@ -34,7 +33,6 @@ def mlx_primitives_sdpa_with_gqa(q, k, v, scale):
class TestFastSelfAttentionSDPA(mlx_tests.MLXTestCase):
    def test_fast_sdpa(self):
        # Not yet supported:
        # * K pre-transposed in kernel, V pre-transposed in kernel
        np.random.seed(0)


@@ -7,7 +7,6 @@ import mlx_tests
class TestMetal(mlx_tests.MLXTestCase):
    @unittest.skipIf(not mx.metal.is_available(), "Metal is not available")
    def test_memory_info(self):
        old_limit = mx.metal.set_cache_limit(0)


@@ -55,9 +55,7 @@ class TestBase(mlx_tests.MLXTestCase):
        m.apply_to_modules(assert_training)

    def test_module_attributes(self):
        class Model(nn.Module):
            def __init__(self):
                super().__init__()
                self.val = None
@@ -806,6 +804,15 @@ class TestLayers(mlx_tests.MLXTestCase):
        self.assertEqual(y.shape, (3,))
        self.assertEqual(y.dtype, mx.float32)

    def test_softmin(self):
        x = mx.array([1.0, 2.0, 3.0])
        y = nn.softmin(x)
        epsilon = 1e-4
        expected_y = mx.array([0.6652, 0.2447, 0.0900])
        self.assertTrue(mx.all(mx.abs(y - expected_y) < epsilon))
        self.assertEqual(y.shape, (3,))
        self.assertEqual(y.dtype, mx.float32)

    def test_softplus(self):
        x = mx.array([1.0, -1.0, 0.0])
        y = nn.softplus(x)
@@ -899,6 +906,28 @@ class TestLayers(mlx_tests.MLXTestCase):
        out = nn.glu(x)
        self.assertEqualArray(out, y)
    def test_hard_tanh(self):
        x = mx.array([1.0, -2.0, 0.0, 0.5, 2.0])
        y = nn.hard_tanh(x)
        expected_y = mx.array([1.0, -1.0, 0.0, 0.5, 1.0])
        self.assertTrue(mx.array_equal(y, expected_y))
        self.assertEqual(y.shape, (5,))
        self.assertEqual(y.dtype, mx.float32)
    def test_hard_shrink(self):
        # hard_shrink uses a strict threshold, so |x| == lambd is zeroed.
        x = mx.array([1.0, -0.5, 0.0, 0.5, -1.5])
        y = nn.hard_shrink(x)
        expected_y = mx.array([1.0, 0.0, 0.0, 0.0, -1.5])
        self.assertTrue(mx.array_equal(y, expected_y))
        self.assertEqual(y.shape, (5,))
        self.assertEqual(y.dtype, mx.float32)

        y = nn.hard_shrink(x, lambd=1.0)
        expected_y = mx.array([0.0, 0.0, 0.0, 0.0, -1.5])
        self.assertTrue(mx.array_equal(y, expected_y))
        self.assertEqual(y.shape, (5,))
        self.assertEqual(y.dtype, mx.float32)
    def test_rope(self):
        for kwargs in [{}, {"traditional": False}, {"base": 10000}, {"scale": 0.25}]:
            rope = nn.RoPE(4, **kwargs)