[CUDA] Fix alpha not respected when using bias epilogue (#2578)

2025-12-16 01:49:05 +08:00 · 2025-09-10 09:08:01 +09:00
parent dde3682b69
commit 44cc5da4bc
6 changed files with 146 additions and 125 deletions
--- a/python/tests/test_blas.py
+++ b/python/tests/test_blas.py
@@ -594,124 +594,123 @@ class TestBlas(mlx_tests.MLXTestCase):
        np.random.seed(0)
        # Batched matmul
        alpha = 0.5
-        beta = 2.0
+        for beta in (1.0, 2.0):
+            # c must broadcast to the output shape
+            with self.assertRaises(ValueError):
+                mx.addmm(mx.zeros((2, 2, 2)), mx.zeros((2, 2)), mx.zeros((2, 2)))

-        # c must broadcast to the output shape
-        with self.assertRaises(ValueError):
-            mx.addmm(mx.zeros((2, 2, 2)), mx.zeros((2, 2)), mx.zeros((2, 2)))
+            # Regular batched case
+            a_npy = np.random.normal(0.0, 1.0 / 128, (32, 128, 16)).astype(np.float32)
+            b_npy = np.random.normal(0.0, 1.0 / 128, (32, 16, 16)).astype(np.float32)

-        # Regular batched case
-        a_npy = np.random.normal(0.0, 1.0 / 128, (32, 128, 16)).astype(np.float32)
-        b_npy = np.random.normal(0.0, 1.0 / 128, (32, 16, 16)).astype(np.float32)
+            a_mlx = mx.array(a_npy)
+            b_mlx = mx.array(b_npy)

-        a_mlx = mx.array(a_npy)
-        b_mlx = mx.array(b_npy)
+            for c_shape in ((1,), (1, 16), (32, 1, 16), (1, 128, 16)):
+                c_npy = np.ones(c_shape).astype(np.float32)
+                c_mlx = mx.array(c_npy)

-        for c_shape in ((1,), (1, 16), (32, 1, 16), (1, 128, 16)):
-            c_npy = np.ones(c_shape).astype(np.float32)
-            c_mlx = mx.array(c_npy)
+                d_npy = alpha * (a_npy @ b_npy) + beta * c_npy
+                d_mlx = mx.addmm(c_mlx, a_mlx, b_mlx, alpha, beta)

-            d_npy = alpha * (a_npy @ b_npy) + beta * c_npy
-            d_mlx = mx.addmm(c_mlx, a_mlx, b_mlx, alpha, beta)
+                self.assertListEqual(list(d_npy.shape), list(d_mlx.shape))
+                self.assertTrue(np.allclose(d_mlx, d_npy, atol=1e-5))

-            self.assertListEqual(list(d_npy.shape), list(d_mlx.shape))
-            self.assertTrue(np.allclose(d_mlx, d_npy, atol=1e-5))
+            # Batched and transposed matmul
+            b_npy = np.random.normal(0.0, 1.0 / 128, (32, 128, 16)).astype(np.float32)
+            b_mlx = mx.array(b_npy)

-        # Batched and transposed matmul
-        b_npy = np.random.normal(0.0, 1.0 / 128, (32, 128, 16)).astype(np.float32)
-        b_mlx = mx.array(b_npy)
+            for c_shape in ((1,), (32, 1, 128), (1, 128)):
+                c_npy = np.ones(c_shape).astype(np.float32)
+                c_mlx = mx.array(c_npy)

-        for c_shape in ((1,), (32, 1, 128), (1, 128)):
-            c_npy = np.ones(c_shape).astype(np.float32)
-            c_mlx = mx.array(c_npy)
+                b_np_t = np.transpose(b_npy, (0, 2, 1))
+                b_mx_t = mx.transpose(b_mlx, (0, 2, 1))

-            b_np_t = np.transpose(b_npy, (0, 2, 1))
-            b_mx_t = mx.transpose(b_mlx, (0, 2, 1))
+                d_npy = alpha * (a_npy @ b_np_t) + beta * c_npy
+                d_mlx = mx.addmm(c_mlx, a_mlx, b_mx_t, alpha, beta)

-            d_npy = alpha * (a_npy @ b_np_t) + beta * c_npy
-            d_mlx = mx.addmm(c_mlx, a_mlx, b_mx_t, alpha, beta)
+                self.assertListEqual(list(d_npy.shape), list(d_mlx.shape))
+                self.assertTrue(np.allclose(d_mlx, d_npy, atol=1e-5))
+            # Batched matmul with simple broadcast
+            a_npy = np.random.normal(0.0, 1.0 / 128, (32, 128, 16)).astype(np.float32)
+            b_npy = np.random.normal(0.0, 1.0 / 128, (16, 16)).astype(np.float32)

-            self.assertListEqual(list(d_npy.shape), list(d_mlx.shape))
-            self.assertTrue(np.allclose(d_mlx, d_npy, atol=1e-5))
-        # Batched matmul with simple broadcast
-        a_npy = np.random.normal(0.0, 1.0 / 128, (32, 128, 16)).astype(np.float32)
-        b_npy = np.random.normal(0.0, 1.0 / 128, (16, 16)).astype(np.float32)
+            a_mlx = mx.array(a_npy)
+            b_mlx = mx.array(b_npy)

-        a_mlx = mx.array(a_npy)
-        b_mlx = mx.array(b_npy)
+            for c_shape in ((1,), (1, 16), (32, 1, 16), (1, 128, 16)):
+                c_npy = np.ones(c_shape).astype(np.float32)
+                c_mlx = mx.array(c_npy)

-        for c_shape in ((1,), (1, 16), (32, 1, 16), (1, 128, 16)):
-            c_npy = np.ones(c_shape).astype(np.float32)
-            c_mlx = mx.array(c_npy)
+                d_npy = alpha * (a_npy @ b_npy) + beta * c_npy
+                d_mlx = mx.addmm(c_mlx, a_mlx, b_mlx, alpha, beta)

-            d_npy = alpha * (a_npy @ b_npy) + beta * c_npy
-            d_mlx = mx.addmm(c_mlx, a_mlx, b_mlx, alpha, beta)
+                self.assertListEqual(list(d_npy.shape), list(d_mlx.shape))
+                self.assertTrue(np.allclose(d_mlx, d_npy, atol=1e-5))
+            # Matmul with vector
+            a_npy = np.random.normal(0.0, 1.0 / 128, (16,)).astype(np.float32)
+            b_npy = np.random.normal(0.0, 1.0 / 128, (32, 16, 128)).astype(np.float32)
+            a_mlx = mx.array(a_npy)
+            b_mlx = mx.array(b_npy)

-            self.assertListEqual(list(d_npy.shape), list(d_mlx.shape))
-            self.assertTrue(np.allclose(d_mlx, d_npy, atol=1e-5))
-        # Matmul with vector
-        a_npy = np.random.normal(0.0, 1.0 / 128, (16,)).astype(np.float32)
-        b_npy = np.random.normal(0.0, 1.0 / 128, (32, 16, 128)).astype(np.float32)
-        a_mlx = mx.array(a_npy)
-        b_mlx = mx.array(b_npy)
+            for c_shape in ((1,), (128,), (32, 128)):
+                c_npy = np.ones(c_shape).astype(np.float32)
+                c_mlx = mx.array(c_npy)

-        for c_shape in ((1,), (128,), (32, 128)):
-            c_npy = np.ones(c_shape).astype(np.float32)
-            c_mlx = mx.array(c_npy)
+                d_npy = alpha * (a_npy @ b_npy) + beta * c_npy
+                d_mlx = mx.addmm(c_mlx, a_mlx, b_mlx, alpha, beta)

-            d_npy = alpha * (a_npy @ b_npy) + beta * c_npy
-            d_mlx = mx.addmm(c_mlx, a_mlx, b_mlx, alpha, beta)
+                self.assertListEqual(list(d_npy.shape), list(d_mlx.shape))
+                self.assertTrue(np.allclose(d_mlx, d_npy, atol=1e-5))

-            self.assertListEqual(list(d_npy.shape), list(d_mlx.shape))
-            self.assertTrue(np.allclose(d_mlx, d_npy, atol=1e-5))
+            # Matmul with vector
+            a_npy = np.random.normal(0.0, 1.0 / 128, (32, 128, 16)).astype(np.float32)
+            b_npy = np.random.normal(0.0, 1.0 / 128, (16,)).astype(np.float32)
+            a_mlx = mx.array(a_npy)
+            b_mlx = mx.array(b_npy)

-        # Matmul with vector
-        a_npy = np.random.normal(0.0, 1.0 / 128, (32, 128, 16)).astype(np.float32)
-        b_npy = np.random.normal(0.0, 1.0 / 128, (16,)).astype(np.float32)
-        a_mlx = mx.array(a_npy)
-        b_mlx = mx.array(b_npy)
+            for c_shape in ((1,), (32, 128)):
+                c_npy = np.ones(c_shape).astype(np.float32)
+                c_mlx = mx.array(c_npy)

-        for c_shape in ((1,), (32, 128)):
-            c_npy = np.ones(c_shape).astype(np.float32)
-            c_mlx = mx.array(c_npy)
+                d_npy = alpha * (a_npy @ b_npy) + beta * c_npy
+                d_mlx = mx.addmm(c_mlx, a_mlx, b_mlx, alpha, beta)

-            d_npy = alpha * (a_npy @ b_npy) + beta * c_npy
-            d_mlx = mx.addmm(c_mlx, a_mlx, b_mlx, alpha, beta)
+                self.assertListEqual(list(d_npy.shape), list(d_mlx.shape))
+                self.assertTrue(np.allclose(d_mlx, d_npy, atol=1e-5))

-            self.assertListEqual(list(d_npy.shape), list(d_mlx.shape))
-            self.assertTrue(np.allclose(d_mlx, d_npy, atol=1e-5))
+            # Split K specializtion
+            a_npy = np.random.normal(0.0, 1.0 / 128, (64, 4096)).astype(np.float32)
+            b_npy = np.random.normal(0.0, 1.0 / 128, (4096, 32)).astype(np.float32)

-        # Split K specializtion
-        a_npy = np.random.normal(0.0, 1.0 / 128, (64, 4096)).astype(np.float32)
-        b_npy = np.random.normal(0.0, 1.0 / 128, (4096, 32)).astype(np.float32)
+            a_mlx = mx.array(a_npy)
+            b_mlx = mx.array(b_npy)

-        a_mlx = mx.array(a_npy)
-        b_mlx = mx.array(b_npy)
+            for c_shape in ((1,), (1, 32), (64, 1), (64, 32)):
+                c_npy = np.ones(c_shape).astype(np.float32)
+                c_mlx = mx.array(c_npy)

-        for c_shape in ((1,), (1, 32), (64, 1), (64, 32)):
-            c_npy = np.ones(c_shape).astype(np.float32)
-            c_mlx = mx.array(c_npy)
+                d_npy = alpha * (a_npy @ b_npy) + beta * c_npy
+                d_mlx = mx.addmm(c_mlx, a_mlx, b_mlx, alpha, beta)

-            d_npy = alpha * (a_npy @ b_npy) + beta * c_npy
-            d_mlx = mx.addmm(c_mlx, a_mlx, b_mlx, alpha, beta)
+                self.assertListEqual(list(d_npy.shape), list(d_mlx.shape))
+                self.assertTrue(np.allclose(d_mlx, d_npy, atol=1e-5))

-            self.assertListEqual(list(d_npy.shape), list(d_mlx.shape))
-            self.assertTrue(np.allclose(d_mlx, d_npy, atol=1e-5))
+            # Transposed c
+            a = mx.ones((10, 5)).T
+            b = mx.ones((5, 5))
+            out = mx.addmm(a, b, a, beta=beta, alpha=alpha)
+            expected = beta * a + alpha * (b @ a)
+            self.assertTrue(mx.allclose(expected, out))

-        # Transposed c
-        a = mx.ones((10, 5)).T
-        b = mx.ones((5, 5))
-        out = mx.addmm(a, b, a, beta=1.5, alpha=0.5)
-        expected = 1.5 * a + 0.5 * (b @ a)
-        self.assertTrue(mx.allclose(expected, out))
-
-        # Broadcast c
-        a = mx.ones((5, 5))
-        b = mx.ones((5, 5))
-        c = mx.ones((1, 5))
-        out = mx.addmm(c, a, b, beta=1.5, alpha=0.5)
-        expected = 1.5 * c + 0.5 * (a @ b)
-        self.assertTrue(mx.allclose(expected, out))
+            # Broadcast c
+            a = mx.ones((5, 5))
+            b = mx.ones((5, 5))
+            c = mx.ones((1, 5))
+            out = mx.addmm(c, a, b, beta=beta, alpha=alpha)
+            expected = beta * c + alpha * (a @ b)
+            self.assertTrue(mx.allclose(expected, out))

    def test_addmm_grad(self):
        def make_ref_addmm(alpha, beta):
@@ -724,33 +723,32 @@ class TestBlas(mlx_tests.MLXTestCase):
        shapes = ((1, 64, 32, 128), (4, 28, 24, 47), (1, 1, 24, 47))

        alpha = 2.0
-        beta = 0.5
+        for beta in (1.0, 0.5):
+            f_test = make_addmm(alpha, beta)
+            f_ref = make_ref_addmm(alpha, beta)

-        f_test = make_addmm(alpha, beta)
-        f_ref = make_ref_addmm(alpha, beta)
+            for B, M, N, K in shapes:
+                cotan = mx.ones((B, M, N))
+                c = mx.random.normal((B, M, N))
+                a = mx.random.normal((B, M, K))
+                b = mx.random.normal((B, K, N))

-        for B, M, N, K in shapes:
-            cotan = mx.ones((B, M, N))
-            c = mx.random.normal((B, M, N))
-            a = mx.random.normal((B, M, K))
-            b = mx.random.normal((B, K, N))
+                out_ref, dout_ref = mx.vjp(
+                    f_ref,
+                    [c, a, b],
+                    [cotan],
+                )
+                out_test, dout_test = mx.vjp(
+                    f_test,
+                    [c, a, b],
+                    [cotan],
+                )

-            out_ref, dout_ref = mx.vjp(
-                f_ref,
-                [c, a, b],
-                [cotan],
-            )
-            out_test, dout_test = mx.vjp(
-                f_test,
-                [c, a, b],
-                [cotan],
-            )
+                self.assertTrue(mx.allclose(out_ref[0], out_test[0], atol=1e-4).item())

-            self.assertTrue(mx.allclose(out_ref[0], out_test[0], atol=1e-4).item())
-
-            for r, t in zip(dout_ref, dout_test):
-                self.assertEqual(r.shape, t.shape)
-                self.assertTrue(mx.allclose(r, t, atol=1e-4).item())
+                for r, t in zip(dout_ref, dout_test):
+                    self.assertEqual(r.shape, t.shape)
+                    self.assertTrue(mx.allclose(r, t, atol=1e-4).item())

    def test_empty_matmul(self):
        a = mx.array([[], []]).T