Add quantize/dequantize for mxfp8 and nvfp4 (#2688)

* Add quantize/dequantize slow path for mxfp8 and nvfp4
* fast cuda kernel for mx/nv quantization
* fallback for cuda < 12.8 (#2697)
* format (#2700)
* fix (#2701)
* metal kernels
* docs
* fix jit
* add default bits and group sizes
* improve quant docs
* fix output type of mxfp4 matmuls
2025-12-16 01:49:05 +08:00 · 2025-10-28 16:23:12 -07:00
parent 460691a0e8
commit ec72b44417
25 changed files with 1400 additions and 588 deletions
--- a/python/tests/test_quantized.py
+++ b/python/tests/test_quantized.py
@@ -55,26 +55,109 @@ class TestQuantized(mlx_tests.MLXTestCase):

        # Invalid bits / group size
        with self.assertRaises(ValueError):
-            mx.quantize(w, bits=3, group_size=32, mode="mxfp4")
+            mx.quantize(w, bits=3, mode="mxfp4")

        with self.assertRaises(ValueError):
-            mx.quantize(w, group_size=64, bits=4, mode="mxfp4")
+            mx.quantize(w, group_size=64, mode="mxfp4")

-        w_q, scales = mx.quantize(w, group_size=32, bits=4, mode="mxfp4")
+        w_q, scales = mx.quantize(w, mode="mxfp4")
+        with self.assertRaises(ValueError):
+            mx.dequantize(w_q, scales, bits=3, mode="mxfp4")

        with self.assertRaises(ValueError):
-            mx.dequantize(w_q, scales, bits=3, group_size=32, mode="mxfp4")
+            mx.dequantize(w_q, scales, group_size=64, mode="mxfp4")

+        # Invalid output type
        with self.assertRaises(ValueError):
-            mx.dequantize(w_q, scales, group_size=64, bits=4, mode="mxfp4")
+            mx.dequantize(
+                w_q, scales, group_size=32, bits=4, mode="mxfp4", dtype=mx.int32
+            )

-        w_hat = mx.dequantize(w_q, scales, group_size=32, bits=4, mode="mxfp4")
+        w_hat = mx.dequantize(w_q, scales, mode="mxfp4")
        self.assertTrue(mx.allclose(w, w_hat, rtol=1e-5, atol=1e-5))

        # test quantize/dequantize 0s
        a = mx.zeros((256, 512))
-        w_q, scales = mx.quantize(a, group_size=32, bits=4, mode="mxfp4")
-        w_hat = mx.dequantize(w_q, scales, group_size=32, bits=4, mode="mxfp4")
+        w_q, scales = mx.quantize(a, mode="mxfp4")
+        w_hat = mx.dequantize(w_q, scales, mode="mxfp4")
+        self.assertTrue(mx.all(w_hat == 0))
+
+    def test_mxfp8_quantize_dequantize(self):
+        w = 2 * mx.random.uniform(shape=(512, 32)) - 1
+        w = w.astype(mx.bfloat16)
+
+        # Invalid bits / group size
+        with self.assertRaises(ValueError):
+            mx.quantize(w, bits=3, mode="mxfp8")
+
+        with self.assertRaises(ValueError):
+            mx.quantize(w, group_size=32, bits=7, mode="mxfp8")
+        w_q, scales = mx.quantize(w, group_size=32, mode="mxfp8")
+
+        with self.assertRaises(ValueError):
+            mx.dequantize(w_q, scales, group_size=16, mode="mxfp8")
+
+        with self.assertRaises(ValueError):
+            mx.dequantize(w_q, scales, bits=4, mode="mxfp8")
+
+        w_hat = mx.dequantize(w_q, scales, mode="mxfp8")
+
+        self.assertTrue(mx.allclose(w, w_hat, rtol=1e-1, atol=1e-1))
+
+        # test quantize/dequantize 0s
+        a = mx.zeros((256, 512))
+        w_q, scales = mx.quantize(a, mode="mxfp8")
+        w_hat = mx.dequantize(w_q, scales, mode="mxfp8")
+        self.assertTrue(mx.all(w_hat == 0))
+
+    def test_nvfp4_quantize_dequantize(self):
+        lut = mx.array(
+            [
+                +0.0,
+                +0.5,
+                +1.0,
+                +1.5,
+                +2.0,
+                +3.0,
+                +4.0,
+                +6.0,
+                -0.0,
+                -0.5,
+                -1.0,
+                -1.5,
+                -2.0,
+                -3.0,
+                -4.0,
+                -6.0,
+            ]
+        )
+        w = lut[mx.random.randint(0, 16, shape=(128, 512))]
+        w = w.reshape(-1, 16)
+        w[:, 0] = 6
+        w = (w + 3e-6).astype(mx.bfloat16)
+
+        # Invalid bits / group size
+        with self.assertRaises(ValueError):
+            mx.quantize(w, bits=3, mode="nvfp4")
+
+        with self.assertRaises(ValueError):
+            mx.quantize(w, group_size=64, mode="nvfp4")
+
+        w_q, scales = mx.quantize(w, mode="nvfp4")
+
+        with self.assertRaises(ValueError):
+            mx.dequantize(w_q, scales, bits=3, mode="nvfp4")
+
+        with self.assertRaises(ValueError):
+            mx.dequantize(w_q, scales, group_size=32, mode="nvfp4")
+
+        w_hat = mx.dequantize(w_q, scales, mode="nvfp4")
+        self.assertTrue(mx.allclose(w, w_hat, rtol=1e-5, atol=1e-5))
+
+        # test quantize/dequantize 0s
+        a = mx.zeros((256, 512))
+        w_q, scales = mx.quantize(a, mode="nvfp4")
+        w_hat = mx.dequantize(w_q, scales, mode="nvfp4")
        self.assertTrue(mx.all(w_hat == 0))

    def test_qmm(self):
@@ -662,6 +745,25 @@ class TestQuantized(mlx_tests.MLXTestCase):
            test_shape(32, 512, 32, transpose=False, **kwargs)
            test_shape(1, 512, 32, transpose=False, **kwargs)

+    def test_qmm_mxfp4_type(self):
+        indices = mx.array([[2], [0], [1]], dtype=mx.uint32)
+
+        for t in [mx.bfloat16, mx.float16, mx.float32]:
+            x = mx.random.normal((32, 256)).astype(t)
+
+            w = mx.random.normal((32, 256))
+            wq, s = mx.quantize(w, mode="mxfp4", bits=4, group_size=32)
+            out = mx.quantized_matmul(x, wq, s, mode="mxfp4", group_size=32, bits=4)
+            self.assertEqual(out.dtype, t)
+
+            w = mx.random.normal((4, 32, 256))
+            wq, s = mx.quantize(w, mode="mxfp4", bits=4, group_size=32)
+
+            out = mx.gather_qmm(
+                x, wq, s, rhs_indices=indices, mode="mxfp4", group_size=32, bits=4
+            )
+            self.assertEqual(out.dtype, t)
+
    def test_gather_matmul_grad(self):
        def quantize(w, transpose=True, group_size=64, bits=4):
            qw, s, b = mx.quantize(w, group_size=group_size, bits=bits)