Fused Affine Quantize/Dequantize ops (#1282)

* Add fast affine dequantize

* add full quantize kernel

* fused kernel with scale/bias computation

* fix docstring

* fix no jit error

* fix test

* test fix

* reduce fast api to only affine_quantize
Author: Alex Barron
Date: 2024-07-29 15:11:38 -07:00
Committed by: GitHub
Parent: aa1d6cadad
Commit: c52d1600f0

11 changed files with 655 additions and 400 deletions


@@ -2,6 +2,7 @@
#include <nanobind/nanobind.h>
#include <nanobind/stl/optional.h>
#include <nanobind/stl/tuple.h>
#include <nanobind/stl/variant.h>
#include "mlx/fast.h"
@@ -138,4 +139,47 @@ void init_fast(nb::module_& parent_module) {
Returns:
array: The output array.
)pbdoc");
m.def(
"affine_quantize",
nb::overload_cast<
const array&,
const array&,
const array&,
int,
int,
StreamOrDevice>(&fast::affine_quantize),
"w"_a,
"scales"_a,
"biases"_a,
"group_size"_a = 64,
"bits"_a = 4,
nb::kw_only(),
"stream"_a = nb::none(),
nb::sig(
"def affine_quantize(w: array, /, scales: array, biases: array, group_size: int = 64, bits: int = 4, *, stream: Union[None, Stream, Device] = None) -> array"),
R"pbdoc(
Quantize the matrix ``w`` using the provided ``scales`` and
``biases`` and the ``group_size`` and ``bits`` configuration.
Formally, given the notation in :func:`quantize`, we compute
:math:`w_i` from :math:`\hat{w_i}` and corresponding :math:`s` and
:math:`\beta` as follows
.. math::
w_i = s (\hat{w_i} + \beta)
Args:
w (array): Matrix to be quantize
scales (array): The scales to use per ``group_size`` elements of ``w``
biases (array): The biases to use per ``group_size`` elements of ``w``
group_size (int, optional): The size of the group in ``w`` that shares a
scale and bias. (default: ``64``)
bits (int, optional): The number of bits occupied by each element in
``w``. (default: ``4``)
Returns:
array: The quantized version of ``w``
)pbdoc");
}
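
For context, a minimal usage sketch of the binding added above (not part of the diff; it assumes the standard import of mlx.core and that the scales and biases come from mx.quantize):

import mlx.core as mx

w = mx.random.uniform(shape=(4, 1024))
# Compute per-group scales and biases with the existing quantizer.
w_q, scales, biases = mx.quantize(w, group_size=64, bits=4)
# Re-quantize with the fused op; the packed output should match w_q.
w_q_fast = mx.fast.affine_quantize(w, scales, biases, group_size=64, bits=4)
print(mx.allclose(w_q, w_q_fast))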


@@ -439,6 +439,18 @@ class TestFast(mlx_tests.MLXTestCase):
)(x)
self.assertTrue(mx.allclose(vmap_out, vmap_fast_out))

    def test_affine_quantize(self):
        mx.random.seed(7)
        x = mx.random.uniform(shape=(4, 1024))
        for bits in (2, 4, 8):
            for group_size in (32, 64, 128):
                with self.subTest(bits=bits, group_size=group_size):
                    w, scales, biases = mx.quantize(x, bits=bits, group_size=group_size)
                    w_p = mx.fast.affine_quantize(
                        x, scales, biases, bits=bits, group_size=group_size
                    )
                    self.assertTrue(mx.allclose(w, w_p))


if __name__ == "__main__":
    unittest.main()


@@ -12,11 +12,12 @@ class TestQuantized(mlx_tests.MLXTestCase):
         w = mx.random.normal(shape=(128, 512))
         for gs in [32, 64, 128]:
             for b in [2, 4, 8]:
-                w_q, scales, biases = mx.quantize(w, gs, b)
-                w_hat = mx.dequantize(w_q, scales, biases, gs, b)
-                errors = (w - w_hat).abs().reshape(*scales.shape, -1)
-                eps = 1e-6
-                self.assertTrue((errors <= (scales[..., None] + eps).abs()).all())
+                with self.subTest(gs=gs, b=b):
+                    w_q, scales, biases = mx.quantize(w, group_size=gs, bits=b)
+                    w_hat = mx.dequantize(w_q, scales, biases, gs, b)
+                    errors = (w - w_hat).abs().reshape(*scales.shape, -1)
+                    eps = 1e-6
+                    self.assertTrue((errors <= (scales[..., None] + eps).abs()).all())
         # test quantize/dequantize 0s
         a = mx.zeros((256, 512))
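
A note on the bound asserted in this test: mx.quantize rounds each element to the nearest quantization level, so the per-element reconstruction error is at most half of its group's scale; checking errors <= scales + eps is therefore a deliberately loose bound. A standalone sketch of the same check (not part of the diff; shapes and parameters are illustrative):

import mlx.core as mx

w = mx.random.normal(shape=(128, 512))
w_q, scales, biases = mx.quantize(w, group_size=64, bits=4)
w_hat = mx.dequantize(w_q, scales, biases, group_size=64, bits=4)
# Round-to-nearest error is at most s / 2 per element, so s bounds it safely.
errors = (w - w_hat).abs().reshape(*scales.shape, -1)
print((errors <= scales[..., None].abs() + 1e-6).all())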