Fused Affine Quantize/Dequantize ops (#1282)

* Add fast affine dequantize

* add full quantize kernel

* fused kernel with scale/bias computation

* fix docstring

* fix no jit error

* fix test

* test fix

* reduce fast api to only affine_quantize
Author: Alex Barron
Date: 2024-07-29 15:11:38 -07:00
Committed by: GitHub
Parent: aa1d6cadad
Commit: c52d1600f0

11 changed files with 655 additions and 400 deletions


@@ -2,6 +2,7 @@
#include <nanobind/nanobind.h>
#include <nanobind/stl/optional.h>
#include <nanobind/stl/tuple.h>
#include <nanobind/stl/variant.h>
#include "mlx/fast.h"
@@ -138,4 +139,47 @@ void init_fast(nb::module_& parent_module) {
Returns:
array: The output array.
)pbdoc");
m.def(
"affine_quantize",
nb::overload_cast<
const array&,
const array&,
const array&,
int,
int,
StreamOrDevice>(&fast::affine_quantize),
"w"_a,
"scales"_a,
"biases"_a,
"group_size"_a = 64,
"bits"_a = 4,
nb::kw_only(),
"stream"_a = nb::none(),
nb::sig(
"def affine_quantize(w: array, /, scales: array, biases: array, group_size: int = 64, bits: int = 4, *, stream: Union[None, Stream, Device] = None) -> array"),
R"pbdoc(
Quantize the matrix ``w`` using the provided ``scales`` and
``biases`` and the ``group_size`` and ``bits`` configuration.
Formally, given the notation in :func:`quantize`, we compute
:math:`w_i` from :math:`\hat{w_i}` and corresponding :math:`s` and
:math:`\beta` as follows
.. math::
w_i = s (\hat{w_i} + \beta)
Args:
w (array): Matrix to be quantize
scales (array): The scales to use per ``group_size`` elements of ``w``
biases (array): The biases to use per ``group_size`` elements of ``w``
group_size (int, optional): The size of the group in ``w`` that shares a
scale and bias. (default: ``64``)
bits (int, optional): The number of bits occupied by each element in
``w``. (default: ``4``)
Returns:
array: The quantized version of ``w``
)pbdoc");
}
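
For context, a minimal usage sketch of the binding added above (not part of the diff; it assumes the standard import of mlx.core and that the scales and biases come from mx.quantize):

import mlx.core as mx

w = mx.random.uniform(shape=(4, 1024))
# Compute per-group scales and biases with the existing quantizer.
w_q, scales, biases = mx.quantize(w, group_size=64, bits=4)
# Re-quantize with the fused op; the packed output should match w_q.
w_q_fast = mx.fast.affine_quantize(w, scales, biases, group_size=64, bits=4)
print(mx.allclose(w_q, w_q_fast))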


@@ -439,6 +439,18 @@ class TestFast(mlx_tests.MLXTestCase):
)(x)
self.assertTrue(mx.allclose(vmap_out, vmap_fast_out))

    def test_affine_quantize(self):
        mx.random.seed(7)
        x = mx.random.uniform(shape=(4, 1024))
        for bits in (2, 4, 8):
            for group_size in (32, 64, 128):
                with self.subTest(bits=bits, group_size=group_size):
                    w, scales, biases = mx.quantize(x, bits=bits, group_size=group_size)
                    w_p = mx.fast.affine_quantize(
                        x, scales, biases, bits=bits, group_size=group_size
                    )
                    self.assertTrue(mx.allclose(w, w_p))


if __name__ == "__main__":
    unittest.main()


@@ -12,11 +12,12 @@ class TestQuantized(mlx_tests.MLXTestCase):
         w = mx.random.normal(shape=(128, 512))
         for gs in [32, 64, 128]:
             for b in [2, 4, 8]:
-                w_q, scales, biases = mx.quantize(w, gs, b)
-                w_hat = mx.dequantize(w_q, scales, biases, gs, b)
-                errors = (w - w_hat).abs().reshape(*scales.shape, -1)
-                eps = 1e-6
-                self.assertTrue((errors <= (scales[..., None] + eps).abs()).all())
+                with self.subTest(gs=gs, b=b):
+                    w_q, scales, biases = mx.quantize(w, group_size=gs, bits=b)
+                    w_hat = mx.dequantize(w_q, scales, biases, gs, b)
+                    errors = (w - w_hat).abs().reshape(*scales.shape, -1)
+                    eps = 1e-6
+                    self.assertTrue((errors <= (scales[..., None] + eps).abs()).all())
         # test quantize/dequantize 0s
         a = mx.zeros((256, 512))
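
A note on the bound asserted in this test: mx.quantize rounds each element to the nearest quantization level, so the per-element reconstruction error is at most half of its group's scale; checking errors <= scales + eps is therefore a deliberately loose bound. A standalone sketch of the same check (not part of the diff; shapes and parameters are illustrative):

import mlx.core as mx

w = mx.random.normal(shape=(128, 512))
w_q, scales, biases = mx.quantize(w, group_size=64, bits=4)
w_hat = mx.dequantize(w_q, scales, biases, group_size=64, bits=4)
# Round-to-nearest error is at most s / 2 per element, so s bounds it safely.
errors = (w - w_hat).abs().reshape(*scales.shape, -1)
print((errors <= scales[..., None].abs() + 1e-6).all())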