mirror of
https://github.com/ml-explore/mlx.git
synced 2025-09-03 22:34:43 +08:00
Fused Affine Quantize/Dequantize ops (#1282)
* Add fast affine dequantize * add full quantize kernel * fused kernel with scale/bias computation * fix docstring * fix no jit error * fix test * test fix * reduce fast api to only affine_quantize
This commit is contained in:
@@ -2,6 +2,7 @@
|
||||
|
||||
#include <nanobind/nanobind.h>
|
||||
#include <nanobind/stl/optional.h>
|
||||
#include <nanobind/stl/tuple.h>
|
||||
#include <nanobind/stl/variant.h>
|
||||
|
||||
#include "mlx/fast.h"
|
||||
@@ -138,4 +139,47 @@ void init_fast(nb::module_& parent_module) {
|
||||
Returns:
|
||||
array: The output array.
|
||||
)pbdoc");
|
||||
|
||||
m.def(
|
||||
"affine_quantize",
|
||||
nb::overload_cast<
|
||||
const array&,
|
||||
const array&,
|
||||
const array&,
|
||||
int,
|
||||
int,
|
||||
StreamOrDevice>(&fast::affine_quantize),
|
||||
"w"_a,
|
||||
"scales"_a,
|
||||
"biases"_a,
|
||||
"group_size"_a = 64,
|
||||
"bits"_a = 4,
|
||||
nb::kw_only(),
|
||||
"stream"_a = nb::none(),
|
||||
nb::sig(
|
||||
"def affine_quantize(w: array, /, scales: array, biases: array, group_size: int = 64, bits: int = 4, *, stream: Union[None, Stream, Device] = None) -> array"),
|
||||
R"pbdoc(
|
||||
Quantize the matrix ``w`` using the provided ``scales`` and
|
||||
``biases`` and the ``group_size`` and ``bits`` configuration.
|
||||
|
||||
Formally, given the notation in :func:`quantize`, we compute
|
||||
:math:`\hat{w_i}` from :math:`w_i` and corresponding :math:`s` and
|
||||
:math:`\beta` as follows
|
||||
|
||||
.. math::
|
||||
|
||||
\hat{w_i} = \textrm{round}\left( \frac{w_i - \beta}{s} \right)
|
||||
|
||||
Args:
|
||||
w (array): Matrix to be quantized
|
||||
scales (array): The scales to use per ``group_size`` elements of ``w``
|
||||
biases (array): The biases to use per ``group_size`` elements of ``w``
|
||||
group_size (int, optional): The size of the group in ``w`` that shares a
|
||||
scale and bias. (default: ``64``)
|
||||
bits (int, optional): The number of bits occupied by each element of
|
||||
``w`` in the returned quantized matrix. (default: ``4``)
|
||||
|
||||
Returns:
|
||||
array: The quantized version of ``w``
|
||||
)pbdoc");
|
||||
}
|
||||
|
@@ -439,6 +439,18 @@ class TestFast(mlx_tests.MLXTestCase):
|
||||
)(x)
|
||||
self.assertTrue(mx.allclose(vmap_out, vmap_fast_out))
|
||||
|
||||
def test_affine_quantize(self):
    """Check ``mx.fast.affine_quantize`` against ``mx.quantize``.

    For every (bits, group_size) pair, the scales and biases returned by
    ``mx.quantize`` are fed to ``mx.fast.affine_quantize`` on the same
    input, and the two quantized results are compared with ``mx.allclose``.
    """
    mx.random.seed(7)
    x = mx.random.uniform(shape=(4, 1024))

    # Same iteration order as nesting group_size inside bits.
    configs = [(b, g) for b in (2, 4, 8) for g in (32, 64, 128)]

    for bits, group_size in configs:
        with self.subTest(bits=bits, group_size=group_size):
            expected, scales, biases = mx.quantize(
                x, bits=bits, group_size=group_size
            )
            actual = mx.fast.affine_quantize(
                x, scales, biases, bits=bits, group_size=group_size
            )
            self.assertTrue(mx.allclose(expected, actual))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
|
@@ -12,11 +12,12 @@ class TestQuantized(mlx_tests.MLXTestCase):
|
||||
w = mx.random.normal(shape=(128, 512))
|
||||
for gs in [32, 64, 128]:
|
||||
for b in [2, 4, 8]:
|
||||
w_q, scales, biases = mx.quantize(w, gs, b)
|
||||
w_hat = mx.dequantize(w_q, scales, biases, gs, b)
|
||||
errors = (w - w_hat).abs().reshape(*scales.shape, -1)
|
||||
eps = 1e-6
|
||||
self.assertTrue((errors <= (scales[..., None] + eps).abs()).all())
|
||||
with self.subTest(gs=gs, b=b):
|
||||
w_q, scales, biases = mx.quantize(w, group_size=gs, bits=b)
|
||||
w_hat = mx.dequantize(w_q, scales, biases, gs, b)
|
||||
errors = (w - w_hat).abs().reshape(*scales.shape, -1)
|
||||
eps = 1e-6
|
||||
self.assertTrue((errors <= (scales[..., None] + eps).abs()).all())
|
||||
|
||||
# test quantize/dequantize 0s
|
||||
a = mx.zeros((256, 512))
|
||||
|
Reference in New Issue
Block a user