Make sure 0 is represented in the quantization (#1016)

2025-09-18 18:28:12 +08:00 · 2024-04-19 19:47:26 -07:00
parent ed83908931
commit 84d61d27aa
3 changed files with 13 additions and 3 deletions
--- a/mlx/ops.cpp
+++ b/mlx/ops.cpp
@@ -3268,6 +3268,9 @@ std::tuple<array, array, array> quantize(
  array scales = squeeze(delta, -1, s);
  array biases = squeeze(w_min, -1, s);

+  // Round biases to a multiple of scales so that 0 is represented exactly
+  biases = multiply(round(divide(biases, scales, s), s), scales, s);
+
  // Quantize and pack w
  packed_w =
      astype(round(divide(subtract(packed_w, w_min, s), delta, s), s), uint32);