Fix qmm_t for unaligned cases (#923)

2025-12-12 23:39:04 +08:00 · 2024-03-28 15:34:57 -07:00
parent 46caf0bef0
commit 5f9ba3019f
2 changed files with 15 additions and 1 deletions
--- a/mlx/backend/metal/kernels/quantized.metal
+++ b/mlx/backend/metal/kernels/quantized.metal
@@ -520,6 +520,7 @@ template <typename T, const int BM, const int BK, const int BN, const int group_
  const int K_g = K / group_size;
  const int y_row = tid.y * BM;
  const int y_col = tid.x * BN;
  x += y_row * K;
  w += y_col * K_w;
  scales += y_col * K_g;
@@ -572,7 +573,10 @@ template <typename T, const int BM, const int BK, const int BN, const int group_
          const device uint32_t * w_local = w + offset_row * K_w + offset_col;
          threadgroup T * Ws_local = Ws + offset_row * BK + offset_col * el_per_int;
-          if (y_row + offset_row < N) {
+          // y_col corresponds to a row of the weight matrix, so
          // y_col + offset_row must be less than the total number of rows (N);
          // otherwise skip.
          if (y_col + offset_row < N) {
            uint32_t wi = *w_local;
            T scale = scales_block[offset_row * groups_per_block + offset_col / (group_size / el_per_int)];
            T bias = biases_block[offset_row * groups_per_block + offset_col / (group_size / el_per_int)];
--- a/python/tests/test_quantized.py
+++ b/python/tests/test_quantized.py
@@ -229,6 +229,16 @@ class TestQuantized(mlx_tests.MLXTestCase):
        self.assertEqual(y_q.shape, y_hat.shape)
        self.assertLess((y_q - y_hat).abs().max(), 1e-3)
        # Test with larger than 128 unaligned sizes
        w = mx.random.normal(shape=(99, 256))
        w_q, scales, biases = mx.quantize(w)
        w_hat = mx.dequantize(w_q, scales, biases)
        x = mx.random.normal(shape=(129, 256))
        y_q = mx.quantized_matmul(x, w_q, scales, biases, transpose=True)
        y_hat = x @ w_hat.T
        self.assertEqual(y_q.shape, y_hat.shape)
        self.assertLess((y_q - y_hat).abs().max(), 1e-3)
 if __name__ == "__main__":
    unittest.main()