Batched Quantized Matmul + Fast Small QMV (#1503)

* add fast qmv for small dims

* fix test

* batched cpu

* add batched template param (see the sketch after this list)

* refactor metal quantized.cpp
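
The "batched template param" bullet appears to refer to threading a compile-time batch flag through the quantized kernels so the non-batched path pays no offset arithmetic. Below is a minimal sketch of that pattern in plain C++; the name `qmv_sketch` and its signature are illustrative assumptions, not the kernels touched by this commit:

```cpp
#include <cstdio>
#include <vector>

// Hypothetical sketch: a boolean template parameter selects at compile time
// whether per-batch offsets are applied; when batched == false the offsets
// fold to zero and the single-matrix path carries no extra indexing cost.
template <bool batched>
void qmv_sketch(const std::vector<float>& w, const std::vector<float>& x,
                std::vector<float>& y, int rows, int cols, int batch) {
  const int w_off = batched ? batch * rows * cols : 0;
  const int x_off = batched ? batch * cols : 0;
  const int y_off = batched ? batch * rows : 0;
  for (int r = 0; r < rows; ++r) {
    float acc = 0.0f;
    for (int c = 0; c < cols; ++c)
      acc += w[w_off + r * cols + c] * x[x_off + c];
    y[y_off + r] = acc;
  }
}

int main() {
  std::vector<float> w = {1, 2, 3, 4}, x = {1, 1}, y(2);
  qmv_sketch<false>(w, x, y, /*rows=*/2, /*cols=*/2, /*batch=*/0);
  std::printf("%g %g\n", y[0], y[1]);  // prints: 3 7
}
```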
Author: Alex Barron
Date: 2024-10-21 16:23:17 -07:00
Committed by: GitHub
Parent: 58a855682c
Commit: d15fa13daf
9 changed files with 866 additions and 761 deletions

@@ -725,15 +725,6 @@ affine_quantize(const array& w, int group_size, int bits, StreamOrDevice s_) {
-  int el_per_int = 32 / bits;
-  if (w.shape(-1) < 32 * el_per_int) {
-    std::ostringstream msg;
-    msg << "[quantize] The feature dimension (2nd dimension of the matrix) is "
-        << "too small for quantization. We support >=512 for 2 bits, "
-        << ">= 256 for 4 bits and >= 128 for 8 bits. The provided matrix has "
-        << "shape " << w.shape() << ".";
-    throw std::invalid_argument(msg.str());
-  }
   auto fallback = [group_size, bits, el_per_int, s](
                       const std::vector<array>& inputs) -> std::vector<array> {
     auto& w = inputs[0];
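
The deleted guard enforced a minimum feature dimension of 32 * el_per_int, where el_per_int = 32 / bits is the number of quantized values packed into one 32-bit integer; the fast small-dim QMV path added by this commit is presumably what makes the restriction unnecessary. A standalone sketch of that threshold arithmetic (plain C++, not the MLX API):

```cpp
#include <cstdio>
#include <initializer_list>

int main() {
  // el_per_int = 32 / bits: quantized values packed into one 32-bit word.
  for (int bits : {2, 4, 8}) {
    int el_per_int = 32 / bits;         // 16, 8, 4
    int old_min_dim = 32 * el_per_int;  // 512, 256, 128: the removed cutoff
    std::printf("bits=%d  el_per_int=%d  old minimum feature dim=%d\n",
                bits, el_per_int, old_min_dim);
  }
}
```

The printed cutoffs (512 for 2 bits, 256 for 4 bits, 128 for 8 bits) are exactly the values quoted in the removed error message.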