Batched Quantized Matmul + Fast Small QMV (#1503)

* add fast qmv for small dims
* fix test
* batched cpu
* add batched template param
* refactor metal quantized.cpp
2025-12-16 01:49:05 +08:00 · 2024-10-21 16:23:17 -07:00
parent 58a855682c
commit d15fa13daf
9 changed files with 866 additions and 761 deletions
--- a/mlx/ops.h
+++ b/mlx/ops.h
@@ -1287,10 +1287,10 @@ array conv_transpose3d(

 /** Quantized matmul multiplies x with a quantized matrix w*/
 array quantized_matmul(
-    const array& x,
-    const array& w,
-    const array& scales,
-    const array& biases,
+    array x,
+    array w,
+    array scales,
+    array biases,
    bool transpose = true,
    int group_size = 64,
    int bits = 4,