From ec2ab428883790d14ce8ca719abafdbe380453bb Mon Sep 17 00:00:00 2001
From: Awni Hannun
Date: Fri, 19 Sep 2025 18:22:55 -0700
Subject: [PATCH] Lower sorted QMM gather threshold (#2609)

---
 mlx/backend/metal/quantized.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mlx/backend/metal/quantized.cpp b/mlx/backend/metal/quantized.cpp
index 333f3971f..452bc4faa 100644
--- a/mlx/backend/metal/quantized.cpp
+++ b/mlx/backend/metal/quantized.cpp
@@ -948,8 +948,8 @@ void GatherQMM::eval_gpu(const std::vector<array>& inputs, array& out) {
   // We are walking x in order and w is also in order so we can batch up the
   // matmuls and reuse reading x and w.
   //
-  // TODO: Tune 16 and 8 here a bit better.
-  if (M == 1 && B >= 16 && right_sorted_ == true && B / E >= 8) {
+  // TODO: Tune 16 and 4 here a bit better.
+  if (M == 1 && B >= 16 && right_sorted_ == true && B / E >= 4) {
     gather_qmm_rhs(
         x,
         w,