Mirror of https://github.com/ml-explore/mlx.git (synced 2025-12-16 01:49:05 +08:00)
Fused Affine Quantize/Dequantize ops (#1282)
* Add fast affine dequantize
* Add full quantize kernel
* Fused kernel with scale/bias computation
* Fix docstring
* Fix no jit error
* Fix test
* Test fix
* Reduce fast api to only affine_quantize
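For orientation, the kernels fused by this change compute per-group affine quantization parameters (one scale and one bias per group of group_size weights) on the GPU and pack the quantized codes into uint32 words. Below is a minimal CPU-side sketch of that transform, assuming the w ≈ scale * q + bias convention MLX's quantize uses; it is illustrative only and omits the bit packing and edge-case handling of the actual Metal kernels.

// Simplified CPU reference for per-group affine quantization. Illustrative
// only: the Metal kernels fuse this computation and pack q into uint32 words.
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <vector>

struct QuantGroup {
  std::vector<uint8_t> q; // quantized codes, one per weight
  float scale;
  float bias;
};

// Quantize one group of weights to `bits` bits so that w ≈ scale * q + bias,
// with q in [0, 2^bits - 1].
QuantGroup quantize_group(const std::vector<float>& w, int bits) {
  float w_min = *std::min_element(w.begin(), w.end());
  float w_max = *std::max_element(w.begin(), w.end());
  float n_levels = static_cast<float>((1 << bits) - 1);
  float scale = (w_max - w_min) / n_levels;
  if (scale == 0.0f) {
    scale = 1.0f; // degenerate group: all values equal
  }
  QuantGroup g{{}, scale, w_min};
  g.q.reserve(w.size());
  for (float x : w) {
    float q = std::round((x - w_min) / scale);
    g.q.push_back(static_cast<uint8_t>(std::clamp(q, 0.0f, n_levels)));
  }
  return g;
}

// Dequantize back to floats: w_hat = scale * q + bias.
std::vector<float> dequantize_group(const QuantGroup& g) {
  std::vector<float> out;
  out.reserve(g.q.size());
  for (uint8_t q : g.q) {
    out.push_back(g.scale * q + g.bias);
  }
  return out;
}

int main() {
  // One hypothetical group of 8 weights, quantized to 4 bits.
  std::vector<float> w = {0.1f, -0.3f, 0.7f, 0.0f, 0.25f, -0.1f, 0.5f, 0.9f};
  QuantGroup g = quantize_group(w, 4);
  auto w_hat = dequantize_group(g);
  for (size_t i = 0; i < w.size(); ++i) {
    std::printf("%+.3f -> %2d -> %+.3f\n", w[i], g.q[i], w_hat[i]);
  }
  return 0;
}

With group_size 64 and bits 4, each group of 64 floats collapses to eight uint32 words of packed 4-bit codes plus one scale/bias pair.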
@@ -7,6 +7,7 @@
 #include "mlx/backend/metal/device.h"
 #include "mlx/backend/metal/kernels.h"
 #include "mlx/backend/metal/utils.h"
+#include "mlx/fast_primitives.h"
 #include "mlx/primitives.h"

 namespace mlx::core {
@@ -47,8 +48,8 @@ void QuantizedMatmul::eval_gpu(const std::vector<array>& inputs, array& out) {
   if (B < 6 && O % 8 == 0 && D % 512 == 0 && D >= 512) {
     std::ostringstream kname;
     auto type_string = get_type_string(x.dtype());
-    kname << "qmv_" << type_string << "_gs_" << group_size_ << "_b_" << bits_
-          << "_fast";
+    kname << "qmv_fast_" << type_string << "_gs_" << group_size_ << "_b_"
+          << bits_;

     // Encode and dispatch kernel
     auto& compute_encoder = d.get_command_encoder(s.index);
@@ -270,8 +271,8 @@ void GatherQMM::eval_gpu(const std::vector<array>& inputs, array& out) {
   if (B < 6 && O % 8 == 0 && D % 512 == 0 && D >= 512) {
     std::ostringstream kname;
     auto type_string = get_type_string(x.dtype());
-    kname << "bs_qmv_" << type_string << "_gs_" << group_size_ << "_b_"
-          << bits_ << "_fast";
+    kname << "bs_qmv_fast_" << type_string << "_gs_" << group_size_ << "_b_"
+          << bits_;

     // Encode and dispatch kernel
     auto& compute_encoder = d.get_command_encoder(s.index);
@@ -513,4 +514,82 @@ void GatherQMM::eval_gpu(const std::vector<array>& inputs, array& out) {
   }
 }

+void fast::AffineQuantize::eval_gpu(
+    const std::vector<array>& inputs,
+    std::vector<array>& outputs) {
+  bool compute_scale_bias = inputs.size() == 1;
+
+  auto& w_pre = inputs[0];
+  auto& out = outputs[0];
+  out.set_data(allocator::malloc_or_wait(out.nbytes()));
+
+  auto& s = stream();
+  auto& d = metal::device(s.device);
+
+  std::vector<array> copies;
+  auto ensure_row_contiguous = [&copies, &s](const array& arr) {
+    if (arr.flags().row_contiguous) {
+      return arr;
+    } else {
+      array arr_copy(arr.shape(), arr.dtype(), nullptr, {});
+      copy_gpu(arr, arr_copy, CopyType::General, s);
+      copies.push_back(arr_copy);
+      return arr_copy;
+    }
+  };
+  auto w = ensure_row_contiguous(w_pre);
+
+  auto& compute_encoder = d.get_command_encoder(s.index);
+  compute_encoder.set_input_array(w, 0);
+  if (!compute_scale_bias) {
+    auto& scales_pre = inputs[1];
+    auto& biases_pre = inputs[2];
+    auto scales = ensure_row_contiguous(scales_pre);
+    auto biases = ensure_row_contiguous(biases_pre);
+    compute_encoder.set_input_array(scales, 1);
+    compute_encoder.set_input_array(biases, 2);
+    compute_encoder.set_output_array(out, 3);
+  } else {
+    auto& scales = outputs[1];
+    auto& biases = outputs[2];
+    scales.set_data(allocator::malloc_or_wait(scales.nbytes()));
+    biases.set_data(allocator::malloc_or_wait(biases.nbytes()));
+    compute_encoder.set_output_array(out, 1);
+    compute_encoder.set_output_array(scales, 2);
+    compute_encoder.set_output_array(biases, 3);
+  }
+
+  std::ostringstream kname;
+  auto type_string = dequantize_ ? get_type_string(out.dtype())
+                                 : get_type_string(w_pre.dtype());
+  auto kernel_func = "affine_quantize_scales_biases";
+  if (dequantize_) {
+    kernel_func = "affine_dequantize";
+  } else if (compute_scale_bias) {
+    kernel_func = "affine_quantize";
+  }
+  kname << kernel_func << "_" << type_string << "_gs_" << group_size_ << "_b_"
+        << bits_;
+  auto template_def = get_template_definition(
+      kname.str(), kernel_func, type_string, group_size_, bits_);
+  auto kernel = get_quantized_kernel(d, kname.str(), template_def);
+  compute_encoder->setComputePipelineState(kernel);
+
+  // Treat uint32 as uint8 in kernel
+  constexpr int uint8_per_uint32 = 4;
+  constexpr int simd_size = 32;
+  int packs_per_int = 8 / bits_;
+  int per_thread = compute_scale_bias ? group_size_ / simd_size : packs_per_int;
+  size_t nthreads =
+      dequantize_ ? w.size() * uint8_per_uint32 : w.size() / per_thread;
+
+  NS::UInteger thread_group_size = kernel->maxTotalThreadsPerThreadgroup();
+  auto group_dims = MTL::Size(thread_group_size, 1, 1);
+  auto grid_dims = MTL::Size(nthreads, 1, 1);
+  compute_encoder.dispatchThreads(grid_dims, group_dims);
+
+  d.get_command_buffer(s.index)->addCompletedHandler(
+      [copies](MTL::CommandBuffer*) mutable { copies.clear(); });
+}
+
 } // namespace mlx::core
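The dispatch sizes at the end of the new fast::AffineQuantize::eval_gpu follow from how much input each thread handles: when the kernel also computes scales and biases, a 32-thread SIMD group cooperates on one quantization group, so each thread covers group_size / 32 values; when scales and biases are passed in, each thread packs 8 / bits values into one output byte; and for dequantize, each packed uint32 word of input is expanded by 4 threads, one per byte (the "Treat uint32 as uint8" comment above). A standalone sketch of that arithmetic with hypothetical sizes (the shapes and constants below are examples, not taken from the diff):

// Sketch of the thread-count arithmetic behind the fused affine
// quantize/dequantize dispatch. All sizes here are hypothetical examples.
#include <cstdio>

int main() {
  const int group_size = 64;       // quantization group size
  const int bits = 4;              // bits per quantized value
  const int simd_size = 32;        // threads per SIMD group on Apple GPUs
  const int uint8_per_uint32 = 4;  // packed bytes per uint32 word

  long w_size = 4096L * 4096L;     // hypothetical number of weights

  // Fused quantize (scales/biases computed in-kernel): one SIMD group per
  // quantization group, so each thread handles group_size / simd_size values.
  int per_thread_fused = group_size / simd_size;  // 64 / 32 = 2
  long nthreads_fused = w_size / per_thread_fused;

  // Plain quantize (scales/biases given): each thread packs 8 / bits values
  // into one output byte.
  int packs_per_int = 8 / bits;                   // 8 / 4 = 2
  long nthreads_quant = w_size / packs_per_int;

  // Dequantize: the input is packed uint32 words; each word is handled by
  // 4 threads, one per byte.
  long packed_words = w_size * bits / 32;         // uint32 words of input
  long nthreads_dequant = packed_words * uint8_per_uint32;

  std::printf("fused quantize threads: %ld\n", nthreads_fused);
  std::printf("plain quantize threads: %ld\n", nthreads_quant);
  std::printf("dequantize threads:     %ld\n", nthreads_dequant);
  return 0;
}

In each case the threadgroup size is just the pipeline's maxTotalThreadsPerThreadgroup, and the grid is one-dimensional over nthreads, as in the dispatch code above.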