Add a quantization type in the ops

Angelos Katharopoulos 2024-12-12 01:30:38 -08:00
parent 50f3535693
commit bdd68bd893
6 changed files with 152 additions and 44 deletions


@@ -75,10 +75,33 @@ std::pair<int, int> extract_quantized_matmul_dims(
     const array& x,
     const array& w,
     const array& scales,
-    const array& biases,
+    const std::optional<array>& biases,
     bool transpose,
     int group_size,
-    int bits) {
+    int bits,
+    QuantizationType type) {
+  // Check if we have biases as expected
+  switch (type) {
+    case QuantizationType::Affine:
+      if (!biases.has_value()) {
+        std::ostringstream msg;
+        msg << "[" << tag
+            << "] The biases argument is required for quantization "
+            << "type '" << type << "'";
+        throw std::invalid_argument(msg.str());
+      }
+      break;
+    case QuantizationType::AffinePacked:
+      if (biases.has_value()) {
+        std::ostringstream msg;
+        msg << "[" << tag << "] Quantization type '" << type
+            << "' does not use "
+            << "biases but biases were provided";
+        throw std::invalid_argument(msg.str());
+      }
+      break;
+  }
   if (w.dtype() != uint32) {
     std::ostringstream msg;
     msg << "[" << tag << "] The weight matrix should be uint32 "
@@ -86,11 +109,11 @@ std::pair<int, int> extract_quantized_matmul_dims(
     throw std::invalid_argument(msg.str());
   }
-  if (scales.shape() != biases.shape()) {
+  if (biases.has_value() && scales.shape() != biases.value().shape()) {
     std::ostringstream msg;
     msg << "[" << tag << "] Scales and biases should have the same shape. "
         << "Received scales with shape " << scales.shape()
-        << " and biases with " << biases.shape();
+        << " and biases with " << biases.value().shape();
     throw std::invalid_argument(msg.str());
   }
@@ -99,25 +122,33 @@ std::pair<int, int> extract_quantized_matmul_dims(
     std::ostringstream msg;
     msg << "[" << tag
         << "] Weight, scales and biases should have the same batch shape. "
-        << "Received weight with shape " << w.shape() << ", scales with "
-        << scales.shape() << " and biases with " << biases.shape();
+        << "Received weight with shape " << w.shape()
+        << " and scales/biases with " << scales.shape();
     throw std::invalid_argument(msg.str());
   }

-  if (w.shape(-1) * 32 / bits != scales.shape(-1) * group_size) {
+  int weight_dims = w.shape(-1) * 32 / bits;
+  int scales_dims = scales.shape(-1) * group_size;
+  if (type == QuantizationType::AffinePacked) {
+    scales_dims /= 8;
+  }
+  if (weight_dims != scales_dims) {
     std::ostringstream msg;
     msg << "[" << tag << "] The shapes of the weight and scales are "
-        << "incompatible based on bits and group_size. w.shape() == "
-        << w.shape() << " and scales.shape() == " << scales.shape()
-        << " with group_size=" << group_size << " and bits=" << bits;
+        << "incompatible based on bits, group_size and quantization type. "
+        << "w.shape() == " << w.shape()
+        << " and scales.shape() == " << scales.shape()
+        << " with group_size=" << group_size << ", bits=" << bits
+        << " and type='" << type << "'";
     throw std::invalid_argument(msg.str());
   }

   int x_inner_dims = x.shape(-1);

   // Calculate the expanded w's dims
-  int w_inner_dims = (transpose) ? w.shape(-1) * 32 / bits : w.shape(-2);
-  int w_outer_dims = (transpose) ? w.shape(-2) : w.shape(-1) * 32 / bits;
+  int w_inner_dims = (transpose) ? weight_dims : w.shape(-2);
+  int w_outer_dims = (transpose) ? w.shape(-2) : weight_dims;

   if (w_inner_dims != x_inner_dims) {
     std::ostringstream msg;
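For orientation, a worked instance of the new weight/scales compatibility check (numbers are illustrative; the factor of 8 for the packed type presumably reflects 4 scales plus 4 biases stored per group in the single packed array):

    // K = 1024 dequantized columns, bits = 4, group_size = 64.
    // w packs 32 / bits = 8 values per uint32, so w.shape(-1) == 1024 / 8 == 128
    // and weight_dims == 128 * 32 / 4 == 1024.
    //
    // Affine:       scales.shape(-1) == 1024 / 64 == 16  -> scales_dims == 16 * 64       == 1024
    // AffinePacked: scales.shape(-1) == 8 * 16   == 128  -> scales_dims == 128 * 64 / 8  == 1024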
@@ -3662,14 +3693,23 @@ array quantized_matmul(
     array x,
     array w,
     array scales,
-    array biases,
+    std::optional<array> biases,
     bool transpose /* = true */,
     int group_size /* = 64 */,
     int bits /* = 4 */,
+    QuantizationType type /* = QuantizationType::Affine */,
     StreamOrDevice s /* = {} */) {
   // Check and extract the quantized matrix shape against x
   auto [w_inner_dims, w_outer_dims] = extract_quantized_matmul_dims(
-      "quantized_matmul", x, w, scales, biases, transpose, group_size, bits);
+      "quantized_matmul",
+      x,
+      w,
+      scales,
+      biases,
+      transpose,
+      group_size,
+      bits,
+      type);

   // QuantizedMatmul handles w.ndim == 2 case.
   if (x.ndim() > 2 && w.ndim() > 2) {
@@ -3690,37 +3730,53 @@ array quantized_matmul(
     *(inner_shape.end() - 1) = scales.shape(-1);
     scales = broadcast_to(scales, inner_shape, s);
-    *(inner_shape.end() - 1) = biases.shape(-1);
-    biases = broadcast_to(biases, inner_shape, s);
+    if (biases.has_value()) {
+      *(inner_shape.end() - 1) = biases.value().shape(-1);
+      biases = broadcast_to(biases.value(), inner_shape, s);
+    }
   }

-  auto dtype = result_type(x, scales, biases);
+  auto dtype = result_type(x, scales);
+  if (biases.has_value()) {
+    dtype = promote_types(dtype, biases.value().dtype());
+  }
   if (!issubdtype(dtype, floating)) {
     std::ostringstream msg;
     msg << "[quantized_matmul] Only real floating types are supported but "
         << "the passed types where x.dtype() == " << x.dtype()
-        << ", scales.dtype() == " << scales.dtype()
-        << " and biases.dtype() == " << biases.dtype();
+        << ", scales.dtype() == " << scales.dtype();
+    if (biases.has_value()) {
+      msg << " and biases.dtype() == " << biases.value().dtype();
+    }
     throw std::invalid_argument(msg.str());
   }

+  // Prepare the inputs vector
+  std::vector<array> inputs;
+  inputs.reserve(4);
+  inputs.push_back(astype(x, dtype, s));
+  inputs.push_back(w);
+  inputs.push_back(astype(scales, dtype, s));
+  if (biases.has_value()) {
+    inputs.push_back(astype(biases.value(), dtype, s));
+  }
+
   auto out_shape = x.shape();
   out_shape.back() = w_outer_dims;
   return array(
       std::move(out_shape),
       dtype,
       std::make_shared<QuantizedMatmul>(
-          to_stream(s), group_size, bits, transpose),
-      {astype(x, dtype, s),
-       w,
-       astype(scales, dtype, s),
-       astype(biases, dtype, s)});
+          to_stream(s), type, group_size, bits, transpose),
+      std::move(inputs));
 }

-std::tuple<array, array, array> quantize(
+std::tuple<array, array, std::optional<array>> quantize(
     const array& w,
     int group_size /* = 64 */,
     int bits /* = 4 */,
+    QuantizationType type /* = QuantizationType::Affine */,
     StreamOrDevice s /* = {} */) {
   return fast::affine_quantize(w, group_size, bits, s);
 }
@@ -3728,31 +3784,40 @@ std::tuple<array, array, array> quantize(
 array dequantize(
     const array& w,
     const array& scales,
-    const array& biases,
+    const std::optional<array>& biases,
     int group_size /* = 64 */,
     int bits /* = 4 */,
+    QuantizationType type /* = QuantizationType::Affine */,
     StreamOrDevice s /* = {} */) {
-  return fast::affine_dequantize(w, scales, biases, group_size, bits, s);
+  return fast::affine_dequantize(
+      w, scales, biases.value(), group_size, bits, s);
 }

 array gather_qmm(
     const array& x,
     const array& w,
     const array& scales,
-    const array& biases,
-    std::optional<array> lhs_indices_ /* = std::nullopt */,
-    std::optional<array> rhs_indices_ /* = std::nullopt */,
+    const std::optional<array>& biases,
+    const std::optional<array>& lhs_indices_ /* = std::nullopt */,
+    const std::optional<array>& rhs_indices_ /* = std::nullopt */,
     bool transpose /* = true */,
     int group_size /* = 64 */,
     int bits /* = 4 */,
+    QuantizationType type /* = QuantizationType::Affine */,
     StreamOrDevice s /* = {} */) {
   if (!lhs_indices_ && !rhs_indices_) {
     return quantized_matmul(
-        x, w, scales, biases, transpose, group_size, bits, s);
+        x, w, scales, biases, transpose, group_size, bits, type, s);
+  }
+
+  if (type != QuantizationType::Affine) {
+    std::ostringstream msg;
+    msg << "[gather_qmm] Only quantization type '" << type << "' supported";
+    throw std::invalid_argument(msg.str());
   }

   auto [w_inner_dims, w_outer_dims] = extract_quantized_matmul_dims(
-      "gather_qmm", x, w, scales, biases, transpose, group_size, bits);
+      "gather_qmm", x, w, scales, biases, transpose, group_size, bits, type);

   // Extract indices and broadcast them
   array lhs_indices = indices_or_default(lhs_indices_, x, s);
@@ -3768,16 +3833,17 @@ array gather_qmm(
   out_shape.push_back(w_outer_dims);

   // and output type
-  auto out_type = result_type(x, scales, biases);
+  auto out_type = result_type(x, scales, biases.value());
   return array(
       std::move(out_shape),
       out_type,
-      std::make_shared<GatherQMM>(to_stream(s), group_size, bits, transpose),
+      std::make_shared<GatherQMM>(
+          to_stream(s), type, group_size, bits, transpose),
       {astype(x, out_type, s),
        w,
        astype(scales, out_type, s),
-       astype(biases, out_type, s),
+       astype(biases.value(), out_type, s),
        lhs_indices,
        rhs_indices});
 }
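Not part of the diff: a minimal round-trip sketch of the updated C++ API in this file, assuming the usual mlx/mlx.h umbrella header, the mlx::core namespace, and random::normal / eval as used elsewhere in the library. Shapes are illustrative only.

    #include "mlx/mlx.h"

    using namespace mlx::core;

    int main() {
      // Quantize a weight matrix; affine quantization returns scales and (now optional) biases.
      array w = random::normal({512, 1024});
      auto [w_q, scales, biases] =
          quantize(w, /*group_size=*/64, /*bits=*/4, QuantizationType::Affine);

      // Matmul against the quantized weights; biases is passed as std::optional<array>.
      array x = random::normal({8, 1024});
      array y = quantized_matmul(
          x, w_q, scales, biases,
          /*transpose=*/true, /*group_size=*/64, /*bits=*/4, QuantizationType::Affine);

      // Reconstruct an approximation of w for comparison.
      array w_hat = dequantize(
          w_q, scales, biases, /*group_size=*/64, /*bits=*/4, QuantizationType::Affine);

      eval({y, w_hat});
      return 0;
    }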


@@ -1277,31 +1277,34 @@ array conv_transpose3d(
     int groups = 1,
     StreamOrDevice s = {});

-/** Quantized matmul multiplies x with a quantized matrix w*/
+/** Quantized matmul multiplies x with a quantized matrix w */
 array quantized_matmul(
     array x,
     array w,
     array scales,
-    array biases,
+    std::optional<array> biases,
     bool transpose = true,
     int group_size = 64,
     int bits = 4,
+    QuantizationType type = QuantizationType::Affine,
     StreamOrDevice s = {});

 /** Quantize a matrix along its last axis */
-std::tuple<array, array, array> quantize(
+std::tuple<array, array, std::optional<array>> quantize(
     const array& w,
     int group_size = 64,
     int bits = 4,
+    QuantizationType type = QuantizationType::Affine,
     StreamOrDevice s = {});

 /** Dequantize a matrix produced by quantize() */
 array dequantize(
     const array& w,
     const array& scales,
-    const array& biases,
+    const std::optional<array>& biases,
     int group_size = 64,
     int bits = 4,
+    QuantizationType type = QuantizationType::Affine,
     StreamOrDevice s = {});

 /** Compute matrix products with matrix-level gather. */
@@ -1309,12 +1312,13 @@ array gather_qmm(
     const array& x,
     const array& w,
     const array& scales,
-    const array& biases,
-    std::optional<array> lhs_indices = std::nullopt,
-    std::optional<array> rhs_indices = std::nullopt,
+    const std::optional<array>& biases,
+    const std::optional<array>& lhs_indices = std::nullopt,
+    const std::optional<array>& rhs_indices = std::nullopt,
     bool transpose = true,
     int group_size = 64,
     int bits = 4,
+    QuantizationType type = QuantizationType::Affine,
     StreamOrDevice s = {});

 /** Returns a contraction of a and b over multiple dimensions. */
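Also not from the diff: since the dims check earlier rejects biases for the packed type, a caller using AffinePacked would omit biases entirely. The names w_packed and scales_packed below are hypothetical placeholders for arrays produced by a packed quantizer that this diff does not show.

    array y = quantized_matmul(
        x, w_packed, scales_packed, /*biases=*/std::nullopt,
        /*transpose=*/true, /*group_size=*/64, /*bits=*/4,
        QuantizationType::AffinePacked);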


@@ -2777,10 +2777,11 @@ std::vector<array> QuantizedMatmul::vjp(
         cotangents[0],
         primals[1],
         primals[2],
-        primals[3],
+        (primals.size() > 3) ? std::optional(primals[3]) : std::nullopt,
         !transpose_,
         group_size_,
         bits_,
+        type_,
         stream()));
   }
@@ -2855,6 +2856,7 @@ std::vector<array> GatherQMM::vjp(
                 !transpose_,
                 group_size_,
                 bits_,
+                type_,
                 stream()),
             -3,
             stream()),


@@ -8,6 +8,7 @@
 #include "mlx/device.h"
 #include "mlx/io/load.h"
 #include "mlx/stream.h"
+#include "mlx/utils.h"

 #define DEFINE_VMAP() \
   virtual std::pair<std::vector<array>, std::vector<int>> vmap( \
@@ -1568,10 +1569,12 @@ class QuantizedMatmul : public UnaryPrimitive {
  public:
   explicit QuantizedMatmul(
       Stream stream,
+      QuantizationType type,
       int group_size,
       int bits,
       bool transpose)
       : UnaryPrimitive(stream),
+        type_(type),
         group_size_(group_size),
         bits_(bits),
         transpose_(transpose) {}
@@ -1586,6 +1589,7 @@ class QuantizedMatmul : public UnaryPrimitive {
   std::vector<Shape> output_shapes(const std::vector<array>& inputs) override;

  private:
+  QuantizationType type_;
   int group_size_;
   int bits_;
   bool transpose_;
@@ -1595,8 +1599,14 @@ class QuantizedMatmul : public UnaryPrimitive {
 class GatherQMM : public UnaryPrimitive {
  public:
-  explicit GatherQMM(Stream stream, int group_size, int bits, bool transpose)
+  explicit GatherQMM(
+      Stream stream,
+      QuantizationType type,
+      int group_size,
+      int bits,
+      bool transpose)
       : UnaryPrimitive(stream),
+        type_(type),
         group_size_(group_size),
         bits_(bits),
         transpose_(transpose) {}
@@ -1610,6 +1620,7 @@ class GatherQMM : public UnaryPrimitive {
   bool is_equivalent(const Primitive& other) const override;

  private:
+  QuantizationType type_;
   int group_size_;
   int bits_;
   bool transpose_;


@@ -145,6 +145,19 @@ std::ostream& operator<<(std::ostream& os, uint8_t x) {
   return os;
 }

+std::ostream& operator<<(std::ostream& os, QuantizationType type) {
+  std::string_view quantization_type;
+  switch (type) {
+    case QuantizationType::Affine:
+      quantization_type = "affine";
+      break;
+    case QuantizationType::AffinePacked:
+      quantization_type = "affine-packed";
+      break;
+  }
+  return os << quantization_type;
+}
+
 namespace {

 inline size_t


@@ -100,6 +100,18 @@ inline int next_power_of_2(int n) {
   return pow(2, std::ceil(std::log2(n)));
 }

+/** Enumerate the different quantization types */
+enum class QuantizationType {
+  // Traditional affine quantization with separate scales and biases
+  Affine,
+
+  // The same quantization as affine but with the scales and biases packed in a
+  // single array and contiguously every 4 rows
+  AffinePacked,
+};
+
+std::ostream& operator<<(std::ostream& os, QuantizationType type);
+
 namespace env {

 int get_var(const char* name, int default_value);
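One hedged reading of the packed layout described in the enum comment (the exact element order is fixed by the kernels, which are outside this diff): for group_size 64, the affine pair

    scales: [n_rows, K / 64]    biases: [n_rows, K / 64]

would become a single array of roughly

    packed: [n_rows / 4, 8 * (K / 64)]

so each row of packed holds the scales and biases of 4 consecutive weight rows back to back, which is consistent with the scales_dims /= 8 adjustment in extract_quantized_matmul_dims.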