Add quantize/dequantize for mxfp8 and nvfp4 (#2688)

* Add quantize/dequantize slow path for mxfp8 and nvfp4 * fast cuda kernel for mx/nv quantization * fallback for cuda < 12.8 (#2697) * format (#2700) * fix (#2701) * metal kernels * docs * fix jit * add default bits and group sizes * improve quant docs * fix output type of mxfp4 matmuls
2025-12-16 01:49:05 +08:00 · 2025-10-28 16:23:12 -07:00
parent 460691a0e8
commit ec72b44417
25 changed files with 1400 additions and 588 deletions
--- a/mlx/backend/cuda/CMakeLists.txt
+++ b/mlx/backend/cuda/CMakeLists.txt
@@ -51,6 +51,7 @@ target_sources(
          ${CMAKE_CURRENT_SOURCE_DIR}/ternary.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/utils.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/quantized/affine_quantize.cu
+          ${CMAKE_CURRENT_SOURCE_DIR}/quantized/fp_quantize.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/quantized/quantized.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/quantized/convert_fp8.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/worker.cpp)
@@ -58,6 +59,11 @@ target_sources(
 add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/binary)
 add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/unary)

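+# <cuda_fp4.h> is not shipped with CUDA < 12.8, so put the bundled fallback
+# header (quantized/cuda_fp4.h) on the include path instead.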
+if(CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 12.8.0)
+  target_include_directories(mlx PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/quantized/)
+endif()
+
 if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.9.0)
  target_sources(
    mlx PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/gemms/cublas_gemm_batched_12_9.cu)
--- a/mlx/backend/cuda/quantized/affine_quantize.cu
+++ b/mlx/backend/cuda/quantized/affine_quantize.cu
@@ -306,7 +306,7 @@ void affine_dequantize(
  enc.set_input_array(scales);
  enc.set_input_array(biases);
  enc.set_output_array(w);
-  dispatch_float_types(w.dtype(), "affine_quantize", [&](auto type_tag) {
+  dispatch_float_types(w.dtype(), "affine_dequantize", [&](auto type_tag) {
    dispatch_groups(group_size_, [&](auto group_size) {
      dispatch_bits(bits_, [&](auto bits) {
        using T = cuda_type_t<MLX_GET_TYPE(type_tag)>;
--- a/mlx/backend/cuda/quantized/cuda_fp4.h
+++ b/mlx/backend/cuda/quantized/cuda_fp4.h
@@ -0,0 +1,83 @@
+#pragma once
+
+// Software fallback for the E8M0 scale type: the stored byte is a biased
+// power-of-two exponent, so the represented value is 2^(__x - 127). The byte
+// 0xFF encodes NaN.
+struct __nv_fp8_e8m0 {
+  // Encode `x` as a power of two by rounding log2(x) to an integer with
+  // nearbyintf and clamping the exponent to [-127, 127].
+  //   - non-finite inputs (NaN, +/-inf) encode as 0xFF (NaN)
+  //   - zero and negative inputs encode as 0x00 (2^-127, the smallest value)
+  __device__ __nv_fp8_e8m0(float x) {
+    if (!std::isfinite(x)) {
+      __x = 0xFF;
+      return;
+    }
+    // `<=` rather than `<`: log2f(0) is -inf and converting -inf to int is
+    // undefined behavior, so (+/-)zero has to take this early exit as well.
+    if (x <= 0.0f) {
+      __x = 0x00;
+      return;
+    }
+    float le = std::log2f(x);
+    int n = static_cast<int>(std::nearbyintf(le));
+
+    n = n < -127 ? -127 : n;
+    n = n > 127 ? 127 : n;
+    // Apply the exponent bias of 127.
+    __x = static_cast<uint8_t>(n + 127);
+  }
+
+  // Decode to float: NaN for 0xFF, otherwise 2^(__x - 127). Const so that
+  // the conversion can also be used on const objects.
+  __device__ operator float() const {
+    if (__x == 0xFF) {
+      return std::numeric_limits<float>::quiet_NaN();
+    }
+    return std::ldexp(1.0f, static_cast<int>(__x) - 127);
+  }
+
+  // Raw biased exponent.
+  uint8_t __x{0};
+};
+
+// Software fallback for the 4-bit E2M1 float. The low nibble of `__x` holds
+// the code: bit 3 is the sign and bits 0-2 select one of the magnitudes
+// {0, 0.5, 1, 1.5, 2, 3, 4, 6}.
+struct __nv_fp4_e2m1 {
+  // Round `x` to the nearest representable value. An input exactly halfway
+  // between two neighbours goes to the one with the even code, which is why
+  // the thresholds alternate between strict and non-strict comparisons.
+  // Magnitudes above 5 (infinity included) saturate to 6; NaN encodes as
+  // 0x7 (+6).
+  __device__ __nv_fp4_e2m1(float x) {
+    if (std::isnan(x)) {
+      __x = 0x7;
+      return;
+    }
+
+    const bool negative = std::signbit(x);
+    const float mag = std::abs(x);
+
+    // The thresholds are increasing, so the magnitude code is simply the
+    // number of thresholds that `mag` passes.
+    const int code = (mag > 0.25f) + (mag >= 0.75f) + (mag > 1.25f) +
+        (mag >= 1.75f) + (mag > 2.5f) + (mag >= 3.5f) + (mag > 5.0f);
+
+    __x = static_cast<uint8_t>(code | (negative ? 0x8 : 0x0));
+  }
+
+  // Decode by table lookup; `__x` is used directly as the index, so it is
+  // expected to hold only a 4-bit code.
+  __device__ operator float() {
+    static const float kDecoded[16] = {
+        0.0f,  0.5f,  1.0f,  1.5f,  2.0f,  3.0f,  4.0f,  6.0f,
+        -0.0f, -0.5f, -1.0f, -1.5f, -2.0f, -3.0f, -4.0f, -6.0f};
+
+    return kDecoded[__x];
+  }
+
+  // Raw 4-bit code stored in a byte.
+  uint8_t __x{0};
+};
--- a/mlx/backend/cuda/quantized/fp_quantize.cu
+++ b/mlx/backend/cuda/quantized/fp_quantize.cu
@@ -0,0 +1,216 @@
+// Copyright © 2025 Apple Inc.
+
+#include "mlx/backend/cuda/device.h"
+#include "mlx/backend/cuda/kernel_utils.cuh"
+#include "mlx/backend/cuda/quantized/quantized.h"
+#include "mlx/dtype_utils.h"
+
+#include <cooperative_groups.h>
+#include <cooperative_groups/reduce.h>
+#include <cuda_fp4.h>
+#include <cuda_fp8.h>
+
+namespace mlx::core {
+namespace cu {
+
+// Functor that encodes a float as a `bits`-wide fp code returned in a byte:
+// fp8 e4m3 when bits == 8, fp4 e2m1 otherwise.
+template <int bits>
+struct Quantize {
+  __device__ uint8_t operator()(float x) {
+    if constexpr (bits != 8) {
+      __nv_fp4_e2m1 encoded(x);
+      return encoded.__x;
+    } else {
+      __nv_fp8_e4m3 encoded(x);
+      return encoded.__x;
+    }
+  }
+};
+
+// Functor that decodes a `bits`-wide fp code held in a byte back to float by
+// reinterpreting the byte as fp8 e4m3 (bits == 8) or fp4 e2m1 (otherwise).
+template <int bits>
+struct Dequantize {
+  __device__ float operator()(uint8_t x) {
+    if constexpr (bits != 8) {
+      auto* code = reinterpret_cast<__nv_fp4_e2m1*>(&x);
+      return static_cast<float>(*code);
+    } else {
+      auto* code = reinterpret_cast<__nv_fp8_e4m3*>(&x);
+      return static_cast<float>(*code);
+    }
+  }
+};
+
+namespace cg = cooperative_groups;
+
+// Quantize `w` to 4- or 8-bit fp codes, one input element per thread.
+//
+// Every tile of `group_size` threads shares one scale: the tile's max |w|
+// divided by the largest magnitude of the target format (6 for 4 bits, 448
+// for 8 bits) and then rounded to e8m0 (use_mx_scale) or e4m3.
+//
+//   w      : `size` input elements
+//   out    : packed codes; one byte per element for 8 bits, two elements per
+//            byte for 4 bits (the even element in the low nibble)
+//   scales : one byte per group of `group_size` elements
+//   size   : number of elements in `w`
+//
+// NOTE(review): out-of-range threads return before the tile reduction, so
+// this presumably relies on `size` being a multiple of `group_size` and on
+// tiles lining up with groups -- confirm against the launch configuration.
+template <typename T, int group_size, int bits, bool use_mx_scale>
+__global__ void
+fp_quantize(const T* w, uint8_t* out, uint8_t* scales, size_t size) {
+  auto block_size = cg::this_thread_block().dim_threads();
+  auto block_idx = cg::this_thread_block().group_index();
+  auto idx_in_block = cg::this_thread_block().thread_index();
+
+  auto tidx = block_idx.x * block_size.x + idx_in_block.x;
+  auto tidy = block_idx.y * block_size.y + idx_in_block.y;
+
+  // Number of threads along x in the whole grid, i.e. the row pitch used to
+  // flatten (tidx, tidy). This must not depend on the block index, otherwise
+  // threads with tidy != 0 compute a wrong element index.
+  size_t grid_dim_x = size_t(cg::this_grid().dim_blocks().x) * block_size.x;
+  size_t index = tidx + grid_dim_x * size_t(tidy);
+  if (index >= size) {
+    return;
+  }
+
+  float w_thread = w[index];
+
+  cg::greater<float> max_op;
+  auto warp = cg::tiled_partition<group_size>(cg::this_thread_block());
+
+  // Max |w| over the tile, mapped onto the largest representable magnitude.
+  float scale = cg::reduce(warp, abs(w_thread), max_op);
+  scale /= bits == 4 ? 6.0f : 448.0f;
+  // Convert to mx scale or nv scale
+  using ScaleType =
+      std::conditional_t<use_mx_scale, __nv_fp8_e8m0, __nv_fp8_e4m3>;
+  auto s = ScaleType(scale);
+  uint8_t q_scale = s.__x;
+  // Quantize with the scale as it will be decoded, not the exact one.
+  scale = float(s);
+
+  // Write out the scales
+  size_t gindex = index / group_size;
+  if (index % group_size == 0) {
+    scales[gindex] = q_scale;
+  }
+
+  // An all-zero group has scale 0; avoid dividing by it.
+  uint8_t output = Quantize<bits>{}(scale == 0 ? 0.0f : w_thread / scale);
+  if constexpr (bits == 4) {
+    // Fetch the next thread's code and place it in the high nibble.
+    uint8_t sval = warp.shfl_down(output, 1);
+    output |= sval << bits;
+  }
+  constexpr int pack_factor = bits == 8 ? 1 : 2;
+  // Only the first thread of each pack stores the combined byte.
+  if (index % pack_factor == 0) {
+    out[index / pack_factor] = output;
+  }
+}
+
+// Dequantize packed 4- or 8-bit fp codes, one packed byte per thread.
+//
+// Each thread reads one byte of `w`, decodes the 1 (8-bit) or 2 (4-bit)
+// codes it contains and writes them, multiplied by the group's scale, to
+// consecutive positions of `out`.
+//
+//   w      : packed codes (low nibble first for 4 bits)
+//   scales : one byte per group of `group_size` output elements, decoded as
+//            e8m0 (use_mx_scale) or e4m3
+//   out    : `size` output elements
+//   size   : number of output elements
+template <typename T, int group_size, int bits, bool use_mx_scale>
+__global__ void
+fp_dequantize(const uint8_t* w, const uint8_t* scales, T* out, size_t size) {
+  static_assert(bits == 4 || bits == 8, "Only 4 and 8 bit codes are supported");
+
+  auto block_size = cg::this_thread_block().dim_threads();
+  auto block_idx = cg::this_thread_block().group_index();
+  auto idx_in_block = cg::this_thread_block().thread_index();
+
+  auto tidx = block_idx.x * block_size.x + idx_in_block.x;
+  auto tidy = block_idx.y * block_size.y + idx_in_block.y;
+
+  // Number of threads along x in the whole grid, i.e. the row pitch used to
+  // flatten (tidx, tidy). This must not depend on the block index, otherwise
+  // threads with tidy != 0 compute a wrong offset.
+  size_t grid_dim_x = size_t(cg::this_grid().dim_blocks().x) * block_size.x;
+
+  constexpr int pack_factor = bits == 8 ? 1 : 2;
+  // `offset` indexes packed bytes, `oindex` the first output element.
+  size_t offset = tidx + grid_dim_x * size_t(tidy);
+  size_t oindex = offset * pack_factor;
+
+  if (oindex >= size) {
+    return;
+  }
+
+  size_t gindex = oindex / group_size;
+  using ScaleType =
+      std::conditional_t<use_mx_scale, __nv_fp8_e8m0, __nv_fp8_e4m3>;
+  auto scale = float(((ScaleType*)(scales))[gindex]);
+
+  out += oindex;
+
+  unsigned int val = w[offset];
+#pragma unroll
+  for (int i = 0; i < pack_factor; i++) {
+    uint8_t d;
+    if constexpr (bits == 4) {
+      // Select the i-th nibble.
+      d = (val >> (bits * i)) & 0x0f;
+    } else {
+      d = val;
+    }
+    out[i] = static_cast<T>(scale * Dequantize<bits>{}(d));
+  }
+}
+
+} // namespace cu
+
+// Enqueue the fp quantization kernel for `w` on `enc`.
+//
+// `w` is registered as an input and `wq` / `scales` as outputs. The kernel
+// variant is picked from the arguments: 8-bit codes with groups of 32 and an
+// e8m0 scale when bits == 8; otherwise 4-bit codes with groups of 16 and an
+// e4m3 scale when group_size == 16; otherwise 4-bit codes with groups of 32
+// and an e8m0 scale.
+//
+// Throws std::runtime_error when `w` has type float64. `s` is not used here.
+void fp_quantize(
+    const array& w,
+    array& wq,
+    array& scales,
+    int group_size,
+    int bits,
+    cu::CommandEncoder& enc,
+    const Stream& s) {
+  enc.set_input_array(w);
+  enc.set_output_array(wq);
+  enc.set_output_array(scales);
+  dispatch_float_types(w.dtype(), "fp_quantize", [&](auto type_tag) {
+    using T = cuda_type_t<MLX_GET_TYPE(type_tag)>;
+    if constexpr (std::is_same_v<T, double>) {
+      throw std::runtime_error(
+          "[Quantize::eval_gpu] Can not quantize input with type float64.");
+    } else {
+      const bool eight_bit = bits == 8;
+      const bool nv_scaled = !eight_bit && group_size == 16;
+
+      // Start from the 4-bit / group-of-32 variant and override as needed.
+      auto kernel = cu::fp_quantize<T, 32, 4, true>;
+      if (eight_bit) {
+        kernel = cu::fp_quantize<T, 32, 8, true>;
+      }
+      if (nv_scaled) {
+        kernel = cu::fp_quantize<T, 16, 4, false>;
+      }
+
+      // One thread per input element.
+      bool large = w.size() > UINT_MAX;
+      auto [grid, block] =
+          get_launch_args(w.size(), w.shape(), w.strides(), large);
+      enc.add_kernel_node(
+          kernel,
+          grid,
+          block,
+          0,
+          w.data<T>(),
+          wq.data<uint8_t>(),
+          scales.data<uint8_t>(),
+          w.size());
+    }
+  });
+}
+
+// Enqueue the fp dequantization kernel that expands `wq` / `scales` into `w`.
+//
+// `wq` and `scales` are registered as inputs and `w` as the output. The
+// kernel variant is picked exactly as in fp_quantize: 8-bit codes with
+// groups of 32 when bits == 8; otherwise 4-bit codes with groups of 16 and
+// an e4m3 scale when group_size == 16; otherwise 4-bit codes with groups of
+// 32 and an e8m0 scale.
+//
+// Throws std::runtime_error when `w` has type float64. `s` is not used here.
+void fp_dequantize(
+    const array& wq,
+    const array& scales,
+    array& w,
+    int group_size,
+    int bits,
+    cu::CommandEncoder& enc,
+    const Stream& s) {
+  constexpr int uint8_per_uint32 = 4;
+  // Number of codes stored in one byte of `wq`.
+  int packs_per_int = 8 / bits;
+
+  // Each thread decodes one packed byte.
+  size_t size = w.size() / packs_per_int;
+  bool large = size > UINT_MAX;
+  // NOTE(review): the launch shape is the output shape with the last axis
+  // multiplied by 4; the kernel's bounds check discards any surplus threads.
+  // Confirm this is the intended shape for get_launch_args.
+  auto grid_shape = w.shape();
+  grid_shape.back() *= uint8_per_uint32;
+
+  enc.set_input_array(wq);
+  enc.set_input_array(scales);
+  enc.set_output_array(w);
+  dispatch_float_types(w.dtype(), "fp_dequantize", [&](auto type_tag) {
+    using T = cuda_type_t<MLX_GET_TYPE(type_tag)>;
+    if constexpr (!std::is_same_v<T, double>) {
+      auto kernel = cu::fp_dequantize<T, 32, 4, true>;
+      if (bits == 8) {
+        kernel = cu::fp_dequantize<T, 32, 8, true>;
+      } else if (group_size == 16) {
+        kernel = cu::fp_dequantize<T, 16, 4, false>;
+      }
+      auto [num_blocks, block_dims] =
+          get_launch_args(size, grid_shape, w.strides(), large);
+      enc.add_kernel_node(
+          kernel,
+          num_blocks,
+          block_dims,
+          0,
+          wq.data<uint8_t>(),
+          // The kernel takes the scales as raw bytes (`const uint8_t*`), not
+          // as values of the output type.
+          scales.data<uint8_t>(),
+          w.data<T>(),
+          w.size());
+    } else {
+      throw std::runtime_error(
+          "[Quantize::eval_gpu] Can not dequantize to output with type float64.");
+    }
+  });
+}
+
+} // namespace mlx::core
--- a/mlx/backend/cuda/quantized/quantized.cpp
+++ b/mlx/backend/cuda/quantized/quantized.cpp
@@ -57,23 +57,30 @@ void fast::Quantize::eval_gpu(
  if (dequantize_) {
    auto wq = ensure_row_contiguous(inputs[0], enc, s);
    auto scales = ensure_row_contiguous(inputs[1], enc, s);
-    auto biases = ensure_row_contiguous(inputs[2], enc, s);
    auto& w = outputs[0];

    w.set_data(allocator::malloc(w.nbytes()));

-    affine_dequantize(wq, scales, biases, w, group_size_, bits_, enc, s);
+    if (mode_ == QuantizationMode::Affine) {
+      auto biases = ensure_row_contiguous(inputs[2], enc, s);
+      affine_dequantize(wq, scales, biases, w, group_size_, bits_, enc, s);
+    } else {
+      fp_dequantize(wq, scales, w, group_size_, bits_, enc, s);
+    }
  } else {
    auto w = ensure_row_contiguous(inputs[0], enc, s);
    auto& wq = outputs[0];
    auto& scales = outputs[1];
-    auto& biases = outputs[2];

    wq.set_data(allocator::malloc(wq.nbytes()));
    scales.set_data(allocator::malloc(scales.nbytes()));
-    biases.set_data(allocator::malloc(biases.nbytes()));
-
-    affine_quantize(w, wq, scales, biases, group_size_, bits_, enc, s);
+    if (mode_ == QuantizationMode::Affine) {
+      auto& biases = outputs[2];
+      biases.set_data(allocator::malloc(biases.nbytes()));
+      affine_quantize(w, wq, scales, biases, group_size_, bits_, enc, s);
+    } else {
+      fp_quantize(w, wq, scales, group_size_, bits_, enc, s);
+    }
  }
 }

--- a/mlx/backend/cuda/quantized/quantized.h
+++ b/mlx/backend/cuda/quantized/quantized.h
@@ -24,4 +24,22 @@ void affine_dequantize(
    cu::CommandEncoder& enc,
    const Stream& s);

+void fp_quantize(
+    const array& w,
+    array& wq,
+    array& scales,
+    int group_size,
+    int bits,
+    cu::CommandEncoder& enc,
+    const Stream& s);
+
+void fp_dequantize(
+    const array& wq,
+    const array& scales,
+    array& w,
+    int group_size,
+    int bits,
+    cu::CommandEncoder& enc,
+    const Stream& s);
+
 } // namespace mlx::core