mlx/mlx/backend/cuda/quantized/quantized.cpp

// Copyright © 2025 Apple Inc.

#include "mlx/backend/cuda/quantized/quantized.h"
#include "mlx/backend/cuda/device.h"
#include "mlx/backend/gpu/copy.h"
#include "mlx/fast_primitives.h"

#include <nvtx3/nvtx3.hpp>
namespace mlx::core {
namespace {
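
// Return `x` unchanged when it is already row contiguous; otherwise make a
// contiguous copy on the GPU and register it with the command encoder as a
// temporary so the copy stays alive for the encoded work that reads it.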
inline array ensure_row_contiguous(
    const array& x,
    cu::CommandEncoder& enc,
    const Stream& s) {
  if (!x.flags().row_contiguous) {
    array x_copy = contiguous_copy_gpu(x, s);
    enc.add_temporary(x_copy);
    return x_copy;
  } else {
    return x;
  }
}
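
// Like ensure_row_contiguous, but only the trailing matrix (the last two
// dimensions) has to be contiguous; any leading batch dimensions may keep
// arbitrary strides.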
inline array ensure_row_contiguous_matrix(
    const array& x,
    cu::CommandEncoder& enc,
    const Stream& s) {
  if (x.ndim() < 2) {
    if (x.strides()[0] == 1) {
      return x;
    }
  } else {
    auto stride_0 = x.strides()[x.ndim() - 2];
    auto stride_1 = x.strides()[x.ndim() - 1];
    if (stride_0 == x.shape(-1) && stride_1 == 1) {
      return x;
    }
  }
  array x_copy = contiguous_copy_gpu(x, s);
  enc.add_temporary(x_copy);
  return x_copy;
}
} // namespace
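
// fast::Quantize covers both directions of the op: with dequantize_ set it
// expands packed quantized weights back to floating point, otherwise it packs
// a floating point matrix into quantized form, producing scales (and, in
// affine mode, biases) as additional outputs.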
void fast::Quantize::eval_gpu(
    const std::vector<array>& inputs,
    std::vector<array>& outputs) {
  nvtx3::scoped_range r("Quantize::eval_gpu");
  auto& s = stream();
  auto& d = cu::device(s.device);
  auto& enc = d.get_command_encoder(s);

  if (dequantize_) {
    auto wq = ensure_row_contiguous(inputs[0], enc, s);
    auto scales = ensure_row_contiguous(inputs[1], enc, s);
    auto& w = outputs[0];
    w.set_data(cu::malloc_async(w.nbytes(), enc.stream()));
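    // Affine quantization stores a per-group bias next to each scale; the
    // floating-point mode is described by scales alone.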
    if (mode_ == QuantizationMode::Affine) {
      auto biases = ensure_row_contiguous(inputs[2], enc, s);
      affine_dequantize(wq, scales, biases, w, group_size_, bits_, enc, s);
    } else {
      fp_dequantize(wq, scales, w, group_size_, bits_, enc, s);
    }
  } else {
    auto w = ensure_row_contiguous(inputs[0], enc, s);
    auto& wq = outputs[0];
    auto& scales = outputs[1];
    wq.set_data(cu::malloc_async(wq.nbytes(), enc.stream()));
    scales.set_data(cu::malloc_async(scales.nbytes(), enc.stream()));
    if (mode_ == QuantizationMode::Affine) {
      auto& biases = outputs[2];
      biases.set_data(cu::malloc_async(biases.nbytes(), enc.stream()));
      affine_quantize(w, wq, scales, biases, group_size_, bits_, enc, s);
    } else {
      fp_quantize(w, wq, scales, group_size_, bits_, enc, s);
    }
  }
}
} // namespace mlx::core