mlx/mlx/backend/cuda/quantized/quantized.cpp

// Copyright © 2025 Apple Inc.

#include "mlx/backend/cuda/quantized/quantized.h"
#include "mlx/backend/cuda/device.h"
#include "mlx/backend/gpu/copy.h"
#include "mlx/fast_primitives.h"

#include <nvtx3/nvtx3.hpp>
namespace mlx::core {
namespace {
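
// Return `x` unchanged when it is already row contiguous; otherwise make a
// contiguous copy on the GPU and register it with the command encoder as a
// temporary so the copy stays alive for the encoded work that reads it.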
inline array ensure_row_contiguous(
    const array& x,
    cu::CommandEncoder& enc,
    const Stream& s) {
  if (!x.flags().row_contiguous) {
    array x_copy = contiguous_copy_gpu(x, s);
    enc.add_temporary(x_copy);
    return x_copy;
  } else {
    return x;
  }
}
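
// Like ensure_row_contiguous, but only the trailing matrix (the last two
// dimensions) has to be contiguous; any leading batch dimensions may keep
// arbitrary strides.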
inline array ensure_row_contiguous_matrix(
    const array& x,
    cu::CommandEncoder& enc,
    const Stream& s) {
  if (x.ndim() < 2) {
    if (x.strides()[0] == 1) {
      return x;
    }
  } else {
    auto stride_0 = x.strides()[x.ndim() - 2];
    auto stride_1 = x.strides()[x.ndim() - 1];
    if (stride_0 == x.shape(-1) && stride_1 == 1) {
      return x;
    }
  }
  array x_copy = contiguous_copy_gpu(x, s);
  enc.add_temporary(x_copy);
  return x_copy;
}
} // namespace
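
// fast::Quantize covers both directions of the op: with dequantize_ set it
// expands packed quantized weights back to floating point, otherwise it packs
// a floating point matrix into quantized form, producing scales (and, in
// affine mode, biases) as additional outputs.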
void fast::Quantize::eval_gpu(
    const std::vector<array>& inputs,
    std::vector<array>& outputs) {
  nvtx3::scoped_range r("Quantize::eval_gpu");
  auto& s = stream();
  auto& d = cu::device(s.device);
  auto& enc = d.get_command_encoder(s);

  if (dequantize_) {
    auto wq = ensure_row_contiguous(inputs[0], enc, s);
    auto scales = ensure_row_contiguous(inputs[1], enc, s);
    auto& w = outputs[0];
    w.set_data(cu::malloc_async(w.nbytes(), enc.stream()));
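    // Affine quantization stores a per-group bias next to each scale; the
    // floating-point mode is described by scales alone.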
    if (mode_ == QuantizationMode::Affine) {
      auto biases = ensure_row_contiguous(inputs[2], enc, s);
      affine_dequantize(wq, scales, biases, w, group_size_, bits_, enc, s);
    } else {
      fp_dequantize(wq, scales, w, group_size_, bits_, enc, s);
    }
  } else {
    auto w = ensure_row_contiguous(inputs[0], enc, s);
    auto& wq = outputs[0];
    auto& scales = outputs[1];
    wq.set_data(cu::malloc_async(wq.nbytes(), enc.stream()));
    scales.set_data(cu::malloc_async(scales.nbytes(), enc.stream()));
    if (mode_ == QuantizationMode::Affine) {
      auto& biases = outputs[2];
      biases.set_data(cu::malloc_async(biases.nbytes(), enc.stream()));
      affine_quantize(w, wq, scales, biases, group_size_, bits_, enc, s);
    } else {
      fp_quantize(w, wq, scales, group_size_, bits_, enc, s);
    }
  }
}
} // namespace mlx::core