Rename the `type` keyword argument to `quantization_type` in the quantization ops bindings

This commit is contained in:
Angelos Katharopoulos
2024-12-16 13:31:34 -08:00
parent f5da489a3c
commit 410ccdbed5
4 changed files with 78 additions and 50 deletions

View File

@@ -4025,7 +4025,7 @@ void init_ops(nb::module_& m) {
bool transpose,
int group_size,
int bits,
const std::string& type,
const std::string& quantization_type,
mx::StreamOrDevice s) {
return mx::quantized_matmul(
std::move(x),
@@ -4035,7 +4035,7 @@ void init_ops(nb::module_& m) {
transpose,
group_size,
bits,
mx::from_string(type),
mx::from_string(quantization_type),
s);
},
nb::arg(),
@@ -4045,11 +4045,11 @@ void init_ops(nb::module_& m) {
"transpose"_a = true,
"group_size"_a = 64,
"bits"_a = 4,
"type"_a = "affine",
"quantization_type"_a = "affine",
nb::kw_only(),
"stream"_a = nb::none(),
nb::sig(
"def quantized_matmul(x: array, w: array, /, scales: array, biases: Optional[array], transpose: bool = True, group_size: int = 64, bits: int = 4, type: str = 'affine', *, stream: Union[None, Stream, Device] = None) -> array"),
"def quantized_matmul(x: array, w: array, /, scales: array, biases: Optional[array], transpose: bool = True, group_size: int = 64, bits: int = 4, quantization_type: str = 'affine', *, stream: Union[None, Stream, Device] = None) -> array"),
R"pbdoc(
Perform the matrix multiplication with the quantized matrix ``w``. The
quantization uses one floating point scale and bias per ``group_size`` of
@@ -4069,7 +4069,7 @@ void init_ops(nb::module_& m) {
shares a scale and bias. Default: ``64``.
bits (int, optional): The number of bits occupied by each element in
``w``. Default: ``4``.
type (str, optional): The type of quantization used for the matrix.
quantization_type (str, optional): The type of quantization used for the matrix.
It can be 'affine' or 'affine-packed'.
Returns:
@@ -4080,18 +4080,19 @@ void init_ops(nb::module_& m) {
[](const mx::array& w,
int group_size,
int bits,
const std::string& type,
const std::string& quantization_type,
mx::StreamOrDevice s) {
return mx::quantize(w, group_size, bits, mx::from_string(type), s);
return mx::quantize(
w, group_size, bits, mx::from_string(quantization_type), s);
},
nb::arg(),
"group_size"_a = 64,
"bits"_a = 4,
"type"_a = "affine",
"quantization_type"_a = "affine",
nb::kw_only(),
"stream"_a = nb::none(),
nb::sig(
"def quantize(w: array, /, group_size: int = 64, bits : int = 4, type: str = 'affine', *, stream: Union[None, Stream, Device] = None) -> tuple[array, array, Optional[array]]"),
"def quantize(w: array, /, group_size: int = 64, bits : int = 4, quantization_type: str = 'affine', *, stream: Union[None, Stream, Device] = None) -> tuple[array, array, Optional[array]]"),
R"pbdoc(
Quantize the matrix ``w`` using ``bits`` bits per element.
@@ -4133,7 +4134,7 @@ void init_ops(nb::module_& m) {
scale and bias. Default: ``64``.
bits (int, optional): The number of bits occupied by each element of
``w`` in the returned quantized matrix. Default: ``4``.
type (str, optional): The type of quantization used for the matrix.
quantization_type (str, optional): The type of quantization used for the matrix.
It can be 'affine' or 'affine-packed'.
Returns:
@@ -4152,21 +4153,27 @@ void init_ops(nb::module_& m) {
const std::optional<mx::array>& biases,
int group_size,
int bits,
const std::string& type,
const std::string& quantization_type,
mx::StreamOrDevice s) {
return mx::dequantize(
wq, scales, biases, group_size, bits, mx::from_string(type), s);
wq,
scales,
biases,
group_size,
bits,
mx::from_string(quantization_type),
s);
},
nb::arg(),
"scales"_a,
"biases"_a,
"group_size"_a = 64,
"bits"_a = 4,
"type"_a = "affine",
"quantization_type"_a = "affine",
nb::kw_only(),
"stream"_a = nb::none(),
nb::sig(
"def dequantize(w: array, /, scales: array, biases: Optional[array], group_size: int = 64, bits: int = 4, type: str = 'affine', *, stream: Union[None, Stream, Device] = None) -> array"),
"def dequantize(w: array, /, scales: array, biases: Optional[array], group_size: int = 64, bits: int = 4, quantization_type: str = 'affine', *, stream: Union[None, Stream, Device] = None) -> array"),
R"pbdoc(
Dequantize the matrix ``w`` using the provided ``scales`` and
``biases`` and the ``group_size`` and ``bits`` configuration.
@@ -4187,7 +4194,7 @@ void init_ops(nb::module_& m) {
scale and bias. Default: ``64``.
bits (int, optional): The number of bits occupied by each element in
``w``. Default: ``4``.
type (str, optional): The type of quantization used for the matrix.
quantization_type (str, optional): The type of quantization used for the matrix.
It can be 'affine' or 'affine-packed'.
Returns:
@@ -4205,7 +4212,7 @@ void init_ops(nb::module_& m) {
"transpose"_a = true,
"group_size"_a = 64,
"bits"_a = 4,
"type"_a = "affine",
"quantization_type"_a = "affine",
nb::kw_only(),
"stream"_a = nb::none(),
nb::sig(
@@ -4235,7 +4242,7 @@ void init_ops(nb::module_& m) {
shares a scale and bias. Default: ``64``.
bits (int, optional): The number of bits occupied by each element in
``w``. Default: ``4``.
type (str, optional): The type of quantization used for the matrix.
quantization_type (str, optional): The type of quantization used for the matrix.
It can be 'affine' or 'affine-packed'.
Returns: