mirror of https://github.com/ml-explore/mlx-examples.git
synced 2025-06-24 01:17:28 +08:00
Add optional quantization types
This commit is contained in:
parent 845efddc8c
commit bc08025f41
@@ -29,6 +29,12 @@ def configure_parser() -> argparse.ArgumentParser:
     parser.add_argument(
         "--q-bits", help="Bits per weight for quantization.", type=int, default=4
     )
+    parser.add_argument(
+        "--q-type",
+        choices=["affine", "affine-packed"],
+        default="affine",
+        help="The type of quantization to apply",
+    )
     parser.add_argument(
         "--dtype",
         help="Type to save the non-quantized parameters.",
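For orientation, a minimal usage sketch of the new option through the package's Python API; the q-* arguments mirror the flags added above, but the import path and the model id are assumptions, not part of this diff:

from mlx_lm import convert

# Hypothetical repo id; any supported HF model would work the same way.
convert(
    "mistralai/Mistral-7B-v0.1",
    quantize=True,
    q_group_size=64,
    q_bits=4,
    q_type="affine-packed",  # new in this commit; "affine" stays the default
)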
@@ -528,6 +528,7 @@ def load_model(
             model,
             group_size=quantization["group_size"],
             bits=quantization["bits"],
+            quantization_type=quantization["quantization_type"],
             class_predicate=class_predicate,
         )
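load_model now forwards the stored type back into nn.quantize. A sketch of the config section it expects, assuming a config.json written by the updated convert script (the path is a placeholder):

import json

# Placeholder path to a converted model directory.
with open("mlx_model/config.json") as f:
    quantization = json.load(f)["quantization"]
# e.g. {"group_size": 64, "bits": 4, "quantization_type": "affine-packed", ...}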
@@ -737,6 +738,7 @@ def quantize_model(
     config: dict,
     q_group_size: int,
     q_bits: int,
+    q_type: str,
     quant_predicate: Optional[
         Callable[[str, nn.Module, dict], Union[bool, dict]]
     ] = None,
@@ -749,6 +751,7 @@ def quantize_model(
         config (dict): Model configuration.
         q_group_size (int): Group size for quantization.
         q_bits (int): Bits per weight for quantization.
+        q_type (str): Quantization type
         quant_predicate (Callable): A callable that decides how
             to quantize each layer based on the path.
             Accepts the layer `path`, the `module` and the model `config`.
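Per the signature and docstring above, the predicate may return either a bool or a dict of per-layer parameters. A hypothetical predicate exercising both forms; the layer path and settings are illustrative, not from this diff:

import mlx.nn as nn

def my_quant_predicate(path, module, config):
    # Illustrative: leave the output head unquantized.
    if path == "lm_head":
        return False
    # Illustrative: give embeddings their own settings, including the type.
    if isinstance(module, nn.Embedding):
        return {"group_size": 32, "bits": 8, "quantization_type": "affine"}
    # Otherwise quantize any layer that supports it.
    return hasattr(module, "to_quantized")

It would be passed as quant_predicate=my_quant_predicate to quantize_model or convert.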
@@ -759,11 +762,25 @@ def quantize_model(
         Tuple: Tuple containing quantized weights and config.
     """
     quantized_config = copy.deepcopy(config)
-    quantized_config["quantization"] = {"group_size": q_group_size, "bits": q_bits}
+    quantized_config["quantization"] = {
+        "group_size": q_group_size,
+        "bits": q_bits,
+        "quantization_type": q_type,
+    }

     # Add any custom quantization parameters to the config as we go
     def _class_predicate(p, m):
-        bool_or_params = quant_predicate(p, m, config)
+        if quant_predicate:
+            bool_or_params = quant_predicate(p, m, config)
+        else:
+            if isinstance(m, nn.Embedding):
+                bool_or_params = {
+                    "group_size": q_group_size,
+                    "bits": q_bits,
+                    "quantization_type": "affine",
+                }
+            else:
+                bool_or_params = hasattr(m, "to_quantized")
         quantized_config["quantization"][p] = bool_or_params
         return bool_or_params
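Since _class_predicate records each decision under the layer path, the saved config ends up with per-layer entries alongside the global settings. An illustrative (not literal) result; the paths depend on the model architecture:

# Sketch of quantized_config["quantization"] after conversion.
quantization_section = {
    "group_size": 64,
    "bits": 4,
    "quantization_type": "affine-packed",
    "model.embed_tokens": {"group_size": 64, "bits": 4, "quantization_type": "affine"},
    "model.layers.0.self_attn.q_proj": True,
}

Note the default branch pins embeddings to plain "affine" regardless of --q-type.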
@@ -771,7 +788,8 @@ def quantize_model(
         model,
         q_group_size,
         q_bits,
-        class_predicate=_class_predicate if quant_predicate else None,
+        quantization_type=q_type,
+        class_predicate=_class_predicate,
     )
     # support hf model tree #957
     quantized_config["quantization_config"] = quantized_config["quantization"]
@@ -812,6 +830,7 @@ def convert(
     quantize: bool = False,
     q_group_size: int = 64,
     q_bits: int = 4,
+    q_type: str = "affine",
     dtype: str = "float16",
     upload_repo: str = None,
     revision: Optional[str] = None,
@@ -845,7 +864,12 @@ def convert(
         print("[INFO] Quantizing")
         model.load_weights(list(weights.items()))
         weights, config = quantize_model(
-            model, config, q_group_size, q_bits, quant_predicate=quant_predicate
+            model,
+            config,
+            q_group_size,
+            q_bits,
+            q_type=q_type,
+            quant_predicate=quant_predicate,
         )

     if dequantize:
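Once converted, the model should load back through the usual helpers, with load_model applying the stored quantization_type as shown earlier. A sketch assuming the standard mlx_lm API and the default output directory:

from mlx_lm import load, generate

# Placeholder path: wherever convert() wrote the quantized model.
model, tokenizer = load("mlx_model")
print(generate(model, tokenizer, prompt="Hello", max_tokens=16))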