Mirror of https://github.com/ml-explore/mlx-examples.git (synced 2025-09-01 12:49:50 +08:00)
feat(mlx-lm): add de-quant for fuse.py (#365)
* feat(mlx-lm): add de-quant for fuse
* chore: disable quantization in to_linear when de-quant is enabled
* chore: add better error handling when the adapter file is not found
llms/mlx_lm/fuse.py

@@ -8,7 +8,7 @@ from typing import Any, Dict, Union
 from mlx.utils import tree_flatten, tree_unflatten
 
 from .tuner.lora import LoRALinear
-from .tuner.utils import apply_lora_layers
+from .tuner.utils import apply_lora_layers, dequantize
 from .utils import fetch_from_hub, get_model_path, save_weights, upload_to_hub
 
 
@@ -42,6 +42,11 @@ def parse_arguments() -> argparse.Namespace:
         type=str,
         default=None,
     )
+    parser.add_argument(
+        "--de-quantize",
+        help="Generate a de-quantized model.",
+        action="store_true",
+    )
     return parser.parse_args()
 
 
@@ -54,6 +59,7 @@ def main() -> None:
 
     model.freeze()
     model = apply_lora_layers(model, args.adapter_file)
+
     fused_linears = [
         (n, m.to_linear())
         for n, m in model.named_modules()
@@ -61,6 +67,11 @@ def main() -> None:
     ]
 
     model.update_modules(tree_unflatten(fused_linears))
+
+    if args.de_quantize:
+        print("De-quantizing model")
+        model = dequantize(model)
+
     weights = dict(tree_flatten(model.parameters()))
 
     save_path = Path(args.save_path)
@@ -73,6 +84,9 @@ def main() -> None:
 
     tokenizer.save_pretrained(save_path)
 
+    if args.de_quantize:
+        config.pop("quantization", None)
+
     with open(save_path / "config.json", "w") as fid:
         json.dump(config, fid, indent=4)
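For context: `dequantize` is imported here from `.tuner.utils`, but its body is not part of this diff. Below is a minimal sketch of what such a helper could look like, assuming MLX's `nn.QuantizedLinear` (with `weight`, `scales`, `biases`, `group_size`, and `bits` attributes) and `mx.dequantize`; it reuses the same `update_modules(tree_unflatten(...))` module-swapping pattern that fuse.py applies to the fused LoRA layers. Everything beyond what the diff shows is an assumption, not the actual implementation.

```python
# Hypothetical sketch of the dequantize helper imported from .tuner.utils;
# the real implementation is not shown in this diff.
import mlx.core as mx
import mlx.nn as nn
from mlx.utils import tree_unflatten


def dequantize(model: nn.Module) -> nn.Module:
    """Replace every QuantizedLinear in `model` with an equivalent nn.Linear."""
    de_quantize_layers = []
    for name, module in model.named_modules():
        if isinstance(module, nn.QuantizedLinear):
            # Expand the packed low-bit weights back to floating point.
            weight = mx.dequantize(
                module.weight,
                module.scales,
                module.biases,
                module.group_size,
                module.bits,
            )
            output_dims, input_dims = weight.shape
            has_bias = "bias" in module
            linear = nn.Linear(input_dims, output_dims, bias=has_bias)
            linear.weight = weight
            if has_bias:
                linear.bias = module.bias
            de_quantize_layers.append((name, linear))
    if de_quantize_layers:
        # Same module-swapping pattern fuse.py uses for the fused LoRA layers.
        model.update_modules(tree_unflatten(de_quantize_layers))
    return model
```

With the flag in place, a fused adapter could then be exported in full precision with something like `python -m mlx_lm.fuse --adapter-file adapters.npz --save-path fused_model --de-quantize` (flag names other than `--de-quantize` are inferred from the `args.adapter_file` and `args.save_path` references in the surrounding code). Popping the `"quantization"` key from `config.json` keeps the saved weights from being re-interpreted as quantized when the model is reloaded.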