import argparse
import copy
import glob
import json
import shutil
from pathlib import Path
from typing import Optional, Tuple

import mlx.core as mx
import mlx.nn as nn
from mlx.utils import tree_flatten

from .utils import (
    fetch_from_hub,
    get_model_path,
    linear_class_predicate,
    save_weights,
    upload_to_hub,
)


def configure_parser() -> argparse.ArgumentParser:
    """
    Configures and returns the argument parser for the script.

    Returns:
        argparse.ArgumentParser: Configured argument parser.
    """
    parser = argparse.ArgumentParser(
        description="Convert Hugging Face model to MLX format"
    )

    parser.add_argument("--hf-path", type=str, help="Path to the Hugging Face model.")
    parser.add_argument(
        "--mlx-path", type=str, default="mlx_model", help="Path to save the MLX model."
    )
    parser.add_argument(
        "-q", "--quantize", help="Generate a quantized model.", action="store_true"
    )
    parser.add_argument(
        "--q-group-size", help="Group size for quantization.", type=int, default=64
    )
    parser.add_argument(
        "--q-bits", help="Bits per weight for quantization.", type=int, default=4
    )
    parser.add_argument(
        "--dtype",
        help="Type to save the parameters, ignored if -q is given.",
        type=str,
        choices=["float16", "bfloat16", "float32"],
        default="float16",
    )
    parser.add_argument(
        "--upload-repo",
        help="The Hugging Face repo to upload the model to.",
        type=str,
        default=None,
    )
    return parser
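
# Note: the parser defaults above mirror the keyword defaults of convert()
# below, so the CLI and the Python API behave the same when options are omitted.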


def quantize_model(
    model: nn.Module, config: dict, q_group_size: int, q_bits: int
) -> Tuple[dict, dict]:
    """
    Applies quantization to the model weights.

    Args:
        model (nn.Module): The model to be quantized.
        config (dict): Model configuration.
        q_group_size (int): Group size for quantization.
        q_bits (int): Bits per weight for quantization.

    Returns:
        Tuple[dict, dict]: The quantized weights and the updated config.
    """
    quantized_config = copy.deepcopy(config)

    nn.QuantizedLinear.quantize_module(
        model, q_group_size, q_bits, linear_class_predicate=linear_class_predicate
    )
    quantized_config["quantization"] = {"group_size": q_group_size, "bits": q_bits}
    quantized_weights = dict(tree_flatten(model.parameters()))

    return quantized_weights, quantized_config
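
# Storage arithmetic for the defaults (illustrative): with q_group_size=64 and
# q_bits=4, a group of 64 float16 weights (1024 bits) becomes 64 four-bit codes
# plus a per-group scale and bias (~288 bits), roughly a 3.5x size reduction.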


def convert(
    hf_path: str,
    mlx_path: str = "mlx_model",
    quantize: bool = False,
    q_group_size: int = 64,
    q_bits: int = 4,
    dtype: str = "float16",
    upload_repo: Optional[str] = None,
):
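    """
    Convert a Hugging Face model to MLX format.

    Args:
        hf_path (str): Path to the Hugging Face model.
        mlx_path (str): Path to save the MLX model.
        quantize (bool): Whether to generate a quantized model.
        q_group_size (int): Group size for quantization.
        q_bits (int): Bits per weight for quantization.
        dtype (str): Type to save the parameters, ignored if quantize is True.
        upload_repo (Optional[str]): The Hugging Face repo to upload the model to.
    """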
    print("[INFO] Loading")
    model_path = get_model_path(hf_path)
    model, config, tokenizer = fetch_from_hub(model_path, lazy=True)
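    # Note: lazy=True defers evaluation of the loaded arrays, so the dtype cast
    # below can be applied without first materializing the original weights.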

    weights = dict(tree_flatten(model.parameters()))
    # When quantizing, cast to float16 first; otherwise honor the requested dtype.
    dtype = mx.float16 if quantize else getattr(mx, dtype)
    weights = {k: v.astype(dtype) for k, v in weights.items()}

    if quantize:
        print("[INFO] Quantizing")
        # Push the cast weights back into the model so quantization sees them.
        model.load_weights(list(weights.items()))
        weights, config = quantize_model(model, config, q_group_size, q_bits)

    if isinstance(mlx_path, str):
        mlx_path = Path(mlx_path)

    # The model is no longer needed; delete it so save_weights can donate
    # (release) each weight array after writing, reducing peak memory.
    del model
    save_weights(mlx_path, weights, donate_weights=True)

    # Copy any Python files shipped with the checkpoint (e.g. custom model
    # code) so the converted model stays self-contained.
    py_files = glob.glob(str(model_path / "*.py"))
    for file in py_files:
        shutil.copy(file, mlx_path)

    tokenizer.save_pretrained(mlx_path)

    with open(mlx_path / "config.json", "w") as fid:
        json.dump(config, fid, indent=4)

    if upload_repo is not None:
        upload_to_hub(mlx_path, upload_repo, hf_path)


if __name__ == "__main__":
    parser = configure_parser()
    args = parser.parse_args()
    convert(**vars(args))
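
# Example usage (a sketch; assumes this module lives in the `mlx_lm` package,
# and the Hugging Face repo id below is illustrative):
#     python -m mlx_lm.convert --hf-path mistralai/Mistral-7B-v0.1 -q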