mlx-examples/llms/mlx_lm/convert.py


import argparse
import copy
import glob
import json
from pathlib import Path
from typing import Dict, Tuple

import mlx.core as mx
import mlx.nn as nn
import transformers
from mlx.utils import tree_flatten

from .utils import get_model_path, load
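
# Upper bound, in GB, on the size of each saved weight shard (see make_shards).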
MAX_FILE_SIZE_GB = 15


def configure_parser() -> argparse.ArgumentParser:
    """
    Configures and returns the argument parser for the script.

    Returns:
        argparse.ArgumentParser: Configured argument parser.
    """
    parser = argparse.ArgumentParser(
        description="Convert Hugging Face model to MLX format"
    )
    parser.add_argument("--hf-path", type=str, help="Path to the Hugging Face model.")
    parser.add_argument(
        "--mlx-path", type=str, default="mlx_model", help="Path to save the MLX model."
    )
    parser.add_argument(
        "-q", "--quantize", help="Generate a quantized model.", action="store_true"
    )
    parser.add_argument(
        "--q-group-size", help="Group size for quantization.", type=int, default=64
    )
    parser.add_argument(
        "--q-bits", help="Bits per weight for quantization.", type=int, default=4
    )
    parser.add_argument(
        "--dtype",
        help="Type to save the parameters, ignored if -q is given.",
        type=str,
        choices=["float16", "bfloat16", "float32"],
        default="float16",
    )
    parser.add_argument(
        "--upload-repo",
        help="The Hugging Face repo to upload the model to.",
        type=str,
        default=None,
    )
    return parser


def fetch_from_hub(
    model_path: str,
) -> Tuple[Dict, dict, transformers.PreTrainedTokenizer]:
    model_path = get_model_path(model_path)

    weight_files = glob.glob(f"{model_path}/*.safetensors")
    if not weight_files:
        raise FileNotFoundError(f"No safetensors found in {model_path}")

    weights = {}
    for wf in weight_files:
        weights.update(mx.load(wf).items())

    config = transformers.AutoConfig.from_pretrained(model_path)
    tokenizer = transformers.AutoTokenizer.from_pretrained(model_path)
    return weights, config.to_dict(), tokenizer


def quantize_model(
    weights: dict, config: dict, hf_path: str, q_group_size: int, q_bits: int
) -> tuple:
    """
    Applies quantization to the model weights.

    Args:
        weights (dict): Model weights.
        config (dict): Model configuration.
        hf_path (str): HF model path.
        q_group_size (int): Group size for quantization.
        q_bits (int): Bits per weight for quantization.

    Returns:
        tuple: Tuple containing quantized weights and config.
    """
    quantized_config = copy.deepcopy(config)

    model, _ = load(hf_path)
    model.load_weights(list(weights.items()))
    nn.QuantizedLinear.quantize_module(model, q_group_size, q_bits)
    quantized_config["quantization"] = {
        "group_size": q_group_size,
        "bits": q_bits,
    }
    quantized_weights = dict(tree_flatten(model.parameters()))

    return quantized_weights, quantized_config


def make_shards(weights: dict, max_file_size_gb: int = MAX_FILE_SIZE_GB) -> list:
    """
    Splits the weights into smaller shards.

    Args:
        weights (dict): Model weights.
        max_file_size_gb (int): Maximum size of each shard in gigabytes.

    Returns:
        list: List of weight shards.
    """
    max_file_size_bytes = max_file_size_gb << 30
    shards = []
    shard, shard_size = {}, 0
    for k, v in weights.items():
        # Estimated size in bytes: element count times bytes per element.
        estimated_size = v.size * v.dtype.size
        if shard_size + estimated_size > max_file_size_bytes:
            shards.append(shard)
            shard, shard_size = {}, 0
        shard[k] = v
        shard_size += estimated_size
    shards.append(shard)
    return shards


def upload_to_hub(path: str, upload_repo: str, hf_path: str):
    """
    Uploads the model to Hugging Face hub.

    Args:
        path (str): Local path to the model.
        upload_repo (str): Name of the HF repo to upload to.
        hf_path (str): Path to the original Hugging Face model.
    """
    import os

    from huggingface_hub import HfApi, ModelCard, logging

    card = ModelCard.load(hf_path)
    card.data.tags = ["mlx"] if card.data.tags is None else card.data.tags + ["mlx"]
    card.text = f"""
# {upload_repo}
This model was converted to MLX format from [`{hf_path}`]().
Refer to the [original model card](https://huggingface.co/{hf_path}) for more details on the model.
## Use with mlx

```bash
pip install mlx-lm
```

```python
from mlx_lm import load, generate

model, tokenizer = load("{upload_repo}")
response = generate(model, tokenizer, prompt="hello", verbose=True)
```
"""
    card.save(os.path.join(path, "README.md"))

    logging.set_verbosity_info()
    api = HfApi()
    api.create_repo(repo_id=upload_repo, exist_ok=True)
    api.upload_folder(
        folder_path=path,
        repo_id=upload_repo,
        repo_type="model",
    )


def convert(
    hf_path: str,
    mlx_path: str = "mlx_model",
    quantize: bool = False,
    q_group_size: int = 64,
    q_bits: int = 4,
    dtype: str = "float16",
    upload_repo: str = None,
):
    print("[INFO] Loading")
    weights, config, tokenizer = fetch_from_hub(hf_path)

    # When quantizing, the requested dtype is ignored and the weights are cast
    # to float16 before quantization.
    dtype = mx.float16 if quantize else getattr(mx, dtype)
    weights = {k: v.astype(dtype) for k, v in weights.items()}

    if quantize:
        print("[INFO] Quantizing")
        weights, config = quantize_model(
            weights, config, hf_path, q_group_size, q_bits
        )

    mlx_path = Path(mlx_path)
    mlx_path.mkdir(parents=True, exist_ok=True)

    shards = make_shards(weights)
    for i, shard in enumerate(shards):
        mx.save_safetensors(str(mlx_path / f"weights.{i:02d}.safetensors"), shard)

    tokenizer.save_pretrained(mlx_path)

    with open(mlx_path / "config.json", "w") as fid:
        json.dump(config, fid, indent=4)

    if upload_repo is not None:
        upload_to_hub(mlx_path, upload_repo, hf_path)


if __name__ == "__main__":
    parser = configure_parser()
    args = parser.parse_args()
    convert(**vars(args))
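
# Example invocation from the `llms` directory (the Hugging Face repo name
# below is illustrative; substitute the model you want to convert):
#   python -m mlx_lm.convert --hf-path mistralai/Mistral-7B-v0.1 -q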