mlx-examples/llms/mlx_lm/fuse.py

import argparse
import glob
import shutil
from pathlib import Path

from mlx.utils import tree_flatten, tree_unflatten

from .gguf import convert_to_gguf
from .tuner.lora import LoRALinear
from .tuner.utils import apply_lora_layers, dequantize
from .utils import (
    fetch_from_hub,
    get_model_path,
    save_config,
    save_weights,
    upload_to_hub,
)


def parse_arguments() -> argparse.Namespace:
    """Parse the command-line options of the adapter fusing script.

    Options are read from ``sys.argv``.

    Returns:
        argparse.Namespace: with the attributes ``model``, ``save_path``,
        ``adapter_path``, ``hf_path``, ``upload_repo``, ``de_quantize``,
        ``export_gguf`` and ``gguf_path``.
    """
    # The description is shown by --help; it must describe this tool (fusing
    # adapters), not the fine-tuning script it was originally copied from.
    parser = argparse.ArgumentParser(
        description="Fuse fine-tuned adapters into the base model."
    )
    parser.add_argument(
        "--model",
        default="mlx_model",
        help="The path to the local model directory or Hugging Face repo.",
    )
    parser.add_argument(
        "--save-path",
        default="lora_fused_model",
        help="The path to save the fused model.",
    )
    parser.add_argument(
        "--adapter-path",
        type=str,
        default="adapters",
        help="Path to the trained adapter weights and config.",
    )
    parser.add_argument(
        "--hf-path",
        type=str,
        default=None,
        help="Path to the original Hugging Face model. Required for upload if --model is a local directory.",
    )
    parser.add_argument(
        "--upload-repo",
        help="The Hugging Face repo to upload the model to.",
        type=str,
        default=None,
    )
    parser.add_argument(
        "--de-quantize",
        help="Generate a de-quantized model.",
        action="store_true",
    )
    parser.add_argument(
        "--export-gguf",
        help="Export model weights in GGUF format.",
        action="store_true",
    )
    parser.add_argument(
        "--gguf-path",
        help="Path to save the exported GGUF format model weights. Default is ggml-model-f16.gguf.",
        default="ggml-model-f16.gguf",
        type=str,
    )
    return parser.parse_args()


def main() -> None:
    """Fuse trained LoRA adapters into the base model and save the result.

    Workflow, in order:

    1. Parse the command-line options.
    2. Load the model, config and tokenizer for ``--model``.
    3. Apply the adapters found at ``--adapter-path`` and replace every
       ``LoRALinear`` module with the module returned by its ``to_linear()``.
    4. Optionally de-quantize the model (``--de-quantize``).
    5. Write the weights, any ``*.py`` files from the model directory, the
       tokenizer and ``config.json`` to ``--save-path``.
    6. Optionally export GGUF weights (``--export-gguf``) and upload the
       saved directory (``--upload-repo``).

    Raises:
        ValueError: if ``--upload-repo`` is given but no Hugging Face source
            can be determined (no ``--hf-path`` and ``--model`` is an existing
            local path), or if ``--export-gguf`` is requested for a model type
            other than ``llama``, ``mixtral`` or ``mistral``.
    """
    # Parse before printing anything so that --help and usage errors are not
    # preceded by a misleading "Loading pretrained model" line.
    args = parse_arguments()

    # Resolve the upload source up front: a missing --hf-path should fail
    # immediately rather than after the model was loaded, fused and saved.
    hf_path = None
    if args.upload_repo is not None:
        hf_path = args.hf_path or (
            args.model if not Path(args.model).exists() else None
        )
        if hf_path is None:
            raise ValueError(
                "Must provide original Hugging Face repo to upload local model."
            )

    print("Loading pretrained model")
    model_path = get_model_path(args.model)
    model, config, tokenizer = fetch_from_hub(model_path)

    # Likewise reject unsupported GGUF targets as soon as the config is
    # available, before any fusing or saving work is done.
    if args.export_gguf:
        model_type = config["model_type"]
        if model_type not in ["llama", "mixtral", "mistral"]:
            raise ValueError(
                f"Model type {model_type} not supported for GGUF conversion."
            )

    model.freeze()
    model = apply_lora_layers(model, args.adapter_path)

    # Collect (module name, fused replacement) pairs for every LoRA layer and
    # swap them into the model in one update.
    fused_linears = [
        (n, m.to_linear())
        for n, m in model.named_modules()
        if isinstance(m, LoRALinear)
    ]

    model.update_modules(tree_unflatten(fused_linears))

    if args.de_quantize:
        print("De-quantizing model")
        model = dequantize(model)

    # Flat {parameter name: array} mapping used for saving and GGUF export.
    weights = dict(tree_flatten(model.parameters()))

    save_path = Path(args.save_path)

    save_weights(save_path, weights)

    # Carry over any Python files shipped alongside the source model.
    py_files = glob.glob(str(model_path / "*.py"))
    for file in py_files:
        shutil.copy(file, save_path)

    tokenizer.save_pretrained(save_path)

    if args.de_quantize:
        # The saved weights are no longer quantized, so the config must not
        # advertise quantization settings.
        config.pop("quantization", None)

    save_config(config, config_path=save_path / "config.json")

    if args.export_gguf:
        convert_to_gguf(model_path, weights, config, str(save_path / args.gguf_path))

    # hf_path is only set when --upload-repo was given and validated above.
    if hf_path is not None:
        upload_to_hub(args.save_path, args.upload_repo, hf_path)


# Run the fuse workflow only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
feat: move lora into mlx-lm (#337) * feat: Add lora and qlora training to mlx-lm --------- Co-authored-by: Awni Hannun <awni@apple.com> 2024-01-24 00:44:37 +08:00			`import argparse`
			`import glob`
			`import shutil`
			`from pathlib import Path`

			`from mlx.utils import tree_flatten, tree_unflatten`

feat(mlx-lm): export the GGUF (fp16) format model weights from fuse.py (#555) * wip * wip * feat: convert mlx model to gguf f16 * chore: convert norm layer to float32 to avoid overflow issue * chore: add support for mixtral * chore: clean up * chore: remove unused import statement * chore: clean up weight name mapping * version and readme * actual version bump --------- Co-authored-by: Awni Hannun <awni@apple.com> 2024-03-22 01:34:11 +08:00			`from .gguf import convert_to_gguf`
feat: move lora into mlx-lm (#337) * feat: Add lora and qlora training to mlx-lm --------- Co-authored-by: Awni Hannun <awni@apple.com> 2024-01-24 00:44:37 +08:00			`from .tuner.lora import LoRALinear`
feat(mlx-lm): add de-quant for fuse.py (#365) * feat(mlx-lm): add de-quant for fuse * chore: disable quant in to linear when de-quant enabled * chore: add better error handling for adapter file not found 2024-01-26 10:59:32 +08:00			`from .tuner.utils import apply_lora_layers, dequantize`
feat: add update_config functionality (#531) * feat: add `update_config` functionality - sorts the config for better readability - updates "_name_or_path" key in config with upload_repo - sets indentation of 4 spaces - allows adding other key-value pairs via kwargs - reduces code duplication - standardizes config-update across mlx-lm * feat: standardize updating config Impacts: - fuse.py - merge.py * update formatting * remove commented out code * update func: update_config to save_config - drop kwargs - rename func as save_config - incorporate review suggestions * update func: save_config - ensure only config-saving functionality - function does not return config as a dict anymore - added review suggestions * fixed formatting * update formatting instruction in contribution guide * nits --------- Co-authored-by: Awni Hannun <awni@apple.com> 2024-03-14 21:36:05 +08:00			`from .utils import (`
			`fetch_from_hub,`
			`get_model_path,`
			`save_config,`
			`save_weights,`
			`upload_to_hub,`
			`)`
feat: move lora into mlx-lm (#337) * feat: Add lora and qlora training to mlx-lm --------- Co-authored-by: Awni Hannun <awni@apple.com> 2024-01-24 00:44:37 +08:00

			`def parse_arguments() -> argparse.Namespace:`
			`parser = argparse.ArgumentParser(description="LoRA or QLoRA finetuning.")`
			`parser.add_argument(`
			`"--model",`
			`default="mlx_model",`
			`help="The path to the local model directory or Hugging Face repo.",`
			`)`
			`parser.add_argument(`
			`"--save-path",`
			`default="lora_fused_model",`
			`help="The path to save the fused model.",`
			`)`
			`parser.add_argument(`
Save lora config (#636) * lora config * comments * version bump 2024-04-03 04:52:53 +08:00			`"--adapter-path",`
feat: move lora into mlx-lm (#337) * feat: Add lora and qlora training to mlx-lm --------- Co-authored-by: Awni Hannun <awni@apple.com> 2024-01-24 00:44:37 +08:00			`type=str,`
Save lora config (#636) * lora config * comments * version bump 2024-04-03 04:52:53 +08:00			`default="adapters",`
			`help="Path to the trained adapter weights and config.",`
feat: move lora into mlx-lm (#337) * feat: Add lora and qlora training to mlx-lm --------- Co-authored-by: Awni Hannun <awni@apple.com> 2024-01-24 00:44:37 +08:00			`)`
			`parser.add_argument(`
			`"--hf-path",`
			`type=str,`
			`default=None,`
			`help="Path to the original Hugging Face model. Required for upload if --model is a local directory.",`
			`)`
			`parser.add_argument(`
			`"--upload-repo",`
			`help="The Hugging Face repo to upload the model to.",`
			`type=str,`
			`default=None,`
			`)`
feat(mlx-lm): add de-quant for fuse.py (#365) * feat(mlx-lm): add de-quant for fuse * chore: disable quant in to linear when de-quant enabled * chore: add better error handling for adapter file not found 2024-01-26 10:59:32 +08:00			`parser.add_argument(`
			`"--de-quantize",`
			`help="Generate a de-quantized model.",`
			`action="store_true",`
			`)`
feat(mlx-lm): export the GGUF (fp16) format model weights from fuse.py (#555) * wip * wip * feat: convert mlx model to gguf f16 * chore: convert norm layer to float32 to avoid overflow issue * chore: add support for mixtral * chore: clean up * chore: remove unused import statement * chore: clean up weight name mapping * version and readme * actual version bump --------- Co-authored-by: Awni Hannun <awni@apple.com> 2024-03-22 01:34:11 +08:00			`parser.add_argument(`
			`"--export-gguf",`
			`help="Export model weights in GGUF format.",`
			`action="store_true",`
			`)`
			`parser.add_argument(`
			`"--gguf-path",`
			`help="Path to save the exported GGUF format model weights. Default is ggml-model-f16.gguf.",`
			`default="ggml-model-f16.gguf",`
			`type=str,`
			`)`
feat: move lora into mlx-lm (#337) * feat: Add lora and qlora training to mlx-lm --------- Co-authored-by: Awni Hannun <awni@apple.com> 2024-01-24 00:44:37 +08:00			`return parser.parse_args()`


			`def main() -> None:`
			`print("Loading pretrained model")`
			`args = parse_arguments()`

			`model_path = get_model_path(args.model)`
			`model, config, tokenizer = fetch_from_hub(model_path)`

			`model.freeze()`
Save lora config (#636) * lora config * comments * version bump 2024-04-03 04:52:53 +08:00			`model = apply_lora_layers(model, args.adapter_path)`
feat(mlx-lm): add de-quant for fuse.py (#365) * feat(mlx-lm): add de-quant for fuse * chore: disable quant in to linear when de-quant enabled * chore: add better error handling for adapter file not found 2024-01-26 10:59:32 +08:00
feat: move lora into mlx-lm (#337) * feat: Add lora and qlora training to mlx-lm --------- Co-authored-by: Awni Hannun <awni@apple.com> 2024-01-24 00:44:37 +08:00			`fused_linears = [`
			`(n, m.to_linear())`
			`for n, m in model.named_modules()`
			`if isinstance(m, LoRALinear)`
			`]`

			`model.update_modules(tree_unflatten(fused_linears))`
feat(mlx-lm): add de-quant for fuse.py (#365) * feat(mlx-lm): add de-quant for fuse * chore: disable quant in to linear when de-quant enabled * chore: add better error handling for adapter file not found 2024-01-26 10:59:32 +08:00
			`if args.de_quantize:`
			`print("De-quantizing model")`
			`model = dequantize(model)`

feat: move lora into mlx-lm (#337) * feat: Add lora and qlora training to mlx-lm --------- Co-authored-by: Awni Hannun <awni@apple.com> 2024-01-24 00:44:37 +08:00			`weights = dict(tree_flatten(model.parameters()))`

			`save_path = Path(args.save_path)`

			`save_weights(save_path, weights)`

			`py_files = glob.glob(str(model_path / "*.py"))`
			`for file in py_files:`
			`shutil.copy(file, save_path)`

			`tokenizer.save_pretrained(save_path)`

feat(mlx-lm): add de-quant for fuse.py (#365) * feat(mlx-lm): add de-quant for fuse * chore: disable quant in to linear when de-quant enabled * chore: add better error handling for adapter file not found 2024-01-26 10:59:32 +08:00			`if args.de_quantize:`
			`config.pop("quantization", None)`

feat: add update_config functionality (#531) * feat: add `update_config` functionality - sorts the config for better readability - updates "_name_or_path" key in config with upload_repo - sets indentation of 4 spaces - allows adding other key-value pairs via kwargs - reduces code duplication - standardizes config-update across mlx-lm * feat: standardize updating config Impacts: - fuse.py - merge.py * update formatting * remove commented out code * update func: update_config to save_config - drop kwargs - rename func as save_config - incorporate review suggestions * update func: save_config - ensure only config-saving functionality - function does not return config as a dict anymore - added review suggestions * fixed formatting * update formatting instruction in contribution guide * nits --------- Co-authored-by: Awni Hannun <awni@apple.com> 2024-03-14 21:36:05 +08:00			`save_config(config, config_path=save_path / "config.json")`
feat: move lora into mlx-lm (#337) * feat: Add lora and qlora training to mlx-lm --------- Co-authored-by: Awni Hannun <awni@apple.com> 2024-01-24 00:44:37 +08:00
feat(mlx-lm): export the GGUF (fp16) format model weights from fuse.py (#555) * wip * wip * feat: convert mlx model to gguf f16 * chore: convert norm layer to float32 to avoid overflow issue * chore: add support for mixtral * chore: clean up * chore: remove unused import statement * chore: clean up weight name mapping * version and readme * actual version bump --------- Co-authored-by: Awni Hannun <awni@apple.com> 2024-03-22 01:34:11 +08:00			`if args.export_gguf:`
			`model_type = config["model_type"]`
			`if model_type not in ["llama", "mixtral", "mistral"]:`
			`raise ValueError(`
			`f"Model type {model_type} not supported for GGUF conversion."`
			`)`
			`convert_to_gguf(model_path, weights, config, str(save_path / args.gguf_path))`

feat: move lora into mlx-lm (#337) * feat: Add lora and qlora training to mlx-lm --------- Co-authored-by: Awni Hannun <awni@apple.com> 2024-01-24 00:44:37 +08:00			`if args.upload_repo is not None:`
			`hf_path = args.hf_path or (`
			`args.model if not Path(args.model).exists() else None`
			`)`
			`if hf_path is None:`
			`raise ValueError(`
			`"Must provide original Hugging Face repo to upload local model."`
			`)`
			`upload_to_hub(args.save_path, args.upload_repo, hf_path)`


			`if __name__ == "__main__":`
			`main()`