mlx-examples/llms/mlx_lm/tuner/utils.py

import os
from typing import Dict

import mlx.core as mx
import mlx.nn as nn
from mlx.utils import tree_unflatten

from .lora import LoRALinear


def linear_to_lora_layers(
    model: nn.Module,
    num_lora_layers: int,
    config: Dict,
):
    """
    Convert some of the models linear layers to lora layers.

    Args:
        model (nn.Module): The neural network model.
        num_lora_layers (int): The number of blocks to convert to lora layers
        starting from the last layer.
        config (dict): More configuration parameters for LoRA, including the
          rank, alpha, scale, and optional layer keys.
    """

    num_layers = len(model.layers)
    if num_lora_layers > num_layers:
        raise ValueError(
            f"Requested {num_lora_layers} LoRA layers "
            f"but the model only has {num_layers} layers."
        )

    to_lora = lambda lin: LoRALinear.from_linear(
        lin,
        r=config["rank"],
        alpha=config["alpha"],
        scale=config["scale"],
        dropout=config["dropout"],
    )

    keys = config.get("keys", None)
    if keys is not None:
        keys = set(keys)
    elif model.model_type in [
        "mistral",
        "llama",
        "phi",
        "mixtral",
        "stablelm",
        "qwen2",
        "gemma",
        "starcoder2",
        "cohere",
    ]:
        keys = set(["self_attn.q_proj", "self_attn.v_proj"])
        if model.model_type == "mixtral":
            keys.add("block_sparse_moe.gate")
    elif model.model_type == "olmo":
        keys = set(["att_proj"])
    elif model.model_type == "phi-msft":
        keys = set(["mixer.Wqkv", "moe.gate"])
    elif model.model_type == "dbrx":
        keys = set(["norm_attn_norm.attn.Wqkv", "ffn.router.layer"])
    else:
        raise ValueError(f"Lora does not support {model.model_type}")

    for l in model.layers[num_layers - num_lora_layers :]:
        modules = l.named_modules()
        lora_layers = [(k, to_lora(m)) for k, m in l.named_modules() if k in keys]
        l.update_modules(tree_unflatten(lora_layers))


def apply_lora_layers(model: nn.Module, adapter_file: str) -> nn.Module:
    """
    Apply LoRA layers to the model.

    Args:
        model (nn.Module): The neural network model.
        adapter_file (str): Path to the adapter configuration file.

    Returns:
        nn.Module: The updated model with LoRA layers applied.
    """
    if not os.path.exists(adapter_file):
        raise FileNotFoundError(f"The adapter file does not exist: {adapter_file}")

    adapters = list(mx.load(adapter_file).items())

    linear_replacements = []
    lora_layers = set(
        [name.replace(".lora_a", "").replace(".lora_b", "") for name, _ in adapters]
    )
    for name, module in model.named_modules():
        if name in lora_layers:
            replacement_module = LoRALinear.from_linear(module)
            linear_replacements.append((name, replacement_module))

    model.update_modules(tree_unflatten(linear_replacements))

    model.update(tree_unflatten(adapters))

    return model


def dequantize(model: nn.Module) -> nn.Module:
    """
    Dequantize the quantized linear layers in the model.

    Args:
        model (nn.Module): The model with quantized linear layers.

    Returns:
        nn.Module: The model with dequantized layers.
    """
    de_quantize_layers = []
    for name, module in model.named_modules():
        if isinstance(module, nn.QuantizedLinear):
            bias = "bias" in module
            weight = module.weight
            weight = mx.dequantize(
                weight,
                module.scales,
                module.biases,
                module.group_size,
                module.bits,
            ).astype(mx.float16)
            output_dims, input_dims = weight.shape
            linear = nn.Linear(input_dims, output_dims, bias=bias)
            linear.weight = weight
            if bias:
                linear.bias = module.bias
            de_quantize_layers.append((name, linear))
    if len(de_quantize_layers) > 0:
        model.update_modules(tree_unflatten(de_quantize_layers))
    return model


def remove_lora_layers(model: nn.Module) -> nn.Module:
    """
    Remove the LoRA layers from the model.

    Args:
        model (nn.Module): The model with LoRA layers.

    Returns:
        nn.Module: The model without LoRA layers.
    """
    reset_layers = []
    for name, module in model.named_modules():
        if isinstance(module, LoRALinear):
            reset_layers.append((name, module.linear))
    if len(reset_layers) > 0:
        model.update_modules(tree_unflatten(reset_layers))
    return model
feat(mlx-lm): add de-quant for fuse.py (#365) * feat(mlx-lm): add de-quant for fuse * chore: disable quant in to linear when de-quant enabled * chore: add better error handling for adapter file not found 2024-01-26 10:59:32 +08:00			`import os`
LoRA on all linear transformer block layers (#546) * Add --lora-all-linear option to apply LoRa to all linear transfer block layers * Moved to YAML config and added specification of rank & alpha * nits in conifg, more tests * nit * run tests for prs --------- Co-authored-by: Awni Hannun <awni@apple.com> 2024-03-12 22:37:40 +08:00			`from typing import Dict`
feat(mlx-lm): add de-quant for fuse.py (#365) * feat(mlx-lm): add de-quant for fuse * chore: disable quant in to linear when de-quant enabled * chore: add better error handling for adapter file not found 2024-01-26 10:59:32 +08:00
feat: move lora into mlx-lm (#337) * feat: Add lora and qlora training to mlx-lm --------- Co-authored-by: Awni Hannun <awni@apple.com> 2024-01-24 00:44:37 +08:00			`import mlx.core as mx`
chore(mlx-lm): add load model with adapter and fix bug in sample (#360) * chore: add load model with adapter support and fix bug in sample * chore: ignore temp during calculating prob in sample 2024-01-24 11:47:39 +08:00			`import mlx.nn as nn`
feat: move lora into mlx-lm (#337) * feat: Add lora and qlora training to mlx-lm --------- Co-authored-by: Awni Hannun <awni@apple.com> 2024-01-24 00:44:37 +08:00			`from mlx.utils import tree_unflatten`

			`from .lora import LoRALinear`


LoRA on all linear transformer block layers (#546) * Add --lora-all-linear option to apply LoRa to all linear transfer block layers * Moved to YAML config and added specification of rank & alpha * nits in conifg, more tests * nit * run tests for prs --------- Co-authored-by: Awni Hannun <awni@apple.com> 2024-03-12 22:37:40 +08:00			`def linear_to_lora_layers(`
			`model: nn.Module,`
			`num_lora_layers: int,`
			`config: Dict,`
			`):`
Lazy import + refactor Lora layer addition (#426) * lazy model import in mlx_lm * change lora loading * fix olmo lora * remove a bunch of unused stuff from plamo * move phixtral to mlx-lm and out of llms/ 2024-02-13 02:51:02 +08:00			`"""`
			`Convert some of the models linear layers to lora layers.`

			`Args:`
			`model (nn.Module): The neural network model.`
			`num_lora_layers (int): The number of blocks to convert to lora layers`
			`starting from the last layer.`
LoRA on all linear transformer block layers (#546) * Add --lora-all-linear option to apply LoRa to all linear transfer block layers * Moved to YAML config and added specification of rank & alpha * nits in conifg, more tests * nit * run tests for prs --------- Co-authored-by: Awni Hannun <awni@apple.com> 2024-03-12 22:37:40 +08:00			`config (dict): More configuration parameters for LoRA, including the`
			`rank, alpha, scale, and optional layer keys.`
Lazy import + refactor Lora layer addition (#426) * lazy model import in mlx_lm * change lora loading * fix olmo lora * remove a bunch of unused stuff from plamo * move phixtral to mlx-lm and out of llms/ 2024-02-13 02:51:02 +08:00			`"""`
LoRA: Improve validation error for LoRA layer count exceeding model layer (#427) * LoRA: Improve validation error for LoRA layer count exceeding model layer This commit enhances the error handling when the specified LoRA layer count exceeds the total number of layers in the model. It clarifies the error message to provide actionable feedback for users, guiding them to adjust their input parameters accordingly. * format + nits --------- Co-authored-by: Awni Hannun <awni@apple.com> 2024-02-13 22:56:27 +08:00
LoRA on all linear transformer block layers (#546) * Add --lora-all-linear option to apply LoRa to all linear transfer block layers * Moved to YAML config and added specification of rank & alpha * nits in conifg, more tests * nit * run tests for prs --------- Co-authored-by: Awni Hannun <awni@apple.com> 2024-03-12 22:37:40 +08:00			`num_layers = len(model.layers)`
			`if num_lora_layers > num_layers:`
			`raise ValueError(`
			`f"Requested {num_lora_layers} LoRA layers "`
			`f"but the model only has {num_layers} layers."`
			`)`
LoRA: Improve validation error for LoRA layer count exceeding model layer (#427) * LoRA: Improve validation error for LoRA layer count exceeding model layer This commit enhances the error handling when the specified LoRA layer count exceeds the total number of layers in the model. It clarifies the error message to provide actionable feedback for users, guiding them to adjust their input parameters accordingly. * format + nits --------- Co-authored-by: Awni Hannun <awni@apple.com> 2024-02-13 22:56:27 +08:00
LoRA on all linear transformer block layers (#546) * Add --lora-all-linear option to apply LoRa to all linear transfer block layers * Moved to YAML config and added specification of rank & alpha * nits in conifg, more tests * nit * run tests for prs --------- Co-authored-by: Awni Hannun <awni@apple.com> 2024-03-12 22:37:40 +08:00			`to_lora = lambda lin: LoRALinear.from_linear(`
Add dropout parameter to lora configuration (#599) * Add dropout parameter to lora configuration A dropout parameter has been added to the lora configuration settings in lora_config.yaml. The LoRALinear class in utils.py has been updated to take this new parameter. Additionally, a AttributeError: 'types.SimpleNamespace' object has no attribute 'prompt' related to `args.prompt` has been removed from lora.py. * Update lora_config.yaml Set dropout to 0.0 in the sample config file * format --------- Co-authored-by: Awni Hannun <awni@apple.com> 2024-03-20 23:44:40 +08:00			`lin,`
			`r=config["rank"],`
			`alpha=config["alpha"],`
			`scale=config["scale"],`
			`dropout=config["dropout"],`
LoRA on all linear transformer block layers (#546) * Add --lora-all-linear option to apply LoRa to all linear transfer block layers * Moved to YAML config and added specification of rank & alpha * nits in conifg, more tests * nit * run tests for prs --------- Co-authored-by: Awni Hannun <awni@apple.com> 2024-03-12 22:37:40 +08:00			`)`

			`keys = config.get("keys", None)`
			`if keys is not None:`
			`keys = set(keys)`
			`elif model.model_type in [`
Lazy import + refactor Lora layer addition (#426) * lazy model import in mlx_lm * change lora loading * fix olmo lora * remove a bunch of unused stuff from plamo * move phixtral to mlx-lm and out of llms/ 2024-02-13 02:51:02 +08:00			`"mistral",`
			`"llama",`
			`"phi",`
			`"mixtral",`
Update to StableLM code (#514) * StableLM now part of Transformers as stablelm rather than stablelm_epoch; changed config to match new changes * removing old file * reference new stablelm 2024-03-02 01:53:38 +08:00			`"stablelm",`
Lazy import + refactor Lora layer addition (#426) * lazy model import in mlx_lm * change lora loading * fix olmo lora * remove a bunch of unused stuff from plamo * move phixtral to mlx-lm and out of llms/ 2024-02-13 02:51:02 +08:00			`"qwen2",`
Gemma support (#474) * gemma support * format * lora support for gemma 2024-02-22 00:47:13 +08:00			`"gemma",`
Add Starcoder 2 (#502) * Add Starcoder2 model and update utils.py * Refactor model arguments and modules in starcoder2.py * Refactor FeedForward class to MLP in starcoder2.py * Fix typo * pre-commit * Refactor starcoder2.py: Update model arguments and modules * Fix LM head and MLP layers * Rename input layer norm * Update bias in linear layers * Refactor token embeddings in Starcoder2Model * Rename to standard HF attention layer name * Add LayerNorm * Add transposed token embeddings (like in Gemma) * Refactor MLP and TransformerBlock classes * Add tie_word_embeddings option to ModelArgs and update Model implementation * Add conditional check for tying word embeddings in Starcoder2Model * Fix bias in lm_head linear layer * Remove unused LayerNorm in stablelm * Update transformers dependency to use GitHub repository * fix lm head bug, revert transformer req * Update RoPE initialization in Attention class --------- Co-authored-by: Awni Hannun <awni@apple.com> 2024-03-03 11:39:23 +08:00			`"starcoder2",`
version (#570) 2024-03-14 01:09:36 +08:00			`"cohere",`
Lazy import + refactor Lora layer addition (#426) * lazy model import in mlx_lm * change lora loading * fix olmo lora * remove a bunch of unused stuff from plamo * move phixtral to mlx-lm and out of llms/ 2024-02-13 02:51:02 +08:00			`]:`
LoRA on all linear transformer block layers (#546) * Add --lora-all-linear option to apply LoRa to all linear transfer block layers * Moved to YAML config and added specification of rank & alpha * nits in conifg, more tests * nit * run tests for prs --------- Co-authored-by: Awni Hannun <awni@apple.com> 2024-03-12 22:37:40 +08:00			`keys = set(["self_attn.q_proj", "self_attn.v_proj"])`
			`if model.model_type == "mixtral":`
Some improvements to LoRA (#528) * set cache_limit * remove set cache_limit * cleanup * add gradient checkpointing * fix sort * mokey patch call for checkpoint * fix example config 2024-03-13 11:02:03 +08:00			`keys.add("block_sparse_moe.gate")`
Lazy import + refactor Lora layer addition (#426) * lazy model import in mlx_lm * change lora loading * fix olmo lora * remove a bunch of unused stuff from plamo * move phixtral to mlx-lm and out of llms/ 2024-02-13 02:51:02 +08:00			`elif model.model_type == "olmo":`
LoRA on all linear transformer block layers (#546) * Add --lora-all-linear option to apply LoRa to all linear transfer block layers * Moved to YAML config and added specification of rank & alpha * nits in conifg, more tests * nit * run tests for prs --------- Co-authored-by: Awni Hannun <awni@apple.com> 2024-03-12 22:37:40 +08:00			`keys = set(["att_proj"])`
Lazy import + refactor Lora layer addition (#426) * lazy model import in mlx_lm * change lora loading * fix olmo lora * remove a bunch of unused stuff from plamo * move phixtral to mlx-lm and out of llms/ 2024-02-13 02:51:02 +08:00			`elif model.model_type == "phi-msft":`
LoRA on all linear transformer block layers (#546) * Add --lora-all-linear option to apply LoRa to all linear transfer block layers * Moved to YAML config and added specification of rank & alpha * nits in conifg, more tests * nit * run tests for prs --------- Co-authored-by: Awni Hannun <awni@apple.com> 2024-03-12 22:37:40 +08:00			`keys = set(["mixer.Wqkv", "moe.gate"])`
DBRX (#628) * dbrx * format * format * comments * change scores slightly * remove inadvertant import 2024-03-29 12:03:53 +08:00			`elif model.model_type == "dbrx":`
			`keys = set(["norm_attn_norm.attn.Wqkv", "ffn.router.layer"])`
Lazy import + refactor Lora layer addition (#426) * lazy model import in mlx_lm * change lora loading * fix olmo lora * remove a bunch of unused stuff from plamo * move phixtral to mlx-lm and out of llms/ 2024-02-13 02:51:02 +08:00			`else:`
			`raise ValueError(f"Lora does not support {model.model_type}")`

LoRA on all linear transformer block layers (#546) * Add --lora-all-linear option to apply LoRa to all linear transfer block layers * Moved to YAML config and added specification of rank & alpha * nits in conifg, more tests * nit * run tests for prs --------- Co-authored-by: Awni Hannun <awni@apple.com> 2024-03-12 22:37:40 +08:00			`for l in model.layers[num_layers - num_lora_layers :]:`
			`modules = l.named_modules()`
			`lora_layers = [(k, to_lora(m)) for k, m in l.named_modules() if k in keys]`
			`l.update_modules(tree_unflatten(lora_layers))`

Lazy import + refactor Lora layer addition (#426) * lazy model import in mlx_lm * change lora loading * fix olmo lora * remove a bunch of unused stuff from plamo * move phixtral to mlx-lm and out of llms/ 2024-02-13 02:51:02 +08:00
chore(mlx-lm): add load model with adapter and fix bug in sample (#360) * chore: add load model with adapter support and fix bug in sample * chore: ignore temp during calculating prob in sample 2024-01-24 11:47:39 +08:00			`def apply_lora_layers(model: nn.Module, adapter_file: str) -> nn.Module:`
feat(mlx-lm): add de-quant for fuse.py (#365) * feat(mlx-lm): add de-quant for fuse * chore: disable quant in to linear when de-quant enabled * chore: add better error handling for adapter file not found 2024-01-26 10:59:32 +08:00			`"""`
			`Apply LoRA layers to the model.`

			`Args:`
			`model (nn.Module): The neural network model.`
			`adapter_file (str): Path to the adapter configuration file.`

			`Returns:`
			`nn.Module: The updated model with LoRA layers applied.`
			`"""`
			`if not os.path.exists(adapter_file):`
			`raise FileNotFoundError(f"The adapter file does not exist: {adapter_file}")`

feat: move lora into mlx-lm (#337) * feat: Add lora and qlora training to mlx-lm --------- Co-authored-by: Awni Hannun <awni@apple.com> 2024-01-24 00:44:37 +08:00			`adapters = list(mx.load(adapter_file).items())`
feat(mlx-lm): add de-quant for fuse.py (#365) * feat(mlx-lm): add de-quant for fuse * chore: disable quant in to linear when de-quant enabled * chore: add better error handling for adapter file not found 2024-01-26 10:59:32 +08:00
			`linear_replacements = []`
feat: move lora into mlx-lm (#337) * feat: Add lora and qlora training to mlx-lm --------- Co-authored-by: Awni Hannun <awni@apple.com> 2024-01-24 00:44:37 +08:00			`lora_layers = set(`
			`[name.replace(".lora_a", "").replace(".lora_b", "") for name, _ in adapters]`
			`)`
			`for name, module in model.named_modules():`
			`if name in lora_layers:`
			`replacement_module = LoRALinear.from_linear(module)`
feat(mlx-lm): add de-quant for fuse.py (#365) * feat(mlx-lm): add de-quant for fuse * chore: disable quant in to linear when de-quant enabled * chore: add better error handling for adapter file not found 2024-01-26 10:59:32 +08:00			`linear_replacements.append((name, replacement_module))`

			`model.update_modules(tree_unflatten(linear_replacements))`
fix(mlx-lm): apply lora layer doesn't update the lora weights (#396) 2024-02-01 03:51:26 +08:00
			`model.update(tree_unflatten(adapters))`

feat(mlx-lm): add de-quant for fuse.py (#365) * feat(mlx-lm): add de-quant for fuse * chore: disable quant in to linear when de-quant enabled * chore: add better error handling for adapter file not found 2024-01-26 10:59:32 +08:00			`return model`


			`def dequantize(model: nn.Module) -> nn.Module:`
			`"""`
			`Dequantize the quantized linear layers in the model.`
feat: move lora into mlx-lm (#337) * feat: Add lora and qlora training to mlx-lm --------- Co-authored-by: Awni Hannun <awni@apple.com> 2024-01-24 00:44:37 +08:00
feat(mlx-lm): add de-quant for fuse.py (#365) * feat(mlx-lm): add de-quant for fuse * chore: disable quant in to linear when de-quant enabled * chore: add better error handling for adapter file not found 2024-01-26 10:59:32 +08:00			`Args:`
			`model (nn.Module): The model with quantized linear layers.`
feat: move lora into mlx-lm (#337) * feat: Add lora and qlora training to mlx-lm --------- Co-authored-by: Awni Hannun <awni@apple.com> 2024-01-24 00:44:37 +08:00
feat(mlx-lm): add de-quant for fuse.py (#365) * feat(mlx-lm): add de-quant for fuse * chore: disable quant in to linear when de-quant enabled * chore: add better error handling for adapter file not found 2024-01-26 10:59:32 +08:00			`Returns:`
			`nn.Module: The model with dequantized layers.`
			`"""`
			`de_quantize_layers = []`
chore(mlx-lm): add reset lora layers helper (#377) * chore(mlx-lm): add reset lora layers helper * chore: rename the func * chore: update docstring * Update llms/mlx_lm/tuner/utils.py Co-authored-by: Awni Hannun <awni.hannun@gmail.com> --------- Co-authored-by: Awni Hannun <awni.hannun@gmail.com> 2024-01-30 12:54:49 +08:00			`for name, module in model.named_modules():`
			`if isinstance(module, nn.QuantizedLinear):`
			`bias = "bias" in module`
			`weight = module.weight`
feat(mlx-lm): add de-quant for fuse.py (#365) * feat(mlx-lm): add de-quant for fuse * chore: disable quant in to linear when de-quant enabled * chore: add better error handling for adapter file not found 2024-01-26 10:59:32 +08:00			`weight = mx.dequantize(`
			`weight,`
chore(mlx-lm): add reset lora layers helper (#377) * chore(mlx-lm): add reset lora layers helper * chore: rename the func * chore: update docstring * Update llms/mlx_lm/tuner/utils.py Co-authored-by: Awni Hannun <awni.hannun@gmail.com> --------- Co-authored-by: Awni Hannun <awni.hannun@gmail.com> 2024-01-30 12:54:49 +08:00			`module.scales,`
			`module.biases,`
			`module.group_size,`
			`module.bits,`
feat(mlx-lm): add de-quant for fuse.py (#365) * feat(mlx-lm): add de-quant for fuse * chore: disable quant in to linear when de-quant enabled * chore: add better error handling for adapter file not found 2024-01-26 10:59:32 +08:00			`).astype(mx.float16)`
			`output_dims, input_dims = weight.shape`
			`linear = nn.Linear(input_dims, output_dims, bias=bias)`
			`linear.weight = weight`
			`if bias:`
chore(mlx-lm): add reset lora layers helper (#377) * chore(mlx-lm): add reset lora layers helper * chore: rename the func * chore: update docstring * Update llms/mlx_lm/tuner/utils.py Co-authored-by: Awni Hannun <awni.hannun@gmail.com> --------- Co-authored-by: Awni Hannun <awni.hannun@gmail.com> 2024-01-30 12:54:49 +08:00			`linear.bias = module.bias`
			`de_quantize_layers.append((name, linear))`
feat(mlx-lm): add de-quant for fuse.py (#365) * feat(mlx-lm): add de-quant for fuse * chore: disable quant in to linear when de-quant enabled * chore: add better error handling for adapter file not found 2024-01-26 10:59:32 +08:00			`if len(de_quantize_layers) > 0:`
			`model.update_modules(tree_unflatten(de_quantize_layers))`
feat: move lora into mlx-lm (#337) * feat: Add lora and qlora training to mlx-lm --------- Co-authored-by: Awni Hannun <awni@apple.com> 2024-01-24 00:44:37 +08:00			`return model`
chore(mlx-lm): add reset lora layers helper (#377) * chore(mlx-lm): add reset lora layers helper * chore: rename the func * chore: update docstring * Update llms/mlx_lm/tuner/utils.py Co-authored-by: Awni Hannun <awni.hannun@gmail.com> --------- Co-authored-by: Awni Hannun <awni.hannun@gmail.com> 2024-01-30 12:54:49 +08:00

			`def remove_lora_layers(model: nn.Module) -> nn.Module:`
			`"""`
			`Remove the LoRA layers from the model.`

			`Args:`
			`model (nn.Module): The model with LoRA layers.`

			`Returns:`
			`nn.Module: The model without LoRA layers.`
			`"""`
			`reset_layers = []`
			`for name, module in model.named_modules():`
			`if isinstance(module, LoRALinear):`
			`reset_layers.append((name, module.linear))`
			`if len(reset_layers) > 0:`
			`model.update_modules(tree_unflatten(reset_layers))`
			`return model`