mlx-examples/llms/mlx_lm/tuner/utils.py

import os

import mlx.core as mx
import mlx.nn as nn
from mlx.utils import tree_unflatten

from .lora import LoRALinear


def linear_to_lora_layers(model: nn.Module, num_lora_layers: int):
    """
    Swap selected linear projections for LoRA layers, modifying the model
    in place.

    Only the final ``num_lora_layers`` blocks are touched. Where the blocks
    live and which projections get wrapped depends on ``model.model_type``.

    Args:
        model (nn.Module): The model to modify; its ``model_type`` attribute
        selects the branch taken below.
        num_lora_layers (int): How many blocks, counted from the end of the
        model, receive LoRA layers.

    Raises:
        ValueError: If ``num_lora_layers`` is larger than the number of
        blocks in the model, or if ``model.model_type`` is not handled here.
    """
    # Model types whose blocks are in ``model.model.layers`` and whose
    # attention exposes separate ``q_proj`` and ``v_proj`` projections.
    qv_proj_types = (
        "mistral",
        "llama",
        "phi",
        "mixtral",
        "stablelm",
        "qwen2",
        "gemma",
        "starcoder2",
    )

    def trailing_blocks(blocks):
        # Validate the request against the block count, then return the tail.
        total = len(blocks)
        if num_lora_layers > total:
            raise ValueError(
                f"Requested {num_lora_layers} LoRA layers "
                f"but the model only has {total} layers."
            )
        # An explicit start index keeps a request of 0 selecting no blocks.
        return blocks[total - num_lora_layers :]

    kind = model.model_type

    if kind in qv_proj_types:
        for block in trailing_blocks(model.model.layers):
            attention = block.self_attn
            attention.q_proj = LoRALinear.from_linear(attention.q_proj)
            attention.v_proj = LoRALinear.from_linear(attention.v_proj)
            # Blocks that carry a sparse MoE also get their gate wrapped.
            if hasattr(block, "block_sparse_moe"):
                sparse_moe = block.block_sparse_moe
                sparse_moe.gate = LoRALinear.from_linear(sparse_moe.gate)
        return

    if kind == "olmo":
        for block in trailing_blocks(model.model.transformer.blocks):
            block.att_proj = LoRALinear.from_linear(block.att_proj)
        return

    if kind == "phi-msft":
        for block in trailing_blocks(model.transformer.h):
            mixer = block.mixer
            mixer.Wqkv = LoRALinear.from_linear(mixer.Wqkv)
            moe = block.moe
            moe.gate = LoRALinear.from_linear(moe.gate)
        return

    raise ValueError(f"Lora does not support {kind}")


def apply_lora_layers(model: nn.Module, adapter_file: str) -> nn.Module:
    """
    Wrap the layers named in an adapter file with LoRA and load its weights.

    Each entry name in the adapter file has ".lora_a" / ".lora_b" removed;
    every module of the model whose name equals one of the resulting names
    is replaced by ``LoRALinear.from_linear(module)``. The adapter arrays
    are then loaded into the model with ``model.update``.

    Args:
        model (nn.Module): The neural network model; it is modified in place.
        adapter_file (str): Path to the adapter file, read with ``mx.load``.

    Returns:
        nn.Module: The same model object with LoRA layers applied.

    Raises:
        FileNotFoundError: If ``adapter_file`` does not exist.
    """
    if not os.path.exists(adapter_file):
        raise FileNotFoundError(f"The adapter file does not exist: {adapter_file}")

    adapters = list(mx.load(adapter_file).items())

    # Names of the modules that own the adapter entries.
    lora_layers = {
        name.replace(".lora_a", "").replace(".lora_b", "") for name, _ in adapters
    }
    linear_replacements = [
        (name, LoRALinear.from_linear(module))
        for name, module in model.named_modules()
        if name in lora_layers
    ]

    # Only rebuild the tree when there is something to apply, mirroring the
    # emptiness guards used by dequantize and remove_lora_layers.
    # NOTE(review): tree_unflatten appears to index the first entry of its
    # input, so an empty list is never handed to it - confirm against
    # mlx.utils.
    if linear_replacements:
        model.update_modules(tree_unflatten(linear_replacements))

    if adapters:
        model.update(tree_unflatten(adapters))

    return model


def dequantize(model: nn.Module) -> nn.Module:
    """
    Replace each quantized linear layer of the model with a regular one.

    Every ``nn.QuantizedLinear`` module is expanded with ``mx.dequantize``,
    cast to float16 and stored in a new ``nn.Linear``; an existing bias is
    carried over as is. The model is modified in place.

    Args:
        model (nn.Module): The model with quantized linear layers.

    Returns:
        nn.Module: The same model object with dequantized layers.
    """
    replacements = []
    for path, layer in model.named_modules():
        if not isinstance(layer, nn.QuantizedLinear):
            continue

        has_bias = "bias" in layer
        full_weight = mx.dequantize(
            layer.weight,
            layer.scales,
            layer.biases,
            layer.group_size,
            layer.bits,
        ).astype(mx.float16)

        # The dequantized weight is laid out as (output dims, input dims).
        out_features, in_features = full_weight.shape
        dense = nn.Linear(in_features, out_features, bias=has_bias)
        dense.weight = full_weight
        if has_bias:
            dense.bias = layer.bias
        replacements.append((path, dense))

    # Leave the model untouched when it holds no quantized layers.
    if replacements:
        model.update_modules(tree_unflatten(replacements))
    return model


def remove_lora_layers(model: nn.Module) -> nn.Module:
    """
    Strip the LoRA wrappers from the model.

    Each ``LoRALinear`` module is swapped for the layer stored in its
    ``linear`` attribute. The model is modified in place.

    Args:
        model (nn.Module): The model with LoRA layers.

    Returns:
        nn.Module: The same model object without LoRA layers.
    """
    unwrapped = [
        (path, layer.linear)
        for path, layer in model.named_modules()
        if isinstance(layer, LoRALinear)
    ]
    # Nothing to restore when the model holds no LoRA layers.
    if unwrapped:
        model.update_modules(tree_unflatten(unwrapped))
    return model
feat(mlx-lm): add de-quant for fuse.py (#365) * feat(mlx-lm): add de-quant for fuse * chore: disable quant in to linear when de-quant enabled * chore: add better error handling for adapter file not found 2024-01-26 10:59:32 +08:00			`import os`

feat: move lora into mlx-lm (#337) * feat: Add lora and qlora training to mlx-lm --------- Co-authored-by: Awni Hannun <awni@apple.com> 2024-01-24 00:44:37 +08:00			`import mlx.core as mx`
chore(mlx-lm): add load model with adapter and fix bug in sample (#360) * chore: add load model with adapter support and fix bug in sample * chore: ignore temp during calculating prob in sample 2024-01-24 11:47:39 +08:00			`import mlx.nn as nn`
feat: move lora into mlx-lm (#337) * feat: Add lora and qlora training to mlx-lm --------- Co-authored-by: Awni Hannun <awni@apple.com> 2024-01-24 00:44:37 +08:00			`from mlx.utils import tree_unflatten`

			`from .lora import LoRALinear`


Lazy import + refactor Lora layer addition (#426) * lazy model import in mlx_lm * change lora loading * fix olmo lora * remove a bunch of unused stuff from plamo * move phixtral to mlx-lm and out of llms/ 2024-02-13 02:51:02 +08:00			`def linear_to_lora_layers(model: nn.Module, num_lora_layers: int):`
			`"""`
			`Convert some of the models linear layers to lora layers.`

			`Args:`
			`model (nn.Module): The neural network model.`
			`num_lora_layers (int): The number of blocks to convert to lora layers`
			`starting from the last layer.`
			`"""`
LoRA: Improve validation error for LoRA layer count exceeding model layer (#427) * LoRA: Improve validation error for LoRA layer count exceeding model layer This commit enhances the error handling when the specified LoRA layer count exceeds the total number of layers in the model. It clarifies the error message to provide actionable feedback for users, guiding them to adjust their input parameters accordingly. * format + nits --------- Co-authored-by: Awni Hannun <awni@apple.com> 2024-02-13 22:56:27 +08:00
			`def check_lora_layers(num_model):`
			`if num_lora_layers > num_model:`
			`raise ValueError(`
			`f"Requested {num_lora_layers} LoRA layers "`
fix: check LoRA layers number error (#446) 2024-02-16 22:03:33 +08:00			`f"but the model only has {num_model} layers."`
LoRA: Improve validation error for LoRA layer count exceeding model layer (#427) * LoRA: Improve validation error for LoRA layer count exceeding model layer This commit enhances the error handling when the specified LoRA layer count exceeds the total number of layers in the model. It clarifies the error message to provide actionable feedback for users, guiding them to adjust their input parameters accordingly. * format + nits --------- Co-authored-by: Awni Hannun <awni@apple.com> 2024-02-13 22:56:27 +08:00			`)`

Lazy import + refactor Lora layer addition (#426) * lazy model import in mlx_lm * change lora loading * fix olmo lora * remove a bunch of unused stuff from plamo * move phixtral to mlx-lm and out of llms/ 2024-02-13 02:51:02 +08:00			`if model.model_type in [`
			`"mistral",`
			`"llama",`
			`"phi",`
			`"mixtral",`
Update to StableLM code (#514) * StableLM now part of Transformers as stablelm rather than stablelm_epoch; changed config to match new changes * removing old file * reference new stablelm 2024-03-02 01:53:38 +08:00			`"stablelm",`
Lazy import + refactor Lora layer addition (#426) * lazy model import in mlx_lm * change lora loading * fix olmo lora * remove a bunch of unused stuff from plamo * move phixtral to mlx-lm and out of llms/ 2024-02-13 02:51:02 +08:00			`"qwen2",`
Gemma support (#474) * gemma support * format * lora support for gemma 2024-02-22 00:47:13 +08:00			`"gemma",`
Add Starcoder 2 (#502) * Add Starcoder2 model and update utils.py * Refactor model arguments and modules in starcoder2.py * Refactor FeedForward class to MLP in starcoder2.py * Fix typo * pre-commit * Refactor starcoder2.py: Update model arguments and modules * Fix LM head and MLP layers * Rename input layer norm * Update bias in linear layers * Refactor token embeddings in Starcoder2Model * Rename to standard HF attention layer name * Add LayerNorm * Add transposed token embeddings (like in Gemma) * Refactor MLP and TransformerBlock classes * Add tie_word_embeddings option to ModelArgs and update Model implementation * Add conditional check for tying word embeddings in Starcoder2Model * Fix bias in lm_head linear layer * Remove unused LayerNorm in stablelm * Update transformers dependency to use GitHub repository * fix lm head bug, revert transformer req * Update RoPE initialization in Attention class --------- Co-authored-by: Awni Hannun <awni@apple.com> 2024-03-03 11:39:23 +08:00			`"starcoder2",`
Lazy import + refactor Lora layer addition (#426) * lazy model import in mlx_lm * change lora loading * fix olmo lora * remove a bunch of unused stuff from plamo * move phixtral to mlx-lm and out of llms/ 2024-02-13 02:51:02 +08:00			`]:`
LoRA: Improve validation error for LoRA layer count exceeding model layer (#427) * LoRA: Improve validation error for LoRA layer count exceeding model layer This commit enhances the error handling when the specified LoRA layer count exceeds the total number of layers in the model. It clarifies the error message to provide actionable feedback for users, guiding them to adjust their input parameters accordingly. * format + nits --------- Co-authored-by: Awni Hannun <awni@apple.com> 2024-02-13 22:56:27 +08:00			`check_lora_layers(len(model.model.layers))`

Lazy import + refactor Lora layer addition (#426) * lazy model import in mlx_lm * change lora loading * fix olmo lora * remove a bunch of unused stuff from plamo * move phixtral to mlx-lm and out of llms/ 2024-02-13 02:51:02 +08:00			`for l in model.model.layers[len(model.model.layers) - num_lora_layers :]:`
			`l.self_attn.q_proj = LoRALinear.from_linear(l.self_attn.q_proj)`
			`l.self_attn.v_proj = LoRALinear.from_linear(l.self_attn.v_proj)`
			`if hasattr(l, "block_sparse_moe"):`
			`l.block_sparse_moe.gate = LoRALinear.from_linear(`
			`l.block_sparse_moe.gate`
			`)`
			`elif model.model_type == "olmo":`
LoRA: Improve validation error for LoRA layer count exceeding model layer (#427) * LoRA: Improve validation error for LoRA layer count exceeding model layer This commit enhances the error handling when the specified LoRA layer count exceeds the total number of layers in the model. It clarifies the error message to provide actionable feedback for users, guiding them to adjust their input parameters accordingly. * format + nits --------- Co-authored-by: Awni Hannun <awni@apple.com> 2024-02-13 22:56:27 +08:00			`check_lora_layers(len(model.model.transformer.blocks))`

Lazy import + refactor Lora layer addition (#426) * lazy model import in mlx_lm * change lora loading * fix olmo lora * remove a bunch of unused stuff from plamo * move phixtral to mlx-lm and out of llms/ 2024-02-13 02:51:02 +08:00			`for l in model.model.transformer.blocks[`
			`len(model.model.transformer.blocks) - num_lora_layers :`
			`]:`
			`l.att_proj = LoRALinear.from_linear(l.att_proj)`
			`elif model.model_type == "phi-msft":`
LoRA: Improve validation error for LoRA layer count exceeding model layer (#427) * LoRA: Improve validation error for LoRA layer count exceeding model layer This commit enhances the error handling when the specified LoRA layer count exceeds the total number of layers in the model. It clarifies the error message to provide actionable feedback for users, guiding them to adjust their input parameters accordingly. * format + nits --------- Co-authored-by: Awni Hannun <awni@apple.com> 2024-02-13 22:56:27 +08:00			`check_lora_layers(len(model.transformer.h))`

Lazy import + refactor Lora layer addition (#426) * lazy model import in mlx_lm * change lora loading * fix olmo lora * remove a bunch of unused stuff from plamo * move phixtral to mlx-lm and out of llms/ 2024-02-13 02:51:02 +08:00			`for l in model.transformer.h[len(model.transformer.h) - num_lora_layers :]:`
			`l.mixer.Wqkv = LoRALinear.from_linear(l.mixer.Wqkv)`
			`l.moe.gate = LoRALinear.from_linear(l.moe.gate)`

			`else:`
			`raise ValueError(f"Lora does not support {model.model_type}")`


chore(mlx-lm): add load model with adapter and fix bug in sample (#360) * chore: add load model with adapter support and fix bug in sample * chore: ignore temp during calculating prob in sample 2024-01-24 11:47:39 +08:00			`def apply_lora_layers(model: nn.Module, adapter_file: str) -> nn.Module:`
feat(mlx-lm): add de-quant for fuse.py (#365) * feat(mlx-lm): add de-quant for fuse * chore: disable quant in to linear when de-quant enabled * chore: add better error handling for adapter file not found 2024-01-26 10:59:32 +08:00			`"""`
			`Apply LoRA layers to the model.`

			`Args:`
			`model (nn.Module): The neural network model.`
			`adapter_file (str): Path to the adapter configuration file.`

			`Returns:`
			`nn.Module: The updated model with LoRA layers applied.`
			`"""`
			`if not os.path.exists(adapter_file):`
			`raise FileNotFoundError(f"The adapter file does not exist: {adapter_file}")`

feat: move lora into mlx-lm (#337) * feat: Add lora and qlora training to mlx-lm --------- Co-authored-by: Awni Hannun <awni@apple.com> 2024-01-24 00:44:37 +08:00			`adapters = list(mx.load(adapter_file).items())`
feat(mlx-lm): add de-quant for fuse.py (#365) * feat(mlx-lm): add de-quant for fuse * chore: disable quant in to linear when de-quant enabled * chore: add better error handling for adapter file not found 2024-01-26 10:59:32 +08:00
			`linear_replacements = []`
feat: move lora into mlx-lm (#337) * feat: Add lora and qlora training to mlx-lm --------- Co-authored-by: Awni Hannun <awni@apple.com> 2024-01-24 00:44:37 +08:00			`lora_layers = set(`
			`[name.replace(".lora_a", "").replace(".lora_b", "") for name, _ in adapters]`
			`)`
			`for name, module in model.named_modules():`
			`if name in lora_layers:`
			`replacement_module = LoRALinear.from_linear(module)`
feat(mlx-lm): add de-quant for fuse.py (#365) * feat(mlx-lm): add de-quant for fuse * chore: disable quant in to linear when de-quant enabled * chore: add better error handling for adapter file not found 2024-01-26 10:59:32 +08:00			`linear_replacements.append((name, replacement_module))`

			`model.update_modules(tree_unflatten(linear_replacements))`
fix(mlx-lm): apply lora layer doesn't update the lora weights (#396) 2024-02-01 03:51:26 +08:00
			`model.update(tree_unflatten(adapters))`

feat(mlx-lm): add de-quant for fuse.py (#365) * feat(mlx-lm): add de-quant for fuse * chore: disable quant in to linear when de-quant enabled * chore: add better error handling for adapter file not found 2024-01-26 10:59:32 +08:00			`return model`


			`def dequantize(model: nn.Module) -> nn.Module:`
			`"""`
			`Dequantize the quantized linear layers in the model.`
feat: move lora into mlx-lm (#337) * feat: Add lora and qlora training to mlx-lm --------- Co-authored-by: Awni Hannun <awni@apple.com> 2024-01-24 00:44:37 +08:00
feat(mlx-lm): add de-quant for fuse.py (#365) * feat(mlx-lm): add de-quant for fuse * chore: disable quant in to linear when de-quant enabled * chore: add better error handling for adapter file not found 2024-01-26 10:59:32 +08:00			`Args:`
			`model (nn.Module): The model with quantized linear layers.`
feat: move lora into mlx-lm (#337) * feat: Add lora and qlora training to mlx-lm --------- Co-authored-by: Awni Hannun <awni@apple.com> 2024-01-24 00:44:37 +08:00
feat(mlx-lm): add de-quant for fuse.py (#365) * feat(mlx-lm): add de-quant for fuse * chore: disable quant in to linear when de-quant enabled * chore: add better error handling for adapter file not found 2024-01-26 10:59:32 +08:00			`Returns:`
			`nn.Module: The model with dequantized layers.`
			`"""`
			`de_quantize_layers = []`
chore(mlx-lm): add reset lora layers helper (#377) * chore(mlx-lm): add reset lora layers helper * chore: rename the func * chore: update docstring * Update llms/mlx_lm/tuner/utils.py Co-authored-by: Awni Hannun <awni.hannun@gmail.com> --------- Co-authored-by: Awni Hannun <awni.hannun@gmail.com> 2024-01-30 12:54:49 +08:00			`for name, module in model.named_modules():`
			`if isinstance(module, nn.QuantizedLinear):`
			`bias = "bias" in module`
			`weight = module.weight`
feat(mlx-lm): add de-quant for fuse.py (#365) * feat(mlx-lm): add de-quant for fuse * chore: disable quant in to linear when de-quant enabled * chore: add better error handling for adapter file not found 2024-01-26 10:59:32 +08:00			`weight = mx.dequantize(`
			`weight,`
chore(mlx-lm): add reset lora layers helper (#377) * chore(mlx-lm): add reset lora layers helper * chore: rename the func * chore: update docstring * Update llms/mlx_lm/tuner/utils.py Co-authored-by: Awni Hannun <awni.hannun@gmail.com> --------- Co-authored-by: Awni Hannun <awni.hannun@gmail.com> 2024-01-30 12:54:49 +08:00			`module.scales,`
			`module.biases,`
			`module.group_size,`
			`module.bits,`
feat(mlx-lm): add de-quant for fuse.py (#365) * feat(mlx-lm): add de-quant for fuse * chore: disable quant in to linear when de-quant enabled * chore: add better error handling for adapter file not found 2024-01-26 10:59:32 +08:00			`).astype(mx.float16)`
			`output_dims, input_dims = weight.shape`
			`linear = nn.Linear(input_dims, output_dims, bias=bias)`
			`linear.weight = weight`
			`if bias:`
chore(mlx-lm): add reset lora layers helper (#377) * chore(mlx-lm): add reset lora layers helper * chore: rename the func * chore: update docstring * Update llms/mlx_lm/tuner/utils.py Co-authored-by: Awni Hannun <awni.hannun@gmail.com> --------- Co-authored-by: Awni Hannun <awni.hannun@gmail.com> 2024-01-30 12:54:49 +08:00			`linear.bias = module.bias`
			`de_quantize_layers.append((name, linear))`
feat(mlx-lm): add de-quant for fuse.py (#365) * feat(mlx-lm): add de-quant for fuse * chore: disable quant in to linear when de-quant enabled * chore: add better error handling for adapter file not found 2024-01-26 10:59:32 +08:00			`if len(de_quantize_layers) > 0:`
			`model.update_modules(tree_unflatten(de_quantize_layers))`
feat: move lora into mlx-lm (#337) * feat: Add lora and qlora training to mlx-lm --------- Co-authored-by: Awni Hannun <awni@apple.com> 2024-01-24 00:44:37 +08:00			`return model`
chore(mlx-lm): add reset lora layers helper (#377) * chore(mlx-lm): add reset lora layers helper * chore: rename the func * chore: update docstring * Update llms/mlx_lm/tuner/utils.py Co-authored-by: Awni Hannun <awni.hannun@gmail.com> --------- Co-authored-by: Awni Hannun <awni.hannun@gmail.com> 2024-01-30 12:54:49 +08:00

			`def remove_lora_layers(model: nn.Module) -> nn.Module:`
			`"""`
			`Remove the LoRA layers from the model.`

			`Args:`
			`model (nn.Module): The model with LoRA layers.`

			`Returns:`
			`nn.Module: The model without LoRA layers.`
			`"""`
			`reset_layers = []`
			`for name, module in model.named_modules():`
			`if isinstance(module, LoRALinear):`
			`reset_layers.append((name, module.linear))`
			`if len(reset_layers) > 0:`
			`model.update_modules(tree_unflatten(reset_layers))`
			`return model`