mirror of
https://github.com/ml-explore/mlx-examples.git
synced 2025-10-24 06:28:07 +08:00
feat(mlx-lm): add de-quant for fuse.py (#365)
* feat(mlx-lm): add de-quant for fuse * chore: disable quant in to linear when de-quant enabled * chore: add better error handling for adapter file not found
This commit is contained in:
@@ -29,7 +29,7 @@ class LoRALinear(nn.Module):
|
||||
lora_lin.linear = linear
|
||||
return lora_lin
|
||||
|
||||
def to_linear(self):
|
||||
def to_linear(self, de_quantize: bool = False):
|
||||
linear = self.linear
|
||||
bias = "bias" in linear
|
||||
weight = linear.weight
|
||||
@@ -56,7 +56,7 @@ class LoRALinear(nn.Module):
|
||||
if bias:
|
||||
fused_linear.bias = linear.bias
|
||||
|
||||
if is_quantized:
|
||||
if is_quantized and not de_quantize:
|
||||
fused_linear = nn.QuantizedLinear.from_linear(
|
||||
fused_linear,
|
||||
linear.group_size,
|
||||
|
@@ -1,3 +1,5 @@
|
||||
import os
|
||||
|
||||
import mlx.core as mx
|
||||
import mlx.nn as nn
|
||||
from mlx.utils import tree_unflatten
|
||||
@@ -6,18 +8,62 @@ from .lora import LoRALinear
|
||||
|
||||
|
||||
def apply_lora_layers(model: nn.Module, adapter_file: str) -> nn.Module:
    """
    Apply LoRA layers to the model and load their trained weights.

    Args:
        model (nn.Module): The neural network model.
        adapter_file (str): Path to the adapter weights file (read with
            ``mx.load``).

    Returns:
        nn.Module: The same model, with the targeted layers wrapped in
        ``LoRALinear`` and the adapter weights loaded into them.

    Raises:
        FileNotFoundError: If ``adapter_file`` does not exist.
    """
    if not os.path.exists(adapter_file):
        raise FileNotFoundError(f"The adapter file does not exist: {adapter_file}")

    adapters = list(mx.load(adapter_file).items())

    # Adapter keys end in ".lora_a" / ".lora_b"; stripping the suffix yields
    # the path of the layer that has to be wrapped.
    lora_layers = {
        name.replace(".lora_a", "").replace(".lora_b", "") for name, _ in adapters
    }
    linear_replacements = [
        (name, LoRALinear.from_linear(module))
        for name, module in model.named_modules()
        if name in lora_layers
    ]

    model.update_modules(tree_unflatten(linear_replacements))

    # The wrappers must exist before the adapter tensors are loaded; without
    # this call the file's weights would only be used to derive layer names.
    model.update(tree_unflatten(adapters))
    return model
|
||||
|
||||
|
||||
def dequantize(model: nn.Module) -> nn.Module:
    """
    Swap every ``nn.QuantizedLinear`` in the model for a float16 ``nn.Linear``.

    Args:
        model (nn.Module): The model with quantized linear layers.

    Returns:
        nn.Module: The same model object, with its quantized linear layers
        replaced by dequantized ones.
    """
    replacements = []
    for path, layer in model.named_modules():
        if not isinstance(layer, nn.QuantizedLinear):
            continue

        has_bias = "bias" in layer
        # `biases` (plural) is passed to mx.dequantize together with `scales`;
        # the layer's own `bias` is carried over separately below.
        dense_weight = mx.dequantize(
            layer.weight,
            layer.scales,
            layer.biases,
            layer.group_size,
            layer.bits,
        ).astype(mx.float16)

        out_features, in_features = dense_weight.shape
        dense = nn.Linear(in_features, out_features, bias=has_bias)
        dense.weight = dense_weight
        if has_bias:
            dense.bias = layer.bias
        replacements.append((path, dense))

    # Only touch the module tree when something was actually converted.
    if replacements:
        model.update_modules(tree_unflatten(replacements))
    return model
|
||||
|
Reference in New Issue
Block a user