mirror of
https://github.com/ml-explore/mlx-examples.git
synced 2025-06-24 17:31:18 +08:00
feat(mlx-lm): add de-quant for fuse.py (#365)
* feat(mlx-lm): add de-quant for fuse
* chore: disable quant in to linear when de-quant enabled
* chore: add better error handling for adapter file not found
This commit is contained in:
parent
f51e98fcf1
commit
854ad8747a
@ -8,7 +8,7 @@ from typing import Any, Dict, Union
|
|||||||
from mlx.utils import tree_flatten, tree_unflatten
|
from mlx.utils import tree_flatten, tree_unflatten
|
||||||
|
|
||||||
from .tuner.lora import LoRALinear
|
from .tuner.lora import LoRALinear
|
||||||
from .tuner.utils import apply_lora_layers
|
from .tuner.utils import apply_lora_layers, dequantize
|
||||||
from .utils import fetch_from_hub, get_model_path, save_weights, upload_to_hub
|
from .utils import fetch_from_hub, get_model_path, save_weights, upload_to_hub
|
||||||
|
|
||||||
|
|
||||||
@ -42,6 +42,11 @@ def parse_arguments() -> argparse.Namespace:
|
|||||||
type=str,
|
type=str,
|
||||||
default=None,
|
default=None,
|
||||||
)
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--de-quantize",
|
||||||
|
help="Generate a de-quantized model.",
|
||||||
|
action="store_true",
|
||||||
|
)
|
||||||
return parser.parse_args()
|
return parser.parse_args()
|
||||||
|
|
||||||
|
|
||||||
@ -54,6 +59,7 @@ def main() -> None:
|
|||||||
|
|
||||||
model.freeze()
|
model.freeze()
|
||||||
model = apply_lora_layers(model, args.adapter_file)
|
model = apply_lora_layers(model, args.adapter_file)
|
||||||
|
|
||||||
fused_linears = [
|
fused_linears = [
|
||||||
(n, m.to_linear())
|
(n, m.to_linear())
|
||||||
for n, m in model.named_modules()
|
for n, m in model.named_modules()
|
||||||
@ -61,6 +67,11 @@ def main() -> None:
|
|||||||
]
|
]
|
||||||
|
|
||||||
model.update_modules(tree_unflatten(fused_linears))
|
model.update_modules(tree_unflatten(fused_linears))
|
||||||
|
|
||||||
|
if args.de_quantize:
|
||||||
|
print("De-quantizing model")
|
||||||
|
model = dequantize(model)
|
||||||
|
|
||||||
weights = dict(tree_flatten(model.parameters()))
|
weights = dict(tree_flatten(model.parameters()))
|
||||||
|
|
||||||
save_path = Path(args.save_path)
|
save_path = Path(args.save_path)
|
||||||
@ -73,6 +84,9 @@ def main() -> None:
|
|||||||
|
|
||||||
tokenizer.save_pretrained(save_path)
|
tokenizer.save_pretrained(save_path)
|
||||||
|
|
||||||
|
if args.de_quantize:
|
||||||
|
config.pop("quantization", None)
|
||||||
|
|
||||||
with open(save_path / "config.json", "w") as fid:
|
with open(save_path / "config.json", "w") as fid:
|
||||||
json.dump(config, fid, indent=4)
|
json.dump(config, fid, indent=4)
|
||||||
|
|
||||||
|
@ -29,7 +29,7 @@ class LoRALinear(nn.Module):
|
|||||||
lora_lin.linear = linear
|
lora_lin.linear = linear
|
||||||
return lora_lin
|
return lora_lin
|
||||||
|
|
||||||
def to_linear(self):
|
def to_linear(self, de_quantize: bool = False):
|
||||||
linear = self.linear
|
linear = self.linear
|
||||||
bias = "bias" in linear
|
bias = "bias" in linear
|
||||||
weight = linear.weight
|
weight = linear.weight
|
||||||
@ -56,7 +56,7 @@ class LoRALinear(nn.Module):
|
|||||||
if bias:
|
if bias:
|
||||||
fused_linear.bias = linear.bias
|
fused_linear.bias = linear.bias
|
||||||
|
|
||||||
if is_quantized:
|
if is_quantized and not de_quantize:
|
||||||
fused_linear = nn.QuantizedLinear.from_linear(
|
fused_linear = nn.QuantizedLinear.from_linear(
|
||||||
fused_linear,
|
fused_linear,
|
||||||
linear.group_size,
|
linear.group_size,
|
||||||
|
@ -1,3 +1,5 @@
|
|||||||
|
import os
|
||||||
|
|
||||||
import mlx.core as mx
|
import mlx.core as mx
|
||||||
import mlx.nn as nn
|
import mlx.nn as nn
|
||||||
from mlx.utils import tree_unflatten
|
from mlx.utils import tree_unflatten
|
||||||
@ -6,18 +8,62 @@ from .lora import LoRALinear
|
|||||||
|
|
||||||
|
|
||||||
def apply_lora_layers(model: nn.Module, adapter_file: str) -> nn.Module:
|
def apply_lora_layers(model: nn.Module, adapter_file: str) -> nn.Module:
|
||||||
|
"""
|
||||||
|
Apply LoRA layers to the model.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
model (nn.Module): The neural network model.
|
||||||
|
adapter_file (str): Path to the adapter configuration file.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
nn.Module: The updated model with LoRA layers applied.
|
||||||
|
"""
|
||||||
|
if not os.path.exists(adapter_file):
|
||||||
|
raise FileNotFoundError(f"The adapter file does not exist: {adapter_file}")
|
||||||
|
|
||||||
adapters = list(mx.load(adapter_file).items())
|
adapters = list(mx.load(adapter_file).items())
|
||||||
linear_replacements = {}
|
|
||||||
|
linear_replacements = []
|
||||||
lora_layers = set(
|
lora_layers = set(
|
||||||
[name.replace(".lora_a", "").replace(".lora_b", "") for name, _ in adapters]
|
[name.replace(".lora_a", "").replace(".lora_b", "") for name, _ in adapters]
|
||||||
)
|
)
|
||||||
|
|
||||||
for name, module in model.named_modules():
|
for name, module in model.named_modules():
|
||||||
if name in lora_layers:
|
if name in lora_layers:
|
||||||
replacement_module = LoRALinear.from_linear(module)
|
replacement_module = LoRALinear.from_linear(module)
|
||||||
linear_replacements[name] = replacement_module
|
linear_replacements.append((name, replacement_module))
|
||||||
|
|
||||||
model.update_modules(tree_unflatten(list(linear_replacements.items())))
|
model.update_modules(tree_unflatten(linear_replacements))
|
||||||
|
return model
|
||||||
model.update(tree_unflatten(adapters))
|
|
||||||
|
|
||||||
|
def dequantize(model: nn.Module) -> nn.Module:
|
||||||
|
"""
|
||||||
|
Dequantize the quantized linear layers in the model.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
model (nn.Module): The model with quantized linear layers.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
nn.Module: The model with dequantized layers.
|
||||||
|
"""
|
||||||
|
de_quantize_layers = []
|
||||||
|
for n, m in model.named_modules():
|
||||||
|
if isinstance(m, nn.QuantizedLinear):
|
||||||
|
bias = "bias" in m
|
||||||
|
weight = m.weight
|
||||||
|
weight = mx.dequantize(
|
||||||
|
weight,
|
||||||
|
m.scales,
|
||||||
|
m.biases,
|
||||||
|
m.group_size,
|
||||||
|
m.bits,
|
||||||
|
).astype(mx.float16)
|
||||||
|
output_dims, input_dims = weight.shape
|
||||||
|
linear = nn.Linear(input_dims, output_dims, bias=bias)
|
||||||
|
linear.weight = weight
|
||||||
|
if bias:
|
||||||
|
linear.bias = m.bias
|
||||||
|
de_quantize_layers.append((n, linear))
|
||||||
|
if len(de_quantize_layers) > 0:
|
||||||
|
model.update_modules(tree_unflatten(de_quantize_layers))
|
||||||
return model
|
return model
|
||||||
|
@ -24,7 +24,7 @@ MODEL_MAPPING = {
|
|||||||
"qwen": qwen,
|
"qwen": qwen,
|
||||||
"plamo": plamo,
|
"plamo": plamo,
|
||||||
}
|
}
|
||||||
MAX_FILE_SIZE_GB = 15
|
MAX_FILE_SIZE_GB = 5
|
||||||
|
|
||||||
linear_class_predicate = (
|
linear_class_predicate = (
|
||||||
lambda m: isinstance(m, nn.Linear)
|
lambda m: isinstance(m, nn.Linear)
|
||||||
|
Loading…
Reference in New Issue
Block a user