feat(mlx-lm): add de-quant for fuse.py (#365)

* feat(mlx-lm): add de-quant for fuse

* chore: skip re-quantization in to_linear when de-quantization is enabled

* chore: add better error handling for adapter file not found
This commit is contained in:
Anchen 2024-01-26 13:59:32 +11:00 committed by GitHub
parent f51e98fcf1
commit 854ad8747a
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 70 additions and 10 deletions

View File

@ -8,7 +8,7 @@ from typing import Any, Dict, Union
from mlx.utils import tree_flatten, tree_unflatten from mlx.utils import tree_flatten, tree_unflatten
from .tuner.lora import LoRALinear from .tuner.lora import LoRALinear
from .tuner.utils import apply_lora_layers from .tuner.utils import apply_lora_layers, dequantize
from .utils import fetch_from_hub, get_model_path, save_weights, upload_to_hub from .utils import fetch_from_hub, get_model_path, save_weights, upload_to_hub
@ -42,6 +42,11 @@ def parse_arguments() -> argparse.Namespace:
type=str, type=str,
default=None, default=None,
) )
parser.add_argument(
"--de-quantize",
help="Generate a de-quantized model.",
action="store_true",
)
return parser.parse_args() return parser.parse_args()
@ -54,6 +59,7 @@ def main() -> None:
model.freeze() model.freeze()
model = apply_lora_layers(model, args.adapter_file) model = apply_lora_layers(model, args.adapter_file)
fused_linears = [ fused_linears = [
(n, m.to_linear()) (n, m.to_linear())
for n, m in model.named_modules() for n, m in model.named_modules()
@ -61,6 +67,11 @@ def main() -> None:
] ]
model.update_modules(tree_unflatten(fused_linears)) model.update_modules(tree_unflatten(fused_linears))
if args.de_quantize:
print("De-quantizing model")
model = dequantize(model)
weights = dict(tree_flatten(model.parameters())) weights = dict(tree_flatten(model.parameters()))
save_path = Path(args.save_path) save_path = Path(args.save_path)
@ -73,6 +84,9 @@ def main() -> None:
tokenizer.save_pretrained(save_path) tokenizer.save_pretrained(save_path)
if args.de_quantize:
config.pop("quantization", None)
with open(save_path / "config.json", "w") as fid: with open(save_path / "config.json", "w") as fid:
json.dump(config, fid, indent=4) json.dump(config, fid, indent=4)

View File

@ -29,7 +29,7 @@ class LoRALinear(nn.Module):
lora_lin.linear = linear lora_lin.linear = linear
return lora_lin return lora_lin
def to_linear(self): def to_linear(self, de_quantize: bool = False):
linear = self.linear linear = self.linear
bias = "bias" in linear bias = "bias" in linear
weight = linear.weight weight = linear.weight
@ -56,7 +56,7 @@ class LoRALinear(nn.Module):
if bias: if bias:
fused_linear.bias = linear.bias fused_linear.bias = linear.bias
if is_quantized: if is_quantized and not de_quantize:
fused_linear = nn.QuantizedLinear.from_linear( fused_linear = nn.QuantizedLinear.from_linear(
fused_linear, fused_linear,
linear.group_size, linear.group_size,

View File

@ -1,3 +1,5 @@
import os
import mlx.core as mx import mlx.core as mx
import mlx.nn as nn import mlx.nn as nn
from mlx.utils import tree_unflatten from mlx.utils import tree_unflatten
@ -6,18 +8,62 @@ from .lora import LoRALinear
def apply_lora_layers(model: nn.Module, adapter_file: str) -> nn.Module: def apply_lora_layers(model: nn.Module, adapter_file: str) -> nn.Module:
"""
Apply LoRA layers to the model.
Args:
model (nn.Module): The neural network model.
adapter_file (str): Path to the adapter configuration file.
Returns:
nn.Module: The updated model with LoRA layers applied.
"""
if not os.path.exists(adapter_file):
raise FileNotFoundError(f"The adapter file does not exist: {adapter_file}")
adapters = list(mx.load(adapter_file).items()) adapters = list(mx.load(adapter_file).items())
linear_replacements = {}
linear_replacements = []
lora_layers = set( lora_layers = set(
[name.replace(".lora_a", "").replace(".lora_b", "") for name, _ in adapters] [name.replace(".lora_a", "").replace(".lora_b", "") for name, _ in adapters]
) )
for name, module in model.named_modules(): for name, module in model.named_modules():
if name in lora_layers: if name in lora_layers:
replacement_module = LoRALinear.from_linear(module) replacement_module = LoRALinear.from_linear(module)
linear_replacements[name] = replacement_module linear_replacements.append((name, replacement_module))
model.update_modules(tree_unflatten(list(linear_replacements.items()))) model.update_modules(tree_unflatten(linear_replacements))
return model
model.update(tree_unflatten(adapters))
def dequantize(model: nn.Module) -> nn.Module:
    """
    Replace every quantized linear layer in the model with a plain linear layer.

    Each ``nn.QuantizedLinear`` found via ``model.named_modules()`` has its
    weight expanded with ``mx.dequantize`` (using the layer's own scales,
    biases, group size and bit width) and cast to ``mx.float16``. A new
    ``nn.Linear`` carrying that weight (and the original bias, when the
    layer has one) is installed under the same module name.

    Args:
        model (nn.Module): The model that may contain quantized linear layers.

    Returns:
        nn.Module: The same model object, with its quantized linear layers
        swapped out. It is returned unchanged if none were found.
    """
    replacements = []
    for path, layer in model.named_modules():
        # Only quantized linear layers are rebuilt; everything else is kept.
        if not isinstance(layer, nn.QuantizedLinear):
            continue

        has_bias = "bias" in layer
        full_weight = mx.dequantize(
            layer.weight,
            layer.scales,
            layer.biases,
            layer.group_size,
            layer.bits,
        ).astype(mx.float16)

        # The dequantized weight is unpacked as (output_dims, input_dims),
        # while nn.Linear takes its dimensions in (input, output) order.
        out_features, in_features = full_weight.shape
        dense = nn.Linear(in_features, out_features, bias=has_bias)
        dense.weight = full_weight
        if has_bias:
            dense.bias = layer.bias
        replacements.append((path, dense))

    # Skip the module update entirely when there is nothing to replace.
    if replacements:
        model.update_modules(tree_unflatten(replacements))
    return model

View File

@ -24,7 +24,7 @@ MODEL_MAPPING = {
"qwen": qwen, "qwen": qwen,
"plamo": plamo, "plamo": plamo,
} }
MAX_FILE_SIZE_GB = 15 MAX_FILE_SIZE_GB = 5
linear_class_predicate = ( linear_class_predicate = (
lambda m: isinstance(m, nn.Linear) lambda m: isinstance(m, nn.Linear)