Mirror of https://github.com/ml-explore/mlx-examples.git (synced 2025-09-01 12:49:50 +08:00)
feat(mlx-lm): add de-quant for fuse.py (#365)
* feat(mlx-lm): add de-quant for fuse
* chore: disable quantization in to_linear when de-quant is enabled
* chore: add better error handling when the adapter file is not found
llms/mlx_lm/fuse.py

@@ -8,7 +8,7 @@ from typing import Any, Dict, Union
 from mlx.utils import tree_flatten, tree_unflatten
 
 from .tuner.lora import LoRALinear
-from .tuner.utils import apply_lora_layers
+from .tuner.utils import apply_lora_layers, dequantize
 from .utils import fetch_from_hub, get_model_path, save_weights, upload_to_hub
 
 
@@ -42,6 +42,11 @@ def parse_arguments() -> argparse.Namespace:
         type=str,
         default=None,
     )
+    parser.add_argument(
+        "--de-quantize",
+        help="Generate a de-quantized model.",
+        action="store_true",
+    )
     return parser.parse_args()
 
 
@@ -54,6 +59,7 @@ def main() -> None:
 
     model.freeze()
     model = apply_lora_layers(model, args.adapter_file)
+
     fused_linears = [
         (n, m.to_linear())
         for n, m in model.named_modules()
@@ -61,6 +67,11 @@ def main() -> None:
     ]
 
     model.update_modules(tree_unflatten(fused_linears))
+
+    if args.de_quantize:
+        print("De-quantizing model")
+        model = dequantize(model)
+
     weights = dict(tree_flatten(model.parameters()))
 
     save_path = Path(args.save_path)
@@ -73,6 +84,9 @@ def main() -> None:
 
     tokenizer.save_pretrained(save_path)
 
+    if args.de_quantize:
+        config.pop("quantization", None)
+
     with open(save_path / "config.json", "w") as fid:
         json.dump(config, fid, indent=4)
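For context: `dequantize` is imported here from `.tuner.utils`, but its body is not part of this diff. Below is a minimal sketch of what such a helper could look like, assuming MLX's `nn.QuantizedLinear` (with `weight`, `scales`, `biases`, `group_size`, and `bits` attributes) and `mx.dequantize`; it reuses the same `update_modules(tree_unflatten(...))` module-swapping pattern that fuse.py applies to the fused LoRA layers. Everything beyond what the diff shows is an assumption, not the actual implementation.

```python
# Hypothetical sketch of the dequantize helper imported from .tuner.utils;
# the real implementation is not shown in this diff.
import mlx.core as mx
import mlx.nn as nn
from mlx.utils import tree_unflatten


def dequantize(model: nn.Module) -> nn.Module:
    """Replace every QuantizedLinear in `model` with an equivalent nn.Linear."""
    de_quantize_layers = []
    for name, module in model.named_modules():
        if isinstance(module, nn.QuantizedLinear):
            # Expand the packed low-bit weights back to floating point.
            weight = mx.dequantize(
                module.weight,
                module.scales,
                module.biases,
                module.group_size,
                module.bits,
            )
            output_dims, input_dims = weight.shape
            has_bias = "bias" in module
            linear = nn.Linear(input_dims, output_dims, bias=has_bias)
            linear.weight = weight
            if has_bias:
                linear.bias = module.bias
            de_quantize_layers.append((name, linear))
    if de_quantize_layers:
        # Same module-swapping pattern fuse.py uses for the fused LoRA layers.
        model.update_modules(tree_unflatten(de_quantize_layers))
    return model
```

With the flag in place, a fused adapter could then be exported in full precision with something like `python -m mlx_lm.fuse --adapter-file adapters.npz --save-path fused_model --de-quantize` (flag names other than `--de-quantize` are inferred from the `args.adapter_file` and `args.save_path` references in the surrounding code). Popping the `"quantization"` key from `config.json` keeps the saved weights from being re-interpreted as quantized when the model is reloaded.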