feat(mlx-lm): add de-quant for fuse.py (#365)

* feat(mlx-lm): add de-quant for fuse

* chore: disable quant in to_linear when de-quant enabled

* chore: add better error handling for adapter file not found
This commit is contained in:
Anchen
2024-01-26 13:59:32 +11:00
committed by GitHub
parent f51e98fcf1
commit 854ad8747a
4 changed files with 70 additions and 10 deletions

View File

@@ -29,7 +29,7 @@ class LoRALinear(nn.Module):
lora_lin.linear = linear
return lora_lin
def to_linear(self):
def to_linear(self, de_quantize: bool = False):
linear = self.linear
bias = "bias" in linear
weight = linear.weight
@@ -56,7 +56,7 @@ class LoRALinear(nn.Module):
if bias:
fused_linear.bias = linear.bias
if is_quantized:
if is_quantized and not de_quantize:
fused_linear = nn.QuantizedLinear.from_linear(
fused_linear,
linear.group_size,