Quantize embedding / Update quantize API (#680)

* more async eval
* quantize embedding / update quantize api
* more updates for quantize
* update for quantize embeddings
* update sd quant API
* update sdxl quants
* error for datasets < batch_size
* async
* fix config loading
* fix quant
* fix tests
* fix req
* remove lm head if tie weights is true
* fix test
2025-10-23 05:58:07 +08:00 · 2024-04-18 18:16:10 -07:00
parent f5f189e48a
commit 2146bcd7ee
28 changed files with 108 additions and 190 deletions
--- a/llms/mixtral/convert.py
+++ b/llms/mixtral/convert.py
@@ -60,13 +60,10 @@ def quantize(weights, config, args):
    model.update(all_weights)

    # Quantize the model:
-    nn.QuantizedLinear.quantize_module(
+    nn.quantize(
        model,
        args.q_group_size,
        args.q_bits,
-        # TODO: Quantize gate matrices when < 32 tiles supported
-        linear_class_predicate=lambda m: isinstance(m, nn.Linear)
-        and m.weight.shape[0] != 8,
    )

    # Extract the subset of quantized weights:
--- a/llms/mixtral/mixtral.py
+++ b/llms/mixtral/mixtral.py
@@ -217,11 +217,7 @@ def load_model(folder: str):
    weights = tree_unflatten(list(weights.items()))
    model = Mixtral(model_args)
    if quantization is not None:
-        # TODO: Quantize gate matrices when < 32 tiles supported
-        quantization["linear_class_predicate"] = (
-            lambda m: isinstance(m, nn.Linear) and m.weight.shape[0] != 8
-        )
-        nn.QuantizedLinear.quantize_module(model, **quantization)
+        nn.quantize(model, **quantization)

    model.update(weights)
    return model, tokenizer
--- a/llms/mixtral/requirements.txt
+++ b/llms/mixtral/requirements.txt
@@ -1,4 +1,4 @@
-mlx>=0.8.0
+mlx>=0.11.0
 sentencepiece
 torch
 numpy