Move lora example to use the same model format / conversion as hf_llm (#252)

* Move the lora example to the Hugging Face format to allow more models

* fixes

* comments

* more readme nits

* fusion + works better for QLoRA

* nits

* comments
Authored by Awni Hannun on 2024-01-09 11:14:52 -08:00, committed by GitHub
parent bbd7172eef
commit 7b258f33ac
10 changed files with 521 additions and 224 deletions


@@ -60,7 +60,7 @@ You can convert (change the data type or quantize) models using the
 `convert.py` script. This script takes a Hugging Face repo as input and outputs
 a model directory (which you can optionally also upload to Hugging Face).

-For example, to make 4-bit quantized a model, run:
+For example, to make a 4-bit quantized model, run:

 ```
 python convert.py --hf-path <hf_repo> -q
@@ -73,5 +73,5 @@ python convert.py --help
 ```

 You can upload new models to the [Hugging Face MLX
-Community](https://huggingface.co/mlx-community) by specifying `--upload-name``
+Community](https://huggingface.co/mlx-community) by specifying `--upload-name`
 to `convert.py`.
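
Read together, these README hunks document a flow along the following lines. The model and upload names below are hypothetical placeholders; only the `--hf-path`, `-q`, and `--upload-name` flags come from the text above:

```
# Hypothetical end-to-end conversion: quantize to 4 bits and upload.
python convert.py --hf-path mistralai/Mistral-7B-v0.1 -q \
    --upload-name mlx-community/Mistral-7B-v0.1-4bit
```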


@@ -39,7 +39,6 @@ def generate(
             tic = time.time()

         tokens.append(token.item())
-        # if (n + 1) % 10 == 0:
         s = tokenizer.decode(tokens)
         print(s[skip:], end="", flush=True)
         skip = len(s)
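
The removed line was a leftover commented-out counter check in the streaming output loop. As a self-contained sketch of the incremental-decode pattern the surrounding context lines implement (the function name and arguments here are stand-ins, not the example's actual API):

```python
def stream_decode(token_stream, tokenizer):
    """Print generated text incrementally as token ids arrive.

    Re-decoding the full token list each step and printing only the
    not-yet-printed suffix avoids emitting broken partial characters
    that per-token decoding can produce.
    """
    tokens = []
    skip = 0  # number of characters already printed
    for token in token_stream:
        tokens.append(token)
        s = tokenizer.decode(tokens)
        print(s[skip:], end="", flush=True)
        skip = len(s)
    print()
```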


@@ -10,7 +10,6 @@ from typing import Dict, Optional, Tuple, Union

 import mlx.core as mx
 import mlx.nn as nn
 from huggingface_hub import snapshot_download
-from mlx.utils import tree_unflatten
 from transformers import AutoTokenizer
@@ -250,9 +249,7 @@ def load(path_or_hf_repo: str):
     model.load_weights(list(weights.items()))

     mx.eval(model.parameters())

-    tokenizer = AutoTokenizer.from_pretrained(
-        model_path,
-    )
+    tokenizer = AutoTokenizer.from_pretrained(model_path)
     return model, tokenizer
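
Pieced together from the context lines, the simplified load path is roughly the sketch below. The weights file name, the `snapshot_download` call, and passing in a pre-built `model` are assumptions made to keep the sketch self-contained; the real `load` builds the model itself from its saved config:

```python
from pathlib import Path

import mlx.core as mx
import mlx.nn as nn
from huggingface_hub import snapshot_download
from transformers import AutoTokenizer


def load(path_or_hf_repo: str, model: nn.Module):
    # Treat the argument as a local directory first; otherwise fetch the
    # repo from the Hugging Face Hub (assumed behavior, not shown above).
    model_path = Path(path_or_hf_repo)
    if not model_path.exists():
        model_path = Path(snapshot_download(repo_id=path_or_hf_repo))

    # "weights.npz" is an assumed file name for the converted weights.
    weights = mx.load(str(model_path / "weights.npz"))
    model.load_weights(list(weights.items()))
    mx.eval(model.parameters())  # force evaluation of the lazy parameters

    tokenizer = AutoTokenizer.from_pretrained(model_path)
    return model, tokenizer
```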