nit in readme

2025-09-01 12:49:50 +08:00 · 2024-09-28 10:36:54 -07:00
parent f01dbe89e8
commit bdad0cd4a6
3 changed files with 9 additions and 14 deletions
--- a/llms/mlx_lm/LORA.md
+++ b/llms/mlx_lm/LORA.md
@@ -270,11 +270,13 @@ hf_dataset:
 - Arguments specified in `config` will be passed as keyword arguments to
  [`datasets.load_dataset`](https://huggingface.co/docs/datasets/v2.20.0/en/package_reference/loading_methods#datasets.load_dataset).

-In general, for the `chat`, `tools` and `completions` formats, Hugging Face [chat
-templates](https://huggingface.co/docs/transformers/main/en/chat_templating) are used. This applies
-the model's chat template by default. If the model does not have a chat
-template, then Hugging Face will use a default. For example, the final text in
-the `chat` example above with Hugging Face's default template becomes:
+In general, for the `chat`, `tools` and `completions` formats, Hugging Face
+[chat
+templates](https://huggingface.co/docs/transformers/main/en/chat_templating)
+are used. This applies the model's chat template by default. If the model does
+not have a chat template, then Hugging Face will use a default. For example,
+the final text in the `chat` example above with Hugging Face's default template
+becomes:

 ```text
 <|im_start|>system
--- a/llms/mlx_lm/tuner/datasets.py
+++ b/llms/mlx_lm/tuner/datasets.py
@@ -35,14 +35,9 @@ class ChatDataset(Dataset):

    def __getitem__(self, idx: int):
        messages = self._data[idx]["messages"]
-
-        tools = None
-        if "tools" in self._data[idx]:
-            tools = self._data[idx]["tools"]
-
        text = self._tokenizer.apply_chat_template(
            messages,
-            tools=tools,
+            tools=self._data[idx].get("tools", None),
            tokenize=False,
            add_generation_prompt=True,
        )
--- a/llms/mlx_lm/tuner/trainer.py
+++ b/llms/mlx_lm/tuner/trainer.py
@@ -93,9 +93,7 @@ def iterate_batches(dataset, tokenizer, batch_size, max_seq_length, train=False)
            # Encode batch
            batch = [tokenizer.encode(dataset[j]) for j in batch_idx[i]]
            for b in batch:
-                if b[-1] == tokenizer.eos_token_id:
-                    print("[WARNING] Example already has an EOS token appended")
-                else:
+                if b[-1] != tokenizer.eos_token_id:
                    b.append(tokenizer.eos_token_id)

            lengths = [len(x) for x in batch]