LoRA: support tools (function calling) format datasets (#995)

* LoRA: support fine-tuning tools datasets

* LoRA: Split small function

* LoRA: add tools format to lora docs

* LoRA: pre-commit fix

* Revert "LoRA: pre-commit fix"

This reverts commit b94b7e0fe7.

* Revert "LoRA: Split small function"

This reverts commit 3f6a5f19fd.

* LoRA: remove ToolsDataset

In a JSONL file, not every entry is required to include a tools value (an illustrative example follows the commit metadata below).

* nit in readme

* nit in readme

* nit in readme

---------

Co-authored-by: Awni Hannun <awni@apple.com>
madroid authored on 2024-09-29 01:41:36 +08:00, committed by GitHub
parent ace2bb5890
commit 7ec2021bb9
3 changed files with 66 additions and 11 deletions
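
For context, a chat example in the training JSONL can now carry an optional tools list next to its messages. The snippet below is only an illustrative sketch of such an entry, assuming an OpenAI-style function-calling schema; the field names inside the tool definition are not taken from this commit.

import json

# Hypothetical training example; the "tools" key is optional per line.
example = {
    "messages": [
        {"role": "user", "content": "What is the weather in Paris?"},
    ],
    "tools": [
        {
            "type": "function",
            "function": {
                "name": "get_current_weather",  # assumed tool name
                "description": "Get the current weather for a location.",
                "parameters": {
                    "type": "object",
                    "properties": {"location": {"type": "string"}},
                    "required": ["location"],
                },
            },
        }
    ],
}

# Each line of the .jsonl file holds one such JSON object.
print(json.dumps(example))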


@@ -36,7 +36,10 @@ class ChatDataset(Dataset):
     def __getitem__(self, idx: int):
         messages = self._data[idx]["messages"]
         text = self._tokenizer.apply_chat_template(
-            messages, tokenize=False, add_generation_prompt=True
+            messages,
+            tools=self._data[idx].get("tools", None),
+            tokenize=False,
+            add_generation_prompt=True,
         )
         return text
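
A minimal sketch of what the new code path does, assuming a Hugging Face tokenizer whose chat template understands a tools argument (the model id below is only an example): because the entry is read with .get("tools", None), examples without a tools key keep rendering exactly as before.

from transformers import AutoTokenizer

# Example model id (assumption); any tokenizer with a tool-aware chat template works the same way.
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.3")

entry = {"messages": [{"role": "user", "content": "What is the weather in Paris?"}]}

# Mirrors the updated __getitem__: a missing "tools" key falls back to None,
# so plain chat examples are formatted exactly as before the change.
text = tokenizer.apply_chat_template(
    entry["messages"],
    tools=entry.get("tools", None),
    tokenize=False,
    add_generation_prompt=True,
)
print(text)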


@@ -93,9 +93,7 @@ def iterate_batches(dataset, tokenizer, batch_size, max_seq_length, train=False):
             # Encode batch
             batch = [tokenizer.encode(dataset[j]) for j in batch_idx[i]]
             for b in batch:
-                if b[-1] == tokenizer.eos_token_id:
-                    print("[WARNING] Example already has an EOS token appended")
-                else:
+                if b[-1] != tokenizer.eos_token_id:
                     b.append(tokenizer.eos_token_id)
             lengths = [len(x) for x in batch]
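
The trainer change replaces the old warning branch with an idempotent append: an EOS token is added only when the encoded example does not already end with one, so every sequence ends with exactly one EOS. A standalone sketch, using a made-up EOS id in place of tokenizer.eos_token_id:

EOS_TOKEN_ID = 2  # assumed value; the real code uses tokenizer.eos_token_id

def append_eos(tokens):
    # Append EOS only if it is not already the last token.
    if tokens[-1] != EOS_TOKEN_ID:
        tokens.append(EOS_TOKEN_ID)
    return tokens

print(append_eos([5, 9, 7]))     # [5, 9, 7, 2]
print(append_eos([5, 9, 7, 2]))  # [5, 9, 7, 2]  (unchanged, no warning printed)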