Mirror of https://github.com/ml-explore/mlx-examples.git (synced 2025-09-01 04:14:38 +08:00)
LoRA: support tools (function calling) format datasets (#995)
* LoRA: support fine-tuning tools datasets
* LoRA: Split small function
* LoRA: add tools format to lora docs
* LoRA: pre-commit fix
* Revert "LoRA: pre-commit fix"
  This reverts commit b94b7e0fe7.
* Revert "LoRA: Split small function"
  This reverts commit 3f6a5f19fd.
* LoRA: remove ToolsDataset
  In a JSONL file, not every entry is required to include a tools value.
* nit in readme
* nit in readme
* nit in readme

---------

Co-authored-by: Awni Hannun <awni@apple.com>
@@ -36,7 +36,10 @@ class ChatDataset(Dataset):
     def __getitem__(self, idx: int):
         messages = self._data[idx]["messages"]
         text = self._tokenizer.apply_chat_template(
-            messages, tokenize=False, add_generation_prompt=True
+            messages,
+            tools=self._data[idx].get("tools", None),
+            tokenize=False,
+            add_generation_prompt=True,
         )
         return text

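The change above forwards each entry's optional tools list to the chat template; entries without one fall back to None and render as a plain chat prompt. A minimal sketch of the resulting call, assuming a Hugging Face tokenizer whose chat template accepts the tools argument (the model name and entry are illustrative):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.3")

entry = {
    "messages": [{"role": "user", "content": "What is the weather in Paris?"}],
    # "tools" may be absent; .get() then passes None and the template
    # renders an ordinary chat prompt without a tools section.
}

text = tokenizer.apply_chat_template(
    entry["messages"],
    tools=entry.get("tools", None),
    tokenize=False,
    add_generation_prompt=True,
)
print(text)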
@@ -93,9 +93,7 @@ def iterate_batches(dataset, tokenizer, batch_size, max_seq_length, train=False)
         # Encode batch
         batch = [tokenizer.encode(dataset[j]) for j in batch_idx[i]]
         for b in batch:
-            if b[-1] == tokenizer.eos_token_id:
-                print("[WARNING] Example already has an EOS token appended")
-            else:
+            if b[-1] != tokenizer.eos_token_id:
                 b.append(tokenizer.eos_token_id)

         lengths = [len(x) for x in batch]
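This hunk drops the warning for examples that already end with EOS and appends the token only when it is missing. A standalone sketch of the same idempotent append (the helper name is hypothetical):

def append_eos(token_ids, eos_token_id):
    # Append EOS only if the example does not already end with it,
    # so examples pre-terminated by a chat template are left untouched.
    if token_ids and token_ids[-1] != eos_token_id:
        token_ids.append(eos_token_id)
    return token_ids

assert append_eos([1, 2, 3], 0) == [1, 2, 3, 0]
assert append_eos([1, 2, 3, 0], 0) == [1, 2, 3, 0]  # already terminated: unchanged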