LoRA: support tools (function calling) format datasets (#995)

* LoRA: support fine-tuning tools datasets

* LoRA: Split small function

* LoRA: add tools format to lora docs

* LoRA: pre-commit fix

* Revert "LoRA: pre-commit fix"

This reverts commit b94b7e0fe7.

* Revert "LoRA: Split small function"

This reverts commit 3f6a5f19fd.

* LoRA: remove ToolsDataset

In a JSONL file, not every entry is required to include a tools value (an illustrative example follows the commit metadata below).

* nit in readme

* nit in readme

* nit in readme

---------

Co-authored-by: Awni Hannun <awni@apple.com>
madroid authored on 2024-09-29 01:41:36 +08:00, committed by GitHub
parent ace2bb5890
commit 7ec2021bb9
3 changed files with 66 additions and 11 deletions
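
For context, a chat example in the training JSONL can now carry an optional tools list next to its messages. The snippet below is only an illustrative sketch of such an entry, assuming an OpenAI-style function-calling schema; the field names inside the tool definition are not taken from this commit.

import json

# Hypothetical training example; the "tools" key is optional per line.
example = {
    "messages": [
        {"role": "user", "content": "What is the weather in Paris?"},
    ],
    "tools": [
        {
            "type": "function",
            "function": {
                "name": "get_current_weather",  # assumed tool name
                "description": "Get the current weather for a location.",
                "parameters": {
                    "type": "object",
                    "properties": {"location": {"type": "string"}},
                    "required": ["location"],
                },
            },
        }
    ],
}

# Each line of the .jsonl file holds one such JSON object.
print(json.dumps(example))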


@@ -36,7 +36,10 @@ class ChatDataset(Dataset):
     def __getitem__(self, idx: int):
         messages = self._data[idx]["messages"]
         text = self._tokenizer.apply_chat_template(
-            messages, tokenize=False, add_generation_prompt=True
+            messages,
+            tools=self._data[idx].get("tools", None),
+            tokenize=False,
+            add_generation_prompt=True,
         )
         return text
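
A minimal sketch of what the new code path does, assuming a Hugging Face tokenizer whose chat template understands a tools argument (the model id below is only an example): because the entry is read with .get("tools", None), examples without a tools key keep rendering exactly as before.

from transformers import AutoTokenizer

# Example model id (assumption); any tokenizer with a tool-aware chat template works the same way.
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.3")

entry = {"messages": [{"role": "user", "content": "What is the weather in Paris?"}]}

# Mirrors the updated __getitem__: a missing "tools" key falls back to None,
# so plain chat examples are formatted exactly as before the change.
text = tokenizer.apply_chat_template(
    entry["messages"],
    tools=entry.get("tools", None),
    tokenize=False,
    add_generation_prompt=True,
)
print(text)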


@@ -93,9 +93,7 @@ def iterate_batches(dataset, tokenizer, batch_size, max_seq_length, train=False):
             # Encode batch
             batch = [tokenizer.encode(dataset[j]) for j in batch_idx[i]]
             for b in batch:
-                if b[-1] == tokenizer.eos_token_id:
-                    print("[WARNING] Example already has an EOS token appended")
-                else:
+                if b[-1] != tokenizer.eos_token_id:
                     b.append(tokenizer.eos_token_id)
             lengths = [len(x) for x in batch]
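
The trainer change replaces the old warning branch with an idempotent append: an EOS token is added only when the encoded example does not already end with one, so every sequence ends with exactly one EOS. A standalone sketch, using a made-up EOS id in place of tokenizer.eos_token_id:

EOS_TOKEN_ID = 2  # assumed value; the real code uses tokenizer.eos_token_id

def append_eos(tokens):
    # Append EOS only if it is not already the last token.
    if tokens[-1] != EOS_TOKEN_ID:
        tokens.append(EOS_TOKEN_ID)
    return tokens

print(append_eos([5, 9, 7]))     # [5, 9, 7, 2]
print(append_eos([5, 9, 7, 2]))  # [5, 9, 7, 2]  (unchanged, no warning printed)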