Mirror of https://github.com/ml-explore/mlx-examples.git, synced 2025-09-01 12:49:50 +08:00
nit in readme
@@ -270,11 +270,13 @@ hf_dataset:
 - Arguments specified in `config` will be passed as keyword arguments to
   [`datasets.load_dataset`](https://huggingface.co/docs/datasets/v2.20.0/en/package_reference/loading_methods#datasets.load_dataset).
 
-In general, for the `chat`, `tools` and `completions` formats, Hugging Face [chat
-templates](https://huggingface.co/docs/transformers/main/en/chat_templating) are used. This applies
-the model's chat template by default. If the model does not have a chat
-template, then Hugging Face will use a default. For example, the final text in
-the `chat` example above with Hugging Face's default template becomes:
+In general, for the `chat`, `tools` and `completions` formats, Hugging Face
+[chat
+templates](https://huggingface.co/docs/transformers/main/en/chat_templating)
+are used. This applies the model's chat template by default. If the model does
+not have a chat template, then Hugging Face will use a default. For example,
+the final text in the `chat` example above with Hugging Face's default template
+becomes:
 
 ```text
 <|im_start|>system
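For context, a minimal sketch of how that rendered text is produced. This assumes a model whose chat template is ChatML-style, matching the `<|im_start|>` output shown above; the model name here is only an illustrative stand-in:

```python
from transformers import AutoTokenizer

# Illustrative model choice; any model with a ChatML-style chat template
# renders messages into the <|im_start|>... format shown in the README.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct")

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Hello."},
]

# tokenize=False returns the rendered template as a plain string,
# e.g. "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n..."
text = tokenizer.apply_chat_template(messages, tokenize=False)
print(text)
```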
@@ -35,14 +35,9 @@ class ChatDataset(Dataset):
 
     def __getitem__(self, idx: int):
         messages = self._data[idx]["messages"]
-
-        tools = None
-        if "tools" in self._data[idx]:
-            tools = self._data[idx]["tools"]
-
         text = self._tokenizer.apply_chat_template(
             messages,
-            tools=tools,
+            tools=self._data[idx].get("tools", None),
             tokenize=False,
             add_generation_prompt=True,
         )
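A self-contained sketch of why this refactor is behavior-preserving; the sample record below is made up:

```python
record = {"messages": [{"role": "user", "content": "hi"}]}  # no "tools" key

# Old pattern: explicit membership test and assignment over three lines.
tools = None
if "tools" in record:
    tools = record["tools"]

# New pattern: dict.get yields the same None default in one expression.
assert tools is None
assert record.get("tools", None) is None  # both paths agree when the key is absent
```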
@@ -93,9 +93,7 @@ def iterate_batches(dataset, tokenizer, batch_size, max_seq_length, train=False)
         # Encode batch
         batch = [tokenizer.encode(dataset[j]) for j in batch_idx[i]]
         for b in batch:
-            if b[-1] == tokenizer.eos_token_id:
-                print("[WARNING] Example already has an EOS token appended")
-            else:
+            if b[-1] != tokenizer.eos_token_id:
                 b.append(tokenizer.eos_token_id)
 
         lengths = [len(x) for x in batch]