From 3f08dfc762d8b72875f5ac60a9298651634970e1 Mon Sep 17 00:00:00 2001 From: Chime Ogbuji Date: Sun, 10 Nov 2024 10:08:44 -0500 Subject: [PATCH] Don't dupe BOS Ensure completion batching doesn't duplicate the BOS token for instruction-tuned chat models whose tokenizer configuration has ```add_bos_token = True``` (see issue #1095) --- llms/mlx_lm/tuner/trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llms/mlx_lm/tuner/trainer.py b/llms/mlx_lm/tuner/trainer.py index 99cab169..a8a28256 100644 --- a/llms/mlx_lm/tuner/trainer.py +++ b/llms/mlx_lm/tuner/trainer.py @@ -166,7 +166,7 @@ def iterate_completion_batches( for j in batch_idx[i]: prompt, completion = dataset.get_prompt_and_completion(j) prompt_lengths.append(input_length(prompt, completion, tokenizer)) - full_sequence = tokenizer.encode(dataset[j]) + full_sequence = tokenizer.encode(dataset[j], add_special_tokens=False) if full_sequence[-1] != tokenizer.eos_token_id: full_sequence.append(tokenizer.eos_token_id) batch.append(full_sequence)