From 6df285ef6c336023872f2c3c3a82c868e90d206f Mon Sep 17 00:00:00 2001
From: Chime Ogbuji
Date: Fri, 6 Dec 2024 07:20:01 -0500
Subject: [PATCH] Synch use of special tokens with iterate_batches

---
 llms/mlx_lm/tuner/trainer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llms/mlx_lm/tuner/trainer.py b/llms/mlx_lm/tuner/trainer.py
index a8a28256..99cab169 100644
--- a/llms/mlx_lm/tuner/trainer.py
+++ b/llms/mlx_lm/tuner/trainer.py
@@ -166,7 +166,7 @@ def iterate_completion_batches(
         for j in batch_idx[i]:
             prompt, completion = dataset.get_prompt_and_completion(j)
             prompt_lengths.append(input_length(prompt, completion, tokenizer))
-            full_sequence = tokenizer.encode(dataset[j], add_special_tokens=False)
+            full_sequence = tokenizer.encode(dataset[j])
             if full_sequence[-1] != tokenizer.eos_token_id:
                 full_sequence.append(tokenizer.eos_token_id)
             batch.append(full_sequence)
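
For context, the one-line change makes this code path use the tokenizer's default special-token handling, matching iterate_batches. Below is a minimal sketch of the behavioral difference, assuming a Hugging Face-style tokenizer (mlx_lm's TokenizerWrapper delegates encode to one); the model name is illustrative, not taken from the patch:

    # Sketch only: illustrates the effect of dropping add_special_tokens=False,
    # assuming a Hugging Face tokenizer. The checkpoint name is a placeholder.
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")

    text = "Q: What is MLX? A: An array framework for Apple silicon."

    # Before the patch: special tokens (e.g. a leading BOS, when the tokenizer
    # defines one) were suppressed here, while iterate_batches left them enabled.
    without_special = tokenizer.encode(text, add_special_tokens=False)

    # After the patch: both iteration paths use the same defaults, so any BOS
    # the tokenizer normally prepends appears consistently in both.
    with_special = tokenizer.encode(text)

    # Both paths still guarantee a trailing EOS, as in the hunk above.
    if with_special[-1] != tokenizer.eos_token_id:
        with_special.append(tokenizer.eos_token_id)

    # Typically differ by one leading BOS id, depending on the tokenizer.
    print(without_special[:3], with_special[:3])

With special tokens handled identically in both iteration paths, the prompt-length bookkeeping done via input_length should line up with the token sequences the model is actually trained on.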