From 76fac6eee069507b377e7e26f89ff41cf0a751f3 Mon Sep 17 00:00:00 2001 From: Billel Mokeddem Date: Thu, 5 Dec 2024 22:56:49 +0400 Subject: [PATCH] Add a fix for special added tokens --- llms/mlx_lm/tokenizer_utils.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/llms/mlx_lm/tokenizer_utils.py b/llms/mlx_lm/tokenizer_utils.py index 9d390733..9333ce94 100644 --- a/llms/mlx_lm/tokenizer_utils.py +++ b/llms/mlx_lm/tokenizer_utils.py @@ -167,6 +167,8 @@ class BPEStreamingDetokenizer(StreamingDetokenizer): def __init__(self, tokenizer): + self.tokenizer = tokenizer + self.clean_spaces = tokenizer.clean_up_tokenization_spaces # Extract the tokens in a list from id to text @@ -208,6 +210,9 @@ class BPEStreamingDetokenizer(StreamingDetokenizer): ).decode("utf-8") self.text += self._maybe_trim_space(current_text) if is_added: + # We need to manually encode and decode the added tokens in case special characters + # used for `\n` / `\t` have been manually added in the added tokens + v = self.tokenizer.decode(self.tokenizer.encode(v)) self.text += v self._unflushed = "" else: