From 0007b019d9480d5d0de3e8ca853ef771e9c40e4b Mon Sep 17 00:00:00 2001
From: Awni Hannun
Date: Tue, 17 Dec 2024 09:45:42 -0800
Subject: [PATCH] no lag bpe

---
 llms/mlx_lm/tokenizer_utils.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/llms/mlx_lm/tokenizer_utils.py b/llms/mlx_lm/tokenizer_utils.py
index 2b80c349..ca3d6c06 100644
--- a/llms/mlx_lm/tokenizer_utils.py
+++ b/llms/mlx_lm/tokenizer_utils.py
@@ -127,23 +127,23 @@ class SPMStreamingDetokenizer(StreamingDetokenizer):
         self.text = ""
         self.tokens = []
 
-    def _flush(self):
+    def _try_flush(self, force=False):
         text = self._unflushed.replace(self._sep, b" ").decode("utf-8", "replace")
+        if not force and text.endswith("\ufffd"):
+            return
         if not self.text and self.trim_space and text and text[0] == " ":
             text = text[1:]
         self.text += text
+        self._unflushed = b""
 
     def add_token(self, token):
         self.tokens.append(token)
         v = self.tokenmap[token]
-        if v.startswith(self._sep):
-            self._flush()
-            self._unflushed = v
-        else:
-            self._unflushed += v
+        self._unflushed += v
+        self._try_flush()
 
     def finalize(self):
-        self._flush()
+        self._try_flush(force=True)
         self._unflushed = b""
 
 