From 46fd8b7c53300cdab0e1f1a2ad33e44d282851d3 Mon Sep 17 00:00:00 2001
From: Awni Hannun
Date: Thu, 12 Dec 2024 13:13:50 -0800
Subject: [PATCH] * rebase with main

---
 llms/mlx_lm/_version.py        |  2 +-
 llms/mlx_lm/tokenizer_utils.py | 23 +++++++++--------------
 2 files changed, 10 insertions(+), 15 deletions(-)

diff --git a/llms/mlx_lm/_version.py b/llms/mlx_lm/_version.py
index 0f885fba..3af2d5fd 100644
--- a/llms/mlx_lm/_version.py
+++ b/llms/mlx_lm/_version.py
@@ -1,3 +1,3 @@
 # Copyright © 2023-2024 Apple Inc.
 
-__version__ = "0.20.2"
+__version__ = "0.20.4"
diff --git a/llms/mlx_lm/tokenizer_utils.py b/llms/mlx_lm/tokenizer_utils.py
index 563f0052..a28b4452 100644
--- a/llms/mlx_lm/tokenizer_utils.py
+++ b/llms/mlx_lm/tokenizer_utils.py
@@ -197,21 +197,16 @@ class BPEStreamingDetokenizer(StreamingDetokenizer):
         self.tokens.append(token)
         v = self.tokenmap[token]
         is_added = token in self._added_ids
-        if is_added or self._byte_decoder[v[0]] == 32:
-            current_text = bytearray(
-                self._byte_decoder[c] for c in self._unflushed
-            ).decode("utf-8", "replace")
-            self.text += self._maybe_trim_space(current_text)
-            if is_added:
-                # We need to manually encode and decode the added tokens in case special characters
-                # used for `\n` / `\t` have been manually added in the added tokens
-                v = self.tokenizer.decode(self.tokenizer.encode(v))
-                self.text += v
-                self._unflushed = ""
-            else:
-                self._unflushed = v
-        else:
+        if not is_added:
             self._unflushed += v
+        text = bytearray(self._byte_decoder[c] for c in self._unflushed).decode(
+            "utf-8", "replace"
+        )
+        if is_added:
+            text += v
+        if not text.endswith("\ufffd"):
+            self.text += self._maybe_trim_space(text)
+            self._unflushed = ""
 
     def finalize(self):
         current_text = bytearray(self._byte_decoder[c] for c in self._unflushed).decode(