* rebase with main

This commit is contained in:
Awni Hannun 2024-12-12 13:13:50 -08:00 committed by Billel Mokeddem
parent 1026cc5608
commit 46fd8b7c53
2 changed files with 10 additions and 15 deletions

View File

@ -1,3 +1,3 @@
# Copyright © 2023-2024 Apple Inc.
__version__ = "0.20.2"
__version__ = "0.20.4"

View File

@ -197,21 +197,16 @@ class BPEStreamingDetokenizer(StreamingDetokenizer):
self.tokens.append(token)
v = self.tokenmap[token]
is_added = token in self._added_ids
if is_added or self._byte_decoder[v[0]] == 32:
current_text = bytearray(
self._byte_decoder[c] for c in self._unflushed
).decode("utf-8", "replace")
self.text += self._maybe_trim_space(current_text)
if is_added:
# We need to manually encode and decode the added tokens in case special characters
# used for `\n` / `\t` have been manually added in the added tokens
v = self.tokenizer.decode(self.tokenizer.encode(v))
self.text += v
self._unflushed = ""
else:
self._unflushed = v
else:
if not is_added:
self._unflushed += v
text = bytearray(self._byte_decoder[c] for c in self._unflushed).decode(
"utf-8", "replace"
)
if is_added:
text += v
if not text.endswith("\ufffd"):
self.text += self._maybe_trim_space(text)
self._unflushed = ""
def finalize(self):
current_text = bytearray(self._byte_decoder[c] for c in self._unflushed).decode(