BPE stream without space (#1154)

* BPE streaming detokenization without space

* version bump
This commit is contained in:
Awni Hannun 2024-12-12 13:13:50 -08:00 committed by GitHub
parent 2ba0e36683
commit 9f2ea5892e
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 10 additions and 12 deletions

View File

@@ -1,3 +1,3 @@
# Copyright © 2023-2024 Apple Inc.
__version__ = "0.20.2"
__version__ = "0.20.4"

View File

@@ -195,18 +195,16 @@ class BPEStreamingDetokenizer(StreamingDetokenizer):
self.tokens.append(token)
v = self.tokenmap[token]
is_added = token in self._added_ids
if is_added or self._byte_decoder[v[0]] == 32:
current_text = bytearray(
self._byte_decoder[c] for c in self._unflushed
).decode("utf-8", "replace")
self.text += self._maybe_trim_space(current_text)
if is_added:
self.text += v
self._unflushed = ""
else:
self._unflushed = v
else:
if not is_added:
self._unflushed += v
text = bytearray(self._byte_decoder[c] for c in self._unflushed).decode(
"utf-8", "replace"
)
if is_added:
text += v
if not text.endswith("\ufffd"):
self.text += self._maybe_trim_space(text)
self._unflushed = ""
def finalize(self):
current_text = bytearray(self._byte_decoder[c] for c in self._unflushed).decode(