BPE streaming detokenization without space

This commit is contained in:
Awni Hannun 2024-12-12 12:33:36 -08:00
parent 2ba0e36683
commit e2ba4ceaab

View File

@ -195,18 +195,16 @@ class BPEStreamingDetokenizer(StreamingDetokenizer):
self.tokens.append(token)
v = self.tokenmap[token]
is_added = token in self._added_ids
if is_added or self._byte_decoder[v[0]] == 32:
current_text = bytearray(
self._byte_decoder[c] for c in self._unflushed
).decode("utf-8", "replace")
self.text += self._maybe_trim_space(current_text)
if is_added:
self.text += v
self._unflushed = ""
else:
self._unflushed = v
else:
if not is_added:
self._unflushed += v
text = bytearray(self._byte_decoder[c] for c in self._unflushed).decode(
"utf-8", "replace"
)
if is_added:
text += v
if not text.endswith("\ufffd"):
self.text += self._maybe_trim_space(text)
self._unflushed = ""
def finalize(self):
current_text = bytearray(self._byte_decoder[c] for c in self._unflushed).decode(