mirror of
https://github.com/ml-explore/mlx-examples.git
synced 2025-07-18 00:11:14 +08:00
Bpe stream without space (#1154)
* BPE streaming detokenization without space
* version bump
This commit is contained in:
parent
2ba0e36683
commit
9f2ea5892e
@@ -1,3 +1,3 @@
|
||||
# Copyright © 2023-2024 Apple Inc.
|
||||
|
||||
__version__ = "0.20.2"
|
||||
__version__ = "0.20.4"
|
||||
|
@@ -195,18 +195,16 @@ class BPEStreamingDetokenizer(StreamingDetokenizer):
|
||||
self.tokens.append(token)
|
||||
v = self.tokenmap[token]
|
||||
is_added = token in self._added_ids
|
||||
if is_added or self._byte_decoder[v[0]] == 32:
|
||||
current_text = bytearray(
|
||||
self._byte_decoder[c] for c in self._unflushed
|
||||
).decode("utf-8", "replace")
|
||||
self.text += self._maybe_trim_space(current_text)
|
||||
if is_added:
|
||||
self.text += v
|
||||
self._unflushed = ""
|
||||
else:
|
||||
self._unflushed = v
|
||||
else:
|
||||
if not is_added:
|
||||
self._unflushed += v
|
||||
text = bytearray(self._byte_decoder[c] for c in self._unflushed).decode(
|
||||
"utf-8", "replace"
|
||||
)
|
||||
if is_added:
|
||||
text += v
|
||||
if not text.endswith("\ufffd"):
|
||||
self.text += self._maybe_trim_space(text)
|
||||
self._unflushed = ""
|
||||
|
||||
def finalize(self):
|
||||
current_text = bytearray(self._byte_decoder[c] for c in self._unflushed).decode(
|
||||
|
Loading…
Reference in New Issue
Block a user