mirror of
https://github.com/ml-explore/mlx-examples.git
synced 2025-07-18 16:31:12 +08:00
Bpe stream without space (#1154)
* bpe streaming detokenization without space
* version bump
This commit is contained in:
parent
2ba0e36683
commit
9f2ea5892e
@@ -1,3 +1,3 @@
|
|||||||
# Copyright © 2023-2024 Apple Inc.
|
# Copyright © 2023-2024 Apple Inc.
|
||||||
|
|
||||||
__version__ = "0.20.2"
|
__version__ = "0.20.4"
|
||||||
|
@@ -195,18 +195,16 @@ class BPEStreamingDetokenizer(StreamingDetokenizer):
|
|||||||
self.tokens.append(token)
|
self.tokens.append(token)
|
||||||
v = self.tokenmap[token]
|
v = self.tokenmap[token]
|
||||||
is_added = token in self._added_ids
|
is_added = token in self._added_ids
|
||||||
if is_added or self._byte_decoder[v[0]] == 32:
|
if not is_added:
|
||||||
current_text = bytearray(
|
|
||||||
self._byte_decoder[c] for c in self._unflushed
|
|
||||||
).decode("utf-8", "replace")
|
|
||||||
self.text += self._maybe_trim_space(current_text)
|
|
||||||
if is_added:
|
|
||||||
self.text += v
|
|
||||||
self._unflushed = ""
|
|
||||||
else:
|
|
||||||
self._unflushed = v
|
|
||||||
else:
|
|
||||||
self._unflushed += v
|
self._unflushed += v
|
||||||
|
text = bytearray(self._byte_decoder[c] for c in self._unflushed).decode(
|
||||||
|
"utf-8", "replace"
|
||||||
|
)
|
||||||
|
if is_added:
|
||||||
|
text += v
|
||||||
|
if not text.endswith("\ufffd"):
|
||||||
|
self.text += self._maybe_trim_space(text)
|
||||||
|
self._unflushed = ""
|
||||||
|
|
||||||
def finalize(self):
|
def finalize(self):
|
||||||
current_text = bytearray(self._byte_decoder[c] for c in self._unflushed).decode(
|
current_text = bytearray(self._byte_decoder[c] for c in self._unflushed).decode(
|
||||||
|
Loading…
Reference in New Issue
Block a user