mirror of
https://github.com/ml-explore/mlx-examples.git
synced 2025-08-29 18:26:37 +08:00
* rebase with main
This commit is contained in:
parent
1026cc5608
commit
46fd8b7c53
@ -1,3 +1,3 @@
|
|||||||
# Copyright © 2023-2024 Apple Inc.
|
# Copyright © 2023-2024 Apple Inc.
|
||||||
|
|
||||||
__version__ = "0.20.2"
|
__version__ = "0.20.4"
|
||||||
|
@ -197,21 +197,16 @@ class BPEStreamingDetokenizer(StreamingDetokenizer):
|
|||||||
self.tokens.append(token)
|
self.tokens.append(token)
|
||||||
v = self.tokenmap[token]
|
v = self.tokenmap[token]
|
||||||
is_added = token in self._added_ids
|
is_added = token in self._added_ids
|
||||||
if is_added or self._byte_decoder[v[0]] == 32:
|
if not is_added:
|
||||||
current_text = bytearray(
|
|
||||||
self._byte_decoder[c] for c in self._unflushed
|
|
||||||
).decode("utf-8", "replace")
|
|
||||||
self.text += self._maybe_trim_space(current_text)
|
|
||||||
if is_added:
|
|
||||||
# We need to manually encode and decode the added tokens in case special characters
|
|
||||||
# used for `\n` / `\t` have been manually added in the added tokens
|
|
||||||
v = self.tokenizer.decode(self.tokenizer.encode(v))
|
|
||||||
self.text += v
|
|
||||||
self._unflushed = ""
|
|
||||||
else:
|
|
||||||
self._unflushed = v
|
|
||||||
else:
|
|
||||||
self._unflushed += v
|
self._unflushed += v
|
||||||
|
text = bytearray(self._byte_decoder[c] for c in self._unflushed).decode(
|
||||||
|
"utf-8", "replace"
|
||||||
|
)
|
||||||
|
if is_added:
|
||||||
|
text += v
|
||||||
|
if not text.endswith("\ufffd"):
|
||||||
|
self.text += self._maybe_trim_space(text)
|
||||||
|
self._unflushed = ""
|
||||||
|
|
||||||
def finalize(self):
|
def finalize(self):
|
||||||
current_text = bytearray(self._byte_decoder[c] for c in self._unflushed).decode(
|
current_text = bytearray(self._byte_decoder[c] for c in self._unflushed).decode(
|
||||||
|
Loading…
Reference in New Issue
Block a user