mirror of
https://github.com/ml-explore/mlx-examples.git
synced 2025-07-19 09:31:13 +08:00
Replace unicode errors instead of raising exception (#1146)
This commit is contained in:
parent
06af3c9b0e
commit
19abf3dcaa
@ -3,8 +3,6 @@ from functools import partial
|
|||||||
|
|
||||||
from transformers import AutoTokenizer
|
from transformers import AutoTokenizer
|
||||||
|
|
||||||
REPLACEMENT_CHAR = "\ufffd"
|
|
||||||
|
|
||||||
|
|
||||||
class StreamingDetokenizer:
|
class StreamingDetokenizer:
|
||||||
"""The streaming detokenizer interface so that we can detokenize one token at a time.
|
"""The streaming detokenizer interface so that we can detokenize one token at a time.
|
||||||
@ -51,11 +49,9 @@ class StreamingDetokenizer:
|
|||||||
def last_segment(self):
|
def last_segment(self):
|
||||||
"""Return the last segment of readable text since last time this property was accessed."""
|
"""Return the last segment of readable text since last time this property was accessed."""
|
||||||
text = self.text
|
text = self.text
|
||||||
if text and text[-1] != REPLACEMENT_CHAR:
|
segment = text[self.offset :]
|
||||||
segment = text[self.offset :]
|
self.offset = len(text)
|
||||||
self.offset = len(text)
|
return segment
|
||||||
return segment
|
|
||||||
return ""
|
|
||||||
|
|
||||||
|
|
||||||
class NaiveStreamingDetokenizer(StreamingDetokenizer):
|
class NaiveStreamingDetokenizer(StreamingDetokenizer):
|
||||||
@ -132,7 +128,7 @@ class SPMStreamingDetokenizer(StreamingDetokenizer):
|
|||||||
self.tokens = []
|
self.tokens = []
|
||||||
|
|
||||||
def _flush(self):
|
def _flush(self):
|
||||||
text = self._unflushed.replace(self._sep, b" ").decode("utf-8")
|
text = self._unflushed.replace(self._sep, b" ").decode("utf-8", "replace")
|
||||||
if not self.text and self.trim_space and text and text[0] == " ":
|
if not self.text and self.trim_space and text and text[0] == " ":
|
||||||
text = text[1:]
|
text = text[1:]
|
||||||
self.text += text
|
self.text += text
|
||||||
@ -202,7 +198,7 @@ class BPEStreamingDetokenizer(StreamingDetokenizer):
|
|||||||
if is_added or self._byte_decoder[v[0]] == 32:
|
if is_added or self._byte_decoder[v[0]] == 32:
|
||||||
current_text = bytearray(
|
current_text = bytearray(
|
||||||
self._byte_decoder[c] for c in self._unflushed
|
self._byte_decoder[c] for c in self._unflushed
|
||||||
).decode("utf-8")
|
).decode("utf-8", "replace")
|
||||||
self.text += self._maybe_trim_space(current_text)
|
self.text += self._maybe_trim_space(current_text)
|
||||||
if is_added:
|
if is_added:
|
||||||
self.text += v
|
self.text += v
|
||||||
@ -214,7 +210,8 @@ class BPEStreamingDetokenizer(StreamingDetokenizer):
|
|||||||
|
|
||||||
def finalize(self):
|
def finalize(self):
|
||||||
current_text = bytearray(self._byte_decoder[c] for c in self._unflushed).decode(
|
current_text = bytearray(self._byte_decoder[c] for c in self._unflushed).decode(
|
||||||
"utf-8"
|
"utf-8",
|
||||||
|
"replace",
|
||||||
)
|
)
|
||||||
self.text += self._maybe_trim_space(current_text)
|
self.text += self._maybe_trim_space(current_text)
|
||||||
self._unflushed = ""
|
self._unflushed = ""
|
||||||
|
Loading…
Reference in New Issue
Block a user