Replace unicode errors instead of raising exception (#1146)

This commit is contained in:
Angelos Katharopoulos 2024-12-12 11:10:41 -08:00 committed by GitHub
parent 06af3c9b0e
commit 19abf3dcaa
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -3,8 +3,6 @@ from functools import partial
from transformers import AutoTokenizer from transformers import AutoTokenizer
REPLACEMENT_CHAR = "\ufffd"
class StreamingDetokenizer: class StreamingDetokenizer:
"""The streaming detokenizer interface so that we can detokenize one token at a time. """The streaming detokenizer interface so that we can detokenize one token at a time.
@ -51,11 +49,9 @@ class StreamingDetokenizer:
def last_segment(self): def last_segment(self):
"""Return the last segment of readable text since last time this property was accessed.""" """Return the last segment of readable text since last time this property was accessed."""
text = self.text text = self.text
if text and text[-1] != REPLACEMENT_CHAR: segment = text[self.offset :]
segment = text[self.offset :] self.offset = len(text)
self.offset = len(text) return segment
return segment
return ""
class NaiveStreamingDetokenizer(StreamingDetokenizer): class NaiveStreamingDetokenizer(StreamingDetokenizer):
@ -132,7 +128,7 @@ class SPMStreamingDetokenizer(StreamingDetokenizer):
self.tokens = [] self.tokens = []
def _flush(self): def _flush(self):
text = self._unflushed.replace(self._sep, b" ").decode("utf-8") text = self._unflushed.replace(self._sep, b" ").decode("utf-8", "replace")
if not self.text and self.trim_space and text and text[0] == " ": if not self.text and self.trim_space and text and text[0] == " ":
text = text[1:] text = text[1:]
self.text += text self.text += text
@ -202,7 +198,7 @@ class BPEStreamingDetokenizer(StreamingDetokenizer):
if is_added or self._byte_decoder[v[0]] == 32: if is_added or self._byte_decoder[v[0]] == 32:
current_text = bytearray( current_text = bytearray(
self._byte_decoder[c] for c in self._unflushed self._byte_decoder[c] for c in self._unflushed
).decode("utf-8") ).decode("utf-8", "replace")
self.text += self._maybe_trim_space(current_text) self.text += self._maybe_trim_space(current_text)
if is_added: if is_added:
self.text += v self.text += v
@ -214,7 +210,8 @@ class BPEStreamingDetokenizer(StreamingDetokenizer):
def finalize(self): def finalize(self):
current_text = bytearray(self._byte_decoder[c] for c in self._unflushed).decode( current_text = bytearray(self._byte_decoder[c] for c in self._unflushed).decode(
"utf-8" "utf-8",
"replace",
) )
self.text += self._maybe_trim_space(current_text) self.text += self._maybe_trim_space(current_text)
self._unflushed = "" self._unflushed = ""