mirror of
https://github.com/ml-explore/mlx-examples.git
synced 2025-08-30 02:53:41 +08:00
Change the last_segment logic in tokenizer utils
This commit is contained in:
parent
b444acfd69
commit
6ef5ca4ce5
@@ -3,8 +3,6 @@ from functools import partial
|
|||||||
|
|
||||||
from transformers import AutoTokenizer
|
from transformers import AutoTokenizer
|
||||||
|
|
||||||
REPLACEMENT_CHAR = "\ufffd"
|
|
||||||
|
|
||||||
|
|
||||||
class StreamingDetokenizer:
|
class StreamingDetokenizer:
|
||||||
"""The streaming detokenizer interface so that we can detokenize one token at a time.
|
"""The streaming detokenizer interface so that we can detokenize one token at a time.
|
||||||
@@ -51,11 +49,9 @@ class StreamingDetokenizer:
|
|||||||
def last_segment(self):
|
def last_segment(self):
|
||||||
"""Return the last segment of readable text since last time this property was accessed."""
|
"""Return the last segment of readable text since last time this property was accessed."""
|
||||||
text = self.text
|
text = self.text
|
||||||
if text and text[-1] != REPLACEMENT_CHAR:
|
|
||||||
segment = text[self.offset :]
|
segment = text[self.offset :]
|
||||||
self.offset = len(text)
|
self.offset = len(text)
|
||||||
return segment
|
return text
|
||||||
return ""
|
|
||||||
|
|
||||||
|
|
||||||
class NaiveStreamingDetokenizer(StreamingDetokenizer):
|
class NaiveStreamingDetokenizer(StreamingDetokenizer):
|
||||||
|
Loading…
Reference in New Issue
Block a user