mirror of
https://github.com/ml-explore/mlx-examples.git
synced 2025-08-29 18:17:07 +08:00
no lag bpe
This commit is contained in:
parent
0fb0b6b4e6
commit
0007b019d9
@ -127,23 +127,23 @@ class SPMStreamingDetokenizer(StreamingDetokenizer):
|
|||||||
self.text = ""
|
self.text = ""
|
||||||
self.tokens = []
|
self.tokens = []
|
||||||
|
|
||||||
def _flush(self):
|
def _try_flush(self, force=False):
|
||||||
text = self._unflushed.replace(self._sep, b" ").decode("utf-8", "replace")
|
text = self._unflushed.replace(self._sep, b" ").decode("utf-8", "replace")
|
||||||
|
if not force and text.endswith("\ufffd"):
|
||||||
|
return
|
||||||
if not self.text and self.trim_space and text and text[0] == " ":
|
if not self.text and self.trim_space and text and text[0] == " ":
|
||||||
text = text[1:]
|
text = text[1:]
|
||||||
self.text += text
|
self.text += text
|
||||||
|
self._unflushed = b""
|
||||||
|
|
||||||
def add_token(self, token):
|
def add_token(self, token):
|
||||||
self.tokens.append(token)
|
self.tokens.append(token)
|
||||||
v = self.tokenmap[token]
|
v = self.tokenmap[token]
|
||||||
if v.startswith(self._sep):
|
self._unflushed += v
|
||||||
self._flush()
|
self._try_flush()
|
||||||
self._unflushed = v
|
|
||||||
else:
|
|
||||||
self._unflushed += v
|
|
||||||
|
|
||||||
def finalize(self):
|
def finalize(self):
|
||||||
self._flush()
|
self._try_flush(force=True)
|
||||||
self._unflushed = b""
|
self._unflushed = b""
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user