no lag bpe

This commit is contained in:
Awni Hannun 2024-12-17 09:45:42 -08:00
parent 0fb0b6b4e6
commit 0007b019d9

View File

@@ -127,23 +127,23 @@ class SPMStreamingDetokenizer(StreamingDetokenizer):
self.text = "" self.text = ""
self.tokens = [] self.tokens = []
def _flush(self): def _try_flush(self, force=False):
text = self._unflushed.replace(self._sep, b" ").decode("utf-8", "replace") text = self._unflushed.replace(self._sep, b" ").decode("utf-8", "replace")
if not force and text.endswith("\ufffd"):
return
if not self.text and self.trim_space and text and text[0] == " ": if not self.text and self.trim_space and text and text[0] == " ":
text = text[1:] text = text[1:]
self.text += text self.text += text
self._unflushed = b""
def add_token(self, token): def add_token(self, token):
self.tokens.append(token) self.tokens.append(token)
v = self.tokenmap[token] v = self.tokenmap[token]
if v.startswith(self._sep): self._unflushed += v
self._flush() self._try_flush()
self._unflushed = v
else:
self._unflushed += v
def finalize(self): def finalize(self):
self._flush() self._try_flush(force=True)
self._unflushed = b"" self._unflushed = b""