fix spm decoder multi-byte (#1092)

This commit is contained in:
Awni Hannun
2024-11-05 06:06:26 -08:00
committed by GitHub
parent 4394633ce0
commit 6fd1f70f73
2 changed files with 20 additions and 23 deletions

View File

@@ -42,6 +42,9 @@ class TestTokenizers(unittest.TestCase):
text += detokenizer.last_segment
self.assertEqual(text, expected_text)
tokens = tokenizer.encode("こんにちは私の名前はAI")
check(tokens)
tokens = tokenizer.encode("a ,b")
check(tokens)