mirror of
https://github.com/ml-explore/mlx-examples.git
synced 2025-08-29 18:26:37 +08:00
Fix decoding manually added tokens
This commit is contained in:
parent
fc0674d2d8
commit
8c67480050
@ -159,6 +159,8 @@ class BPEStreamingDetokenizer(StreamingDetokenizer):
|
|||||||
|
|
||||||
def __init__(self, tokenizer):
|
def __init__(self, tokenizer):
|
||||||
|
|
||||||
|
self.tokenizer = tokenizer
|
||||||
|
|
||||||
self.clean_spaces = tokenizer.clean_up_tokenization_spaces
|
self.clean_spaces = tokenizer.clean_up_tokenization_spaces
|
||||||
|
|
||||||
# Extract the tokens in a list from id to text
|
# Extract the tokens in a list from id to text
|
||||||
@ -201,6 +203,9 @@ class BPEStreamingDetokenizer(StreamingDetokenizer):
|
|||||||
"utf-8", "replace"
|
"utf-8", "replace"
|
||||||
)
|
)
|
||||||
if is_added:
|
if is_added:
|
||||||
|
# We need to manually encode and decode the added tokens in case special characters
|
||||||
|
# used for `\n` / `\t` have been manually added in the added tokens
|
||||||
|
v = self.tokenizer.decode(self.tokenizer.encode(v))
|
||||||
text += v
|
text += v
|
||||||
if not text.endswith("\ufffd"):
|
if not text.endswith("\ufffd"):
|
||||||
self.text += self._maybe_trim_space(text)
|
self.text += self._maybe_trim_space(text)
|
||||||
|
Loading…
Reference in New Issue
Block a user