mirror of
https://github.com/ml-explore/mlx-examples.git
synced 2025-08-29 09:56:24 +08:00
Fix decoding of manually added tokens
This commit is contained in:
parent
fc0674d2d8
commit
8c67480050
@ -159,6 +159,8 @@ class BPEStreamingDetokenizer(StreamingDetokenizer):
|
||||
|
||||
def __init__(self, tokenizer):
|
||||
|
||||
self.tokenizer = tokenizer
|
||||
|
||||
self.clean_spaces = tokenizer.clean_up_tokenization_spaces
|
||||
|
||||
# Extract the tokens in a list from id to text
|
||||
@ -201,6 +203,9 @@ class BPEStreamingDetokenizer(StreamingDetokenizer):
|
||||
"utf-8", "replace"
|
||||
)
|
||||
if is_added:
|
||||
# We need to manually encode and decode the added tokens in case the special characters
|
||||
# used to represent `\n` / `\t` were manually included in the added tokens
|
||||
v = self.tokenizer.decode(self.tokenizer.encode(v))
|
||||
text += v
|
||||
if not text.endswith("\ufffd"):
|
||||
self.text += self._maybe_trim_space(text)
|
||||
|
Loading…
Reference in New Issue
Block a user