Mirror of https://github.com/ml-explore/mlx-examples.git, synced 2025-12-15 09:48:54 +08:00.
Support for multiple EOS tokens (#1141)
* Support for multiple EOS tokens * Change _eos_token_ids type from list to set * Remove model_config & add eos_token_id * nits --------- Co-authored-by: Awni Hannun <awni@apple.com>
This commit is contained in:
@@ -254,21 +254,33 @@ class TokenizerWrapper:
|
||||
huggingface tokenizer.
|
||||
"""
|
||||
|
||||
def __init__(self, tokenizer, detokenizer_class=NaiveStreamingDetokenizer):
|
||||
def __init__(
    self, tokenizer, detokenizer_class=NaiveStreamingDetokenizer, eos_token_ids=None
):
    """Wrap a huggingface tokenizer with a streaming detokenizer and a set of EOS ids.

    Args:
        tokenizer: The huggingface tokenizer to wrap.
        detokenizer_class: Streaming detokenizer type instantiated with the tokenizer.
        eos_token_ids: Optional iterable of EOS token ids; when omitted, the
            tokenizer's own ``eos_token_id`` is used as the sole EOS id.
    """
    self._tokenizer = tokenizer
    self._detokenizer = detokenizer_class(tokenizer)
    # Normalize to a set; fall back to the tokenizer's single EOS id.
    if eos_token_ids is None:
        self._eos_token_ids = {tokenizer.eos_token_id}
    else:
        self._eos_token_ids = set(eos_token_ids)
|
||||
|
||||
def __getattr__(self, attr):
|
||||
if attr == "detokenizer":
|
||||
return self._detokenizer
|
||||
elif attr == "eos_token_ids":
|
||||
return self._eos_token_ids
|
||||
elif attr.startswith("_"):
|
||||
return self.__getattribute__(attr)
|
||||
else:
|
||||
return getattr(self._tokenizer, attr)
|
||||
|
||||
def __setattr__(self, attr, value):
|
||||
if attr == "detokenizer":
|
||||
raise AttributeError("Cannot set the detokenizer.")
|
||||
if attr in {"detokenizer", "eos_token_ids"}:
|
||||
if attr == "detokenizer":
|
||||
raise AttributeError("Cannot set the detokenizer.")
|
||||
elif attr == "eos_token_ids":
|
||||
self._eos_token_ids = set(value) if value is not None else set()
|
||||
elif attr.startswith("_"):
|
||||
super().__setattr__(attr, value)
|
||||
else:
|
||||
@@ -315,7 +327,7 @@ def _is_bpe_decoder(decoder):
|
||||
return isinstance(decoder, dict) and decoder.get("type", None) == "ByteLevel"
|
||||
|
||||
|
||||
def load_tokenizer(model_path, tokenizer_config_extra={}):
|
||||
def load_tokenizer(model_path, tokenizer_config_extra={}, eos_token_ids=None):
|
||||
"""Load a huggingface tokenizer and try to infer the type of streaming
|
||||
detokenizer to use.
|
||||
|
||||
@@ -336,7 +348,10 @@ def load_tokenizer(model_path, tokenizer_config_extra={}):
|
||||
elif _is_bpe_decoder(tokenizer_content["decoder"]):
|
||||
detokenizer_class = BPEStreamingDetokenizer
|
||||
|
||||
if isinstance(eos_token_ids, int):
|
||||
eos_token_ids = [eos_token_ids]
|
||||
return TokenizerWrapper(
|
||||
AutoTokenizer.from_pretrained(model_path, **tokenizer_config_extra),
|
||||
detokenizer_class,
|
||||
eos_token_ids=eos_token_ids,
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user