Make sure to use UTF-8 when loading tokenizer.json (#1340)

This commit is contained in:
Mirko Nasato 2025-03-13 02:17:14 +00:00 committed by GitHub
parent 4c3df00162
commit 3e5baf583b
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@@ -352,7 +352,7 @@ def load_tokenizer(model_path, tokenizer_config_extra={}, eos_token_ids=None):
tokenizer_file = model_path / "tokenizer.json"
if tokenizer_file.exists():
-with open(tokenizer_file, "r") as fid:
+with open(tokenizer_file, "r", encoding="utf-8") as fid:
tokenizer_content = json.load(fid)
if "decoder" in tokenizer_content:
if _is_spm_decoder(tokenizer_content["decoder"]):