From d8e6996254fa89b60b4a01731f7f84602c170308 Mon Sep 17 00:00:00 2001
From: Mirko Nasato
Date: Wed, 12 Mar 2025 14:24:32 +0000
Subject: [PATCH] Make sure to use UTF-8 when loading tokenizer.json

---
 llms/mlx_lm/tokenizer_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llms/mlx_lm/tokenizer_utils.py b/llms/mlx_lm/tokenizer_utils.py
index de9d5324..b33d504b 100644
--- a/llms/mlx_lm/tokenizer_utils.py
+++ b/llms/mlx_lm/tokenizer_utils.py
@@ -352,7 +352,7 @@ def load_tokenizer(model_path, tokenizer_config_extra={}, eos_token_ids=None):
 
     tokenizer_file = model_path / "tokenizer.json"
     if tokenizer_file.exists():
-        with open(tokenizer_file, "r") as fid:
+        with open(tokenizer_file, "r", encoding="utf-8") as fid:
             tokenizer_content = json.load(fid)
         if "decoder" in tokenizer_content:
             if _is_spm_decoder(tokenizer_content["decoder"]):
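
Context for the one-line change (not part of the patch itself): without an explicit encoding argument, Python's built-in open() falls back to the locale's preferred encoding, so on platforms where that default is not UTF-8 (commonly cp1252 on Windows) reading a tokenizer.json that contains non-ASCII byte-pair tokens can raise UnicodeDecodeError. A minimal sketch of the failure mode and the fix, using a hypothetical temporary file rather than a real model directory:

    import json
    import locale
    import tempfile
    from pathlib import Path

    # Hypothetical tokenizer.json containing a non-ASCII BPE token ("Ġ", U+0120),
    # as produced by GPT-2 style tokenizers; written explicitly as UTF-8.
    sample = {"model": {"vocab": {"Ġhello": 0}}}
    tokenizer_file = Path(tempfile.mkdtemp()) / "tokenizer.json"
    tokenizer_file.write_text(json.dumps(sample, ensure_ascii=False), encoding="utf-8")

    # Without an explicit encoding, open() uses locale.getpreferredencoding(False),
    # which is not guaranteed to be UTF-8 and may fail on the bytes above.
    print("platform default encoding:", locale.getpreferredencoding(False))

    # The patched call pins the encoding, so the load behaves the same everywhere.
    with open(tokenizer_file, "r", encoding="utf-8") as fid:
        tokenizer_content = json.load(fid)
    print(tokenizer_content["model"]["vocab"])  # {'Ġhello': 0}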