fix encoding with special tokens + chat template (#1189)

2025-12-15 09:48:54 +08:00 · 2025-01-03 10:50:59 -08:00
parent 3a58c36109
commit c4833a2f55
13 changed files with 95 additions and 97 deletions
--- a/llms/mlx_lm/cache_prompt.py
+++ b/llms/mlx_lm/cache_prompt.py
@@ -110,29 +110,17 @@ def main():
        if tokenizer.chat_template is None:
            tokenizer.chat_template = tokenizer.default_chat_template

-    if not args.ignore_chat_template and (
-        hasattr(tokenizer, "apply_chat_template")
-        and tokenizer.chat_template is not None
-    ):
+    if not args.ignore_chat_template and tokenizer.chat_template is not None:
        messages = [{"role": "user", "content": args.prompt}]
        prompt = tokenizer.apply_chat_template(
-            messages, tokenize=False, add_generation_prompt=True
+            messages, add_generation_prompt=False, continue_final_message=True
        )

-        # Treat the prompt as a prefix assuming that the suffix will be
-        # provided at generation time.
-        test_prompt = tokenizer.apply_chat_template(
-            [{"role": "user", "content": "<query>"}],
-            tokenize=False,
-            add_generation_prompt=True,
-        )
-        n = len(test_prompt) - test_prompt.index("<query>") - len("<query>")
-        prompt = prompt[:-n]
    else:
-        prompt = args.prompt
+        prompt = tokenizer.encode(args.prompt)

    cache = make_prompt_cache(model, args.max_kv_size)
-    y = mx.array(tokenizer.encode(prompt))
+    y = mx.array(prompt)

    # Process the prompt
    start = time.time()