Generate: Support Prefill Prompt

python -m mlx_lm.generate \
       --model mlx-community/DeepSeek-R1-Distill-Qwen-1.5B-4bit \
       --prompt "hello" \
       --prefill-prompt "<think>\n"
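
What the flag does: when --prefill-prompt is set, the given text is appended as the beginning of the assistant turn and the chat template is rendered with continue_final_message=True, so the model continues from the prefill instead of opening a fresh assistant reply; here "<think>\n" makes the R1 distill start inside its reasoning block. A minimal template-level sketch of the same idea, assuming the tokenizer files for the model above resolve through transformers' AutoTokenizer and that its chat template honors continue_final_message:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    "mlx-community/DeepSeek-R1-Distill-Qwen-1.5B-4bit"
)
messages = [{"role": "user", "content": "hello"}]

# Default behaviour: close the user turn and open an empty assistant turn.
plain = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)

# Prefill behaviour: add the prefill as the assistant message and ask the
# template to continue that final message, so the model's output begins
# right after "<think>\n".
messages.append({"role": "assistant", "content": "<think>\n"})
prefilled = tokenizer.apply_chat_template(
    messages, tokenize=False, continue_final_message=True
)
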
madroid 2025-02-24 12:19:49 +08:00
parent 09b641aaa7
commit 431ece6c5b


@@ -60,6 +60,11 @@ def setup_arg_parser():
         default=DEFAULT_PROMPT,
         help="Message to be processed by the model ('-' reads from stdin)",
     )
+    parser.add_argument(
+        "--prefill-prompt",
+        default=None,
+        help="Prefill prompt to be used for the chat template",
+    )
     parser.add_argument(
         "--max-tokens",
         "-m",
@@ -219,12 +224,21 @@ def main():
             messages = []
         messages.append({"role": "user", "content": prompt})
-        prompt = tokenizer.apply_chat_template(
-            messages,
-            tokenize=False,
-            add_generation_prompt=True,
-            **template_kwargs,
-        )
+        if args.prefill_prompt is not None:
+            messages.append({"role": "assistant", "content": args.prefill_prompt})
+            prompt = tokenizer.apply_chat_template(
+                messages,
+                tokenize=False,
+                continue_final_message=True,
+                **template_kwargs,
+            )
+        else:
+            prompt = tokenizer.apply_chat_template(
+                messages,
+                tokenize=False,
+                add_generation_prompt=True,
+                **template_kwargs,
+            )
         # Treat the prompt as a suffix assuming that the prefix is in the
         # stored kv cache.
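
For reference, a hedged sketch of the same flow from Python with mlx_lm's load() and generate() helpers, mirroring the branch added above; the exact generate() keyword arguments (max_tokens, verbose) may differ between mlx_lm versions:

from mlx_lm import load, generate

model, tokenizer = load("mlx-community/DeepSeek-R1-Distill-Qwen-1.5B-4bit")

messages = [
    {"role": "user", "content": "hello"},
    # The prefill becomes the start of the assistant turn.
    {"role": "assistant", "content": "<think>\n"},
]
prompt = tokenizer.apply_chat_template(
    messages, tokenize=False, continue_final_message=True
)

# Generation continues right after the prefill text.
text = generate(model, tokenizer, prompt=prompt, max_tokens=256, verbose=True)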