Generate: Support Prefill Prompt

python -m mlx_lm.generate \
       --model mlx-community/DeepSeek-R1-Distill-Qwen-1.5B-4bit \
       --prompt "hello" \
       --prefill-prompt "<think>\n"
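
What the flag does: when --prefill-prompt is set, the given text is appended as the beginning of the assistant turn and the chat template is rendered with continue_final_message=True, so the model continues from the prefill instead of opening a fresh assistant reply; here "<think>\n" makes the R1 distill start inside its reasoning block. A minimal template-level sketch of the same idea, assuming the tokenizer files for the model above resolve through transformers' AutoTokenizer and that its chat template honors continue_final_message:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    "mlx-community/DeepSeek-R1-Distill-Qwen-1.5B-4bit"
)
messages = [{"role": "user", "content": "hello"}]

# Default behaviour: close the user turn and open an empty assistant turn.
plain = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)

# Prefill behaviour: add the prefill as the assistant message and ask the
# template to continue that final message, so the model's output begins
# right after "<think>\n".
messages.append({"role": "assistant", "content": "<think>\n"})
prefilled = tokenizer.apply_chat_template(
    messages, tokenize=False, continue_final_message=True
)
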
madroid 2025-02-24 12:19:49 +08:00
parent 09b641aaa7
commit 431ece6c5b


@@ -60,6 +60,11 @@ def setup_arg_parser():
         default=DEFAULT_PROMPT,
         help="Message to be processed by the model ('-' reads from stdin)",
     )
+    parser.add_argument(
+        "--prefill-prompt",
+        default=None,
+        help="Prefill prompt to be used for the chat template",
+    )
     parser.add_argument(
         "--max-tokens",
         "-m",
@@ -219,12 +224,21 @@ def main():
             messages = []
         messages.append({"role": "user", "content": prompt})
-        prompt = tokenizer.apply_chat_template(
-            messages,
-            tokenize=False,
-            add_generation_prompt=True,
-            **template_kwargs,
-        )
+        if args.prefill_prompt is not None:
+            messages.append({"role": "assistant", "content": args.prefill_prompt})
+            prompt = tokenizer.apply_chat_template(
+                messages,
+                tokenize=False,
+                continue_final_message=True,
+                **template_kwargs,
+            )
+        else:
+            prompt = tokenizer.apply_chat_template(
+                messages,
+                tokenize=False,
+                add_generation_prompt=True,
+                **template_kwargs,
+            )
         # Treat the prompt as a suffix assuming that the prefix is in the
         # stored kv cache.
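
For reference, a hedged sketch of the same flow from Python with mlx_lm's load() and generate() helpers, mirroring the branch added above; the exact generate() keyword arguments (max_tokens, verbose) may differ between mlx_lm versions:

from mlx_lm import load, generate

model, tokenizer = load("mlx-community/DeepSeek-R1-Distill-Qwen-1.5B-4bit")

messages = [
    {"role": "user", "content": "hello"},
    # The prefill becomes the start of the assistant turn.
    {"role": "assistant", "content": "<think>\n"},
]
prompt = tokenizer.apply_chat_template(
    messages, tokenize=False, continue_final_message=True
)

# Generation continues right after the prefill text.
text = generate(model, tokenizer, prompt=prompt, max_tokens=256, verbose=True)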