import argparse
import time

import mlx.core as mx

from .utils import generate_step, load

DEFAULT_MODEL_PATH = "mlx_model"
DEFAULT_PROMPT = "hello"
DEFAULT_MAX_TOKENS = 100
DEFAULT_TEMP = 0.6
DEFAULT_SEED = 0


def setup_arg_parser():
    """Set up and return the argument parser."""
    parser = argparse.ArgumentParser(description="LLM inference script")
    parser.add_argument(
        "--model",
        type=str,
        default=DEFAULT_MODEL_PATH,
        help="The path to the local model directory or Hugging Face repo.",
    )
    parser.add_argument(
        "--trust-remote-code",
        action="store_true",
        help="Enable trusting remote code for tokenizer",
    )
    parser.add_argument(
        "--eos-token",
        type=str,
        default=None,
        help="End of sequence token for tokenizer",
    )
    parser.add_argument(
        "--prompt", default=DEFAULT_PROMPT, help="Message to be processed by the model"
    )
    parser.add_argument(
        "--max-tokens",
        "-m",
        type=int,
        default=DEFAULT_MAX_TOKENS,
        help="Maximum number of tokens to generate",
    )
    parser.add_argument(
        "--temp", type=float, default=DEFAULT_TEMP, help="Sampling temperature"
    )
    parser.add_argument("--seed", type=int, default=DEFAULT_SEED, help="PRNG seed")
    parser.add_argument(
        "--ignore-chat-template",
        action="store_true",
        help="Use the raw prompt without the tokenizer's chat template.",
    )
    return parser


def main(args):
    mx.random.seed(args.seed)

    # Build tokenizer_config: only forward `trust_remote_code` when the flag
    # is set (a None value leaves the loader's default in place), and
    # override the tokenizer's EOS token if one was given.
    tokenizer_config = {"trust_remote_code": True if args.trust_remote_code else None}
    if args.eos_token is not None:
        tokenizer_config["eos_token"] = args.eos_token

    model, tokenizer = load(args.model, tokenizer_config=tokenizer_config)
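    # If the tokenizer ships a chat template, wrap the prompt in a single
    # user message so chat-tuned models see the format they were trained on;
    # --ignore-chat-template bypasses this and sends the raw prompt.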
    if not args.ignore_chat_template and (
        hasattr(tokenizer, "apply_chat_template")
        and tokenizer.chat_template is not None
    ):
        messages = [{"role": "user", "content": args.prompt}]
        prompt = tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
    else:
        prompt = args.prompt

    print("=" * 10)
    print("Prompt:", prompt)
    # Tokenize the prompt and hand it to the model as an MLX array.
    prompt = tokenizer.encode(prompt)
    prompt = mx.array(prompt)
    tic = time.time()
    tokens = []
    skip = 0
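    # Stream the output: decode the running token list on every step and
    # print only the new suffix past `skip`, so text that spans multiple
    # tokens is emitted once it decodes cleanly.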
    for token, n in zip(
        generate_step(prompt, model, args.temp), range(args.max_tokens)
    ):
        if token == tokenizer.eos_token_id:
            break
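        # The first token only arrives once the whole prompt has been
        # processed, so split the timing here: everything before this point
        # is prompt (prefill) time, everything after is generation time.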
        if n == 0:
            prompt_time = time.time() - tic
            tic = time.time()
        tokens.append(token.item())
        s = tokenizer.decode(tokens)
        print(s[skip:], end="", flush=True)
        skip = len(s)
    print(tokenizer.decode(tokens)[skip:], flush=True)
    gen_time = time.time() - tic
    print("=" * 10)
    if len(tokens) == 0:
        print("No tokens generated for this prompt")
        return
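    # Prompt speed counts every prompt token against the prefill time, while
    # generation speed excludes the first token, whose latency was already
    # attributed to the prompt above.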
    prompt_tps = prompt.size / prompt_time
    gen_tps = (len(tokens) - 1) / gen_time
    print(f"Prompt: {prompt_tps:.3f} tokens-per-sec")
    print(f"Generation: {gen_tps:.3f} tokens-per-sec")
if __name__ == "__main__":
    parser = setup_arg_parser()
    args = parser.parse_args()
    main(args)