# Copyright © 2024 Apple Inc.

import unittest
from typing import List

from mlx_lm.sample_utils import make_logits_processors
from mlx_lm.utils import (
    GenerationResponse,
    generate,
    load,
    make_sampler,
    stream_generate,
)


class TestGenerate(unittest.TestCase):

    @classmethod
    def setUpClass(cls):
        cls.HF_MODEL_PATH = "mlx-community/Qwen1.5-0.5B-Chat-4bit"
        cls.model, cls.tokenizer = load(cls.HF_MODEL_PATH)

    def test_generate(self):
        # Simple test that generation runs and returns a string
        text = generate(
            self.model, self.tokenizer, "hello", max_tokens=5, verbose=False
        )
        self.assertTrue(isinstance(text, str))

    def test_generate_with_logit_bias(self):
        # Token 0 decodes to "!" in this model's vocabulary, so a large
        # positive bias should force it to be sampled at every step.
        logit_bias = {0: 2000.0, 1: -20.0}
        text = generate(
            self.model,
            self.tokenizer,
            "hello",
            max_tokens=5,
            logits_processors=make_logits_processors(logit_bias),
            verbose=False,
        )
        self.assertEqual(text, "!!!!!")
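
    def test_generate_with_repetition_penalty(self):
        # Sketch, not part of the original suite: make_logits_processors
        # also appears to accept a repetition_penalty argument; treat the
        # exact signature as an assumption. Generation should still run
        # and return a string.
        processors = make_logits_processors(None, repetition_penalty=1.5)
        text = generate(
            self.model,
            self.tokenizer,
            "hello",
            max_tokens=5,
            logits_processors=processors,
            verbose=False,
        )
        self.assertTrue(isinstance(text, str))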

    def test_generate_with_processor(self):
        init_toks = self.tokenizer.encode("hello")

        all_toks = None

        # A logits processor receives the tokens seen so far and the
        # current logits; record the tokens to check that they accumulate.
        def logits_processor(toks, logits):
            nonlocal all_toks
            all_toks = toks
            return logits

        generate(
            self.model,
            self.tokenizer,
            "hello",
            max_tokens=5,
            verbose=False,
            logits_processors=[logits_processor],
        )
        # The final call sees the prompt tokens plus all 5 generated tokens.
        self.assertEqual(len(all_toks), len(init_toks) + 5)
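
    def test_stream_generate(self):
        # Sketch, not part of the original suite: stream_generate yields one
        # GenerationResponse per generated token; assuming each response
        # carries the newly decoded text in a `text` field, the pieces
        # together form the full completion.
        pieces = [
            response.text
            for response in stream_generate(
                self.model, self.tokenizer, "hello", max_tokens=5
            )
        ]
        self.assertEqual(len(pieces), 5)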

    def test_stream_generate_speculative(self):
        # Use the same model as the draft model; this is a correctness
        # test, not a speed test.
        draft_model, _ = load(self.HF_MODEL_PATH)

        results: List[GenerationResponse] = []
        drafted: List[bool] = []

        # Make sampling deterministic (greedy) so draft and target agree.
        sampler = make_sampler(temp=0.0)

        for generation_result in stream_generate(
            model=self.model,
            tokenizer=self.tokenizer,
            prompt="hello",
            max_tokens=5,
            draft_model=draft_model,
            num_draft_tokens=2,
            sampler=sampler,
        ):
            drafted.append(generation_result.from_draft)
            results.append(generation_result)

        self.assertEqual(len(results), 5)
        # Since num_draft_tokens is 2 and the draft model is identical to
        # the target, the first two generations should be drafts, the third
        # should come from the target model, and the last two should be
        # drafts again.
        self.assertEqual(drafted, [True, True, False, True, True])
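
    def test_sampler_determinism(self):
        # Sketch, not part of the original suite: with temp=0.0 the sampler
        # is greedy, so two identical calls should produce identical text.
        # Assumes generate forwards a `sampler=` keyword the way
        # stream_generate does above.
        sampler = make_sampler(temp=0.0)
        first = generate(
            self.model,
            self.tokenizer,
            "hello",
            max_tokens=5,
            sampler=sampler,
            verbose=False,
        )
        second = generate(
            self.model,
            self.tokenizer,
            "hello",
            max_tokens=5,
            sampler=sampler,
            verbose=False,
        )
        self.assertEqual(first, second)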


if __name__ == "__main__":
    unittest.main()