Mirror of https://github.com/ml-explore/mlx-examples.git (synced 2025-06-24 17:31:18 +08:00)
Put prompt processing in same stream (#1122)

* put prompt processing in same stream

* patch
This commit is contained in:
parent a5e173802e
commit cfc29c29f4
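The change itself is small: the prompt prefill loop in generate_step is wrapped in a "with mx.stream(generation_stream):" block, so prefill is scheduled on the same MLX stream as token generation instead of the default stream. As a minimal sketch of what that scoping does (the stream name and array sizes below are illustrative assumptions, not the repository's code):

# Minimal sketch, not the repository code: how an mx.stream scope behaves.
# The stream created here and the array sizes are assumptions for illustration.
import mlx.core as mx

generation_stream = mx.new_stream(mx.default_device())

a = mx.random.normal((1024, 1024))

with mx.stream(generation_stream):
    # Operations created inside this block are scheduled on generation_stream,
    # which is what the commit does for the prompt prefill loop so that prefill
    # and decoding share one stream.
    b = a @ a.T
    mx.eval(b)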
@@ -1,3 +1,3 @@
 # Copyright © 2023-2024 Apple Inc.
 
-__version__ = "0.20.0"
+__version__ = "0.20.1"
@@ -274,6 +274,7 @@ def generate_step(
             y = sampler(logprobs)
             return y, logprobs.squeeze(0)
 
-    while y.size > prefill_step_size:
-        model(y[:prefill_step_size][None], cache=prompt_cache)
-        mx.eval([c.state for c in prompt_cache])
+    with mx.stream(generation_stream):
+        while y.size > prefill_step_size:
+            model(y[:prefill_step_size][None], cache=prompt_cache)
+            mx.eval([c.state for c in prompt_cache])
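Read as a whole, the hunk re-indents the existing prefill loop under the new stream scope rather than adding new logic. A hedged sketch of how the loop likely continues past the hunk boundary (the y-slicing and cache-clearing lines are assumptions drawn from neighboring code, not shown in this hunk):

# Sketch of the post-change prefill loop in context; the last two lines are
# assumptions and are not part of the hunk above.
with mx.stream(generation_stream):
    while y.size > prefill_step_size:
        model(y[:prefill_step_size][None], cache=prompt_cache)
        mx.eval([c.state for c in prompt_cache])
        y = y[prefill_step_size:]  # assumed: drop the chunk that was just prefilled
        mx.metal.clear_cache()     # assumed: free temporary Metal buffers between chunks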