[mlx-lm] Use top p in server (#1144)

* use top p in server
* couple other fixes
2025-12-15 09:48:54 +08:00 · 2024-12-12 11:12:21 -08:00
parent 19abf3dcaa
commit 2ba0e36683
3 changed files with 5 additions and 2 deletions
--- a/llms/mlx_lm/utils.py
+++ b/llms/mlx_lm/utils.py
@@ -299,6 +299,9 @@ def generate_step(
        prompt_processed_tokens = 0
        while y.size > prefill_step_size:
            model(y[:prefill_step_size][None], cache=prompt_cache)
+            maybe_quantize_kv_cache(
+                prompt_cache, quantized_kv_start, kv_group_size, kv_bits
+            )
            mx.eval([c.state for c in prompt_cache])
            prompt_progress_callback(prompt_processed_tokens, total_prompt_tokens)
            prompt_processed_tokens += prefill_step_size