From 2024181b7c4f43131f57f9d9a86d8009f57b6a6d Mon Sep 17 00:00:00 2001 From: Awni Hannun Date: Thu, 12 Dec 2024 10:51:38 -0800 Subject: [PATCH] couple other fixes --- llms/mlx_lm/utils.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/llms/mlx_lm/utils.py b/llms/mlx_lm/utils.py index 493c1c42..b87f5a24 100644 --- a/llms/mlx_lm/utils.py +++ b/llms/mlx_lm/utils.py @@ -299,6 +299,9 @@ def generate_step( prompt_processed_tokens = 0 while y.size > prefill_step_size: model(y[:prefill_step_size][None], cache=prompt_cache) + maybe_quantize_kv_cache( + prompt_cache, quantized_kv_start, kv_group_size, kv_bits + ) mx.eval([c.state for c in prompt_cache]) prompt_progress_callback(prompt_processed_tokens, total_prompt_tokens) prompt_processed_tokens += prefill_step_size