diff --git a/llms/mlx_lm/cache_prompt.py b/llms/mlx_lm/cache_prompt.py index efdbaf13..7bb06411 100644 --- a/llms/mlx_lm/cache_prompt.py +++ b/llms/mlx_lm/cache_prompt.py @@ -160,7 +160,7 @@ def main(): max_msg_len = max(max_msg_len, len(msg)) print(msg + " " * (max_msg_len - len(msg)), end="", flush=True) - cache = maybe_quantize_kv_cache( + maybe_quantize_kv_cache( cache, args.quantized_kv_start, args.kv_group_size, args.kv_bits ) diff --git a/llms/mlx_lm/utils.py b/llms/mlx_lm/utils.py index 1f5dd405..71b85861 100644 --- a/llms/mlx_lm/utils.py +++ b/llms/mlx_lm/utils.py @@ -165,10 +165,10 @@ def maybe_quantize_kv_cache(prompt_cache, quantized_kv_start, kv_group_size, kv_ and not isinstance(prompt_cache[0], cache.QuantizedKVCache) and prompt_cache[0].offset > quantized_kv_start ): - return [ - c.to_quantized(group_size=kv_group_size, bits=kv_bits) for c in prompt_cache - ] - return prompt_cache + for i in range(len(prompt_cache)): + prompt_cache[i] = prompt_cache[i].to_quantized( + group_size=kv_group_size, bits=kv_bits + ) def generate_step( @@ -290,7 +290,7 @@ def generate_step( for processor in logits_processor: logits = processor(tokens, logits) - prompt_cache = maybe_quantize_kv_cache( + maybe_quantize_kv_cache( prompt_cache, quantized_kv_start, kv_group_size, kv_bits )