Add support for cohere2 (#1157)

* add support for cohere2 * revert to act_fn to silu * fix tests and sliding window attention * add tests * add to tuner * fix sliding window * add coauthor :) Co-authored-by: n8programs <43304488+N8python@users.noreply.github.com> * Add rotating kvcache to save space * some nits * style * nits --------- Co-authored-by: n8programs <43304488+N8python@users.noreply.github.com> Co-authored-by: N8 <n8@n8programs.com> Co-authored-by: Awni Hannun <awni@apple.com>
2025-12-15 01:42:31 +08:00 · 2024-12-16 17:01:03 +01:00
parent fc0674d2d8
commit dfa4dd6c93
4 changed files with 228 additions and 3 deletions
--- a/llms/mlx_lm/utils.py
+++ b/llms/mlx_lm/utils.py
@@ -187,9 +187,10 @@ def maybe_quantize_kv_cache(prompt_cache, quantized_kv_start, kv_group_size, kv_
        and prompt_cache[0].offset > quantized_kv_start
    ):
        for i in range(len(prompt_cache)):
-            prompt_cache[i] = prompt_cache[i].to_quantized(
-                group_size=kv_group_size, bits=kv_bits
-            )
+            if isinstance(prompt_cache[i], cache.KVCache):
+                prompt_cache[i] = prompt_cache[i].to_quantized(
+                    group_size=kv_group_size, bits=kv_bits
+                )


 def generate_step(