More cache improvements (#1015)

* fix rotating kv cache for chat use case * reorg + fixes to caching, unify prompt caching across types and use cases for e.g. caching during a chat * nit in chat * fix tests * fix tests * fix tests * docs * chat command * comments + docs * Define meta_state on all Cache implementations * fixes + trim_prompt_cache api * fix default model --------- Co-authored-by: Angelos Katharopoulos <a_katharopoulos@apple.com>
2025-12-16 02:08:55 +08:00 · 2024-10-07 20:45:51 -07:00
parent 9bc53fc210
commit fca087be49
43 changed files with 1151 additions and 691 deletions
--- a/llms/mlx_lm/models/stablelm.py
+++ b/llms/mlx_lm/models/stablelm.py
@@ -2,7 +2,6 @@

 import math
 from dataclasses import dataclass
-from typing import Tuple

 import mlx.core as mx
 import mlx.nn as nn
@@ -198,8 +197,8 @@ class Model(nn.Module):
        self,
        x: mx.array,
        mask: mx.array = None,
-        cache: mx.array = None,
-    ) -> Tuple[mx.array, mx.array]:
+        cache=None,
+    ) -> mx.array:
        mask = create_attention_mask(x, cache)
        y = self.model(x, mask, cache)
        return self.lm_head(y)
@@ -207,11 +206,3 @@ class Model(nn.Module):
    @property
    def layers(self):
        return self.model.layers
-
-    @property
-    def head_dim(self):
-        return self.args.hidden_size // self.args.num_attention_heads
-
-    @property
-    def n_kv_heads(self):
-        return self.args.num_key_value_heads