reorg + fixes to caching; unify prompt caching across types and use cases, e.g. caching during a chat

2025-09-20 03:48:07 +08:00 · 2024-10-05 14:49:39 -07:00
parent ed060a7c5c
commit 782f5a71b7
40 changed files with 824 additions and 691 deletions
--- a/llms/mlx_lm/models/phi.py
+++ b/llms/mlx_lm/models/phi.py
@@ -162,19 +162,11 @@ class Model(nn.Module):
    def __call__(
        self,
        x: mx.array,
-        cache: mx.array = None,
-    ) -> Tuple[mx.array, mx.array]:
+        cache=None,
+    ) -> mx.array:
        y = self.model(x, cache)
        return self.lm_head(y)

    @property
    def layers(self):
        return self.model.layers
-
-    @property
-    def head_dim(self):
-        return self.args.hidden_size // self.args.num_attention_heads
-
-    @property
-    def n_kv_heads(self):
-        return self.args.num_key_value_heads