More cache improvements (#1015)

* fix rotating kv cache for chat use case

* reorg + fixes to caching; unify prompt caching across cache types and use cases, e.g. caching during a chat (see the usage sketch below)

* nit in chat

* fix tests

* fix tests

* fix tests

* docs

* chat command

* comments + docs

* Define meta_state on all Cache implementations (sketched below)

* fixes + trim_prompt_cache api

* fix default model

---------

Co-authored-by: Angelos Katharopoulos <a_katharopoulos@apple.com>
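
The headline change behind these bullets is a unified prompt-cache API: one set of per-layer cache objects that can be created once, reused across the turns of a chat, and trimmed. Below is a hedged sketch of how the reorganized API is used; the names make_prompt_cache, trim_prompt_cache, and the prompt_cache keyword come from the mlx_lm.models.cache module this commit introduces, but treat the exact signatures, and the model repo name in particular, as assumptions for illustration.

    from mlx_lm import load, generate
    from mlx_lm.models.cache import make_prompt_cache, trim_prompt_cache

    # Hypothetical model repo, used only for illustration.
    model, tokenizer = load("mlx-community/Some-Model-4bit")

    # One cache object per model layer, reused across chat turns.
    prompt_cache = make_prompt_cache(model)

    # First turn: the prompt is processed once and its KV state is cached.
    generate(model, tokenizer, prompt="Hello!", prompt_cache=prompt_cache)

    # Later turns reuse the cached prefix instead of re-processing it.
    generate(model, tokenizer, prompt="Tell me more.", prompt_cache=prompt_cache)

    # Rewind: drop the last 2 cached tokens, e.g. after editing a turn.
    trim_prompt_cache(prompt_cache, 2)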
Authored by: Awni Hannun
Committed by: GitHub
Date: 2024-10-07 20:45:51 -07:00
Parent: 9bc53fc210
Commit: fca087be49

43 changed files with 1151 additions and 691 deletions
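
One bullet above defines meta_state on every cache implementation. Together with a tensor-valued state property, it gives each cache type a uniform serialization surface, so a saved prompt cache can be reloaded as the right class with the right settings. A hedged sketch of the idea, with illustrative attribute choices (the exact fields are assumptions):

    class RotatingKVCache:
        """Sketch: a bounded cache whose settings survive save/load."""

        def __init__(self, max_size, keep=0):
            self.keys = None      # cached keys (an mx.array once populated)
            self.values = None    # cached values
            self.max_size = max_size
            self.keep = keep      # leading tokens that are never evicted
            self.offset = 0       # total tokens seen so far

        @property
        def state(self):
            # Tensor state, saved as arrays.
            return self.keys, self.values

        @property
        def meta_state(self):
            # Non-tensor settings, serialized as strings so any cache type
            # can be saved and restored uniformly (field names are assumptions).
            return tuple(map(str, (self.keep, self.max_size, self.offset)))

        @meta_state.setter
        def meta_state(self, v):
            self.keep, self.max_size, self.offset = map(int, v)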


@@ -1,10 +1,10 @@
 from dataclasses import dataclass
-from typing import Dict, Optional
+from typing import Any, Dict, Optional
 import mlx.core as mx
 import mlx.nn as nn
-from .base import BaseModelArgs, KVCache, create_attention_mask
+from .base import BaseModelArgs, create_attention_mask
 from .switch_layers import SwitchGLU
@@ -77,7 +77,7 @@ class DeepseekAttention(nn.Module):
         self,
         x: mx.array,
         mask: Optional[mx.array] = None,
-        cache: Optional[KVCache] = None,
+        cache: Optional[Any] = None,
     ) -> mx.array:
         B, L, _ = x.shape
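
Loosening the annotation from Optional[KVCache] to Optional[Any] is what makes the unification work: a layer no longer cares which cache class it receives, only that the object exposes the small surface the attention code actually uses. A minimal sketch of that implicit interface; the Protocol itself is illustrative, not part of the commit:

    from typing import Protocol, Tuple

    import mlx.core as mx

    class CacheLike(Protocol):
        # Any object with this surface can be passed as `cache`.
        offset: int  # number of tokens already held in the cache

        def update_and_fetch(
            self, keys: mx.array, values: mx.array
        ) -> Tuple[mx.array, mx.array]:
            """Append new keys/values; return the full cached sequences."""
            ...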
@@ -108,8 +108,8 @@ class DeepseekMLP(nn.Module):
     def __init__(
         self,
         config: ModelArgs,
-        hidden_size: int | None = None,
-        intermediate_size: int | None = None,
+        hidden_size: Optional[int] = None,
+        intermediate_size: Optional[int] = None,
     ):
         super().__init__()
         self.config = config
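
Swapping `int | None` for `Optional[int]` is a small compatibility fix: the `X | Y` annotation syntax (PEP 604) requires Python 3.10, while `Optional[int]` also works on the earlier Python versions the package presumably still supports.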
@@ -188,7 +188,7 @@ class DeepseekDecoderLayer(nn.Module):
         self,
         x: mx.array,
         mask: Optional[mx.array] = None,
-        cache: Optional[KVCache] = None,
+        cache: Optional[Any] = None,
     ) -> mx.array:
         r = self.self_attn(self.input_layernorm(x), mask, cache)
         h = x + r
@@ -210,7 +210,7 @@ class DeepseekModel(nn.Module):
     def __call__(
         self,
         x: mx.array,
-        cache: Optional[KVCache] = None,
+        cache: Optional[Any] = None,
     ) -> mx.array:
         h = self.embed_tokens(x)
         mask = create_attention_mask(h, cache)
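
The mask construction consults the cache: during multi-token prefill the causal mask must offset query positions past the tokens already cached, while single-token decoding needs no mask at all. A hedged sketch of that behavior (the real create_attention_mask in .base may differ in details):

    import mlx.core as mx

    def create_attention_mask(h: mx.array, cache=None):
        # Sketch: h is (batch, T, dims); only T > 1 needs a mask.
        T = h.shape[1]
        if T <= 1:
            return None
        offset = cache[0].offset if cache and cache[0] is not None else 0
        # Query positions start after the tokens already in the cache.
        queries = mx.arange(offset, offset + T)
        keys = mx.arange(offset + T)
        mask = (queries[:, None] < keys[None]) * -1e9
        return mask.astype(h.dtype)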
@@ -235,7 +235,7 @@ class Model(nn.Module):
     def __call__(
         self,
         inputs: mx.array,
-        cache: Optional[KVCache] = None,
+        cache: Optional[Any] = None,
     ):
         out = self.model(inputs, cache)
         return self.lm_head(out)
@@ -256,11 +256,3 @@ class Model(nn.Module):
     @property
     def layers(self):
         return self.model.layers
-
-    @property
-    def head_dim(self):
-        return self.args.hidden_size // self.args.num_attention_heads
-
-    @property
-    def n_kv_heads(self):
-        return self.args.num_key_value_heads
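
The deleted head_dim and n_kv_heads properties existed so callers could pre-allocate caches with model-specific shapes, and every model carried that boilerplate. After the reorg, caches infer their shapes from the first update_and_fetch call, so construction needs nothing but the layer list. A hedged sketch of the unified constructor this enables (the rotating-cache parameters shown are assumptions):

    from mlx_lm.models.cache import KVCache, RotatingKVCache

    def make_prompt_cache(model, max_kv_size=None):
        # Models with unusual caching needs can supply their own caches.
        if hasattr(model, "make_cache"):
            return model.make_cache()
        # Otherwise: one cache per layer, shapes inferred on first use,
        # so per-model head_dim / n_kv_heads properties are not required.
        num_layers = len(model.layers)
        if max_kv_size is not None:
            return [RotatingKVCache(max_size=max_kv_size, keep=4)
                    for _ in range(num_layers)]
        return [KVCache() for _ in range(num_layers)]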