Add support for deepseek coder v2 lite (#882)

* feat: add support for deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct

* fix softmax + some cleanup

* more nits

* fix rope

* fix original_max_position_embeddings in rope

* fix original_max_position_embeddings in rope config

* add group greedy

---------

Co-authored-by: Awni Hannun <awni@apple.com>
This commit is contained in:
Anchen
2024-07-18 00:23:28 +10:00
committed by GitHub
parent f0c6c6e226
commit 561dcf5643
2 changed files with 478 additions and 4 deletions

View File

@@ -15,7 +15,12 @@ class KVCache:
def __init__(self, head_dim, n_kv_heads):
self.n_kv_heads = n_kv_heads
self.head_dim = head_dim
if isinstance(head_dim, int):
self.k_head_dim = self.v_head_dim = head_dim
elif isinstance(head_dim, tuple) and len(head_dim) == 2:
self.k_head_dim, self.v_head_dim = head_dim
else:
raise ValueError("head_dim must be an int or a tuple of two ints")
self.keys = None
self.values = None
self.offset = 0
@@ -25,9 +30,10 @@ class KVCache:
prev = self.offset
if self.keys is None or (prev + keys.shape[2]) > self.keys.shape[2]:
n_steps = (self.step + keys.shape[2] - 1) // self.step
shape = (1, self.n_kv_heads, n_steps * self.step, self.head_dim)
new_k = mx.zeros(shape, keys.dtype)
new_v = mx.zeros(shape, values.dtype)
k_shape = (1, self.n_kv_heads, n_steps * self.step, self.k_head_dim)
v_shape = (1, self.n_kv_heads, n_steps * self.step, self.v_head_dim)
new_k = mx.zeros(k_shape, keys.dtype)
new_v = mx.zeros(v_shape, values.dtype)
if self.keys is not None:
if prev % self.step != 0:
self.keys = self.keys[..., :prev, :]