mirror of
https://github.com/ml-explore/mlx-examples.git
synced 2025-09-01 12:49:50 +08:00
Prompt caching in mlx_lm.server
(#1026)
* caching in server * nits * fix tests * don't throw if no metal * comments
This commit is contained in:
@@ -1,5 +1,6 @@
|
||||
# Copyright © 2024 Apple Inc.
|
||||
|
||||
import copy
|
||||
import os
|
||||
import tempfile
|
||||
import unittest
|
||||
@@ -215,6 +216,28 @@ class TestPromptCache(unittest.TestCase):
|
||||
all(mx.allclose(l, l2) for l, l2 in zip(all_logits, second_all_logits))
|
||||
)
|
||||
|
||||
def test_cache_copying(self):
    """Check that a deep copy of a prompt cache is independent of the original.

    The cache is filled with a 10-step update followed by a 1-step update,
    deep-copied, and then the original is trimmed by one step and updated
    again. The copy must keep its own offset and its own stored keys.
    """
    cache = [KVCache()]

    # First update: sequence axis of length 10.
    x = mx.random.uniform(shape=(1, 8, 10, 4))
    cache[0].update_and_fetch(x, x)

    # Second update: one more step, stored right after the first ten.
    y = mx.random.uniform(shape=(1, 8, 1, 4))
    cache[0].update_and_fetch(y, y)

    # Snapshot the cache before mutating the original.
    old_cache = copy.deepcopy(cache)

    trim_prompt_cache(cache, 1)

    # assertEqual is required here: assertTrue(value, 11) would treat 11 as
    # the failure message and only check that the offset is truthy.
    self.assertEqual(old_cache[0].offset, 11)
    self.assertEqual(cache[0].offset, 10)

    # Write a new step into the trimmed cache; the copy must be unaffected.
    z = mx.random.uniform(shape=(1, 8, 1, 4))
    cache[0].update_and_fetch(z, z)

    # Position 10 still holds y in the copy, but holds z in the original.
    self.assertTrue(mx.allclose(old_cache[0].keys[..., 10:11, :], y))
    self.assertTrue(mx.allclose(cache[0].keys[..., 10:11, :], z))
|
||||
|
||||
|
||||
# Run the test suite only when this file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()
|
||||
|
@@ -14,6 +14,7 @@ class DummyModelProvider:
|
||||
def __init__(self):
    """Load a fixed model/tokenizer pair and record the key identifying it."""
    repo = "mlx-community/Qwen1.5-0.5B-Chat-4bit"
    loaded_model, loaded_tokenizer = load(repo)
    self.model = loaded_model
    self.tokenizer = loaded_tokenizer
    # NOTE(review): presumably a (model path, adapter path) pair with no
    # adapter in use -- confirm against the code that reads model_key.
    self.model_key = (repo, None)
|
||||
|
||||
def load(self, model, adapter=None):
|
||||
assert model in ["default_model", "chat_model"]
|
||||
|
Reference in New Issue
Block a user