Prompt caching in mlx_lm.server (#1026)

* caching in server

* nits

* fix tests

* don't throw if no Metal

* comments
Author:    Awni Hannun
Date:      2024-10-14 10:57:22 -07:00
Committed: GitHub
Parent:    8dca1a2f60
Commit:    605c4854f1

4 changed files with 151 additions and 32 deletions
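For context on the feature itself: the server can now keep the key/value cache built for the previous request and, when the next prompt shares a prefix with it, run the model only on the new suffix after trimming the cache back to that shared prefix. The sketch below illustrates that reuse logic; it is not the server's actual code. The `common_prefix_len` helper and the `cached_tokens` bookkeeping are assumptions, and `trim_prompt_cache` (exercised by the test below) is assumed importable from `mlx_lm.models.cache`.

    from mlx_lm.models.cache import trim_prompt_cache


    def common_prefix_len(a, b):
        # Number of leading tokens the two token lists share.
        n = 0
        for s, t in zip(a, b):
            if s != t:
                break
            n += 1
        return n


    def reuse_prompt_cache(prompt_tokens, cached_tokens, prompt_cache):
        # Trim the cache back to the shared prefix and return the unseen suffix,
        # which is the only part of the prompt the model still has to process.
        prefix = common_prefix_len(prompt_tokens, cached_tokens)
        num_to_trim = len(cached_tokens) - prefix
        if num_to_trim > 0:
            trim_prompt_cache(prompt_cache, num_to_trim)
        return prompt_tokens[prefix:]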

@@ -1,5 +1,6 @@
# Copyright © 2024 Apple Inc.

import copy
import os
import tempfile
import unittest
@@ -215,6 +216,28 @@ class TestPromptCache(unittest.TestCase):
            all(mx.allclose(l, l2) for l, l2 in zip(all_logits, second_all_logits))
        )

    def test_cache_copying(self):
        cache = [KVCache()]

        # Prefill 10 positions, then one more, so the cache offset reaches 11.
        x = mx.random.uniform(shape=(1, 8, 10, 4))
        cache[0].update_and_fetch(x, x)

        y = mx.random.uniform(shape=(1, 8, 1, 4))
        cache[0].update_and_fetch(y, y)

        # A deep copy must stay independent of later trims and updates.
        old_cache = copy.deepcopy(cache)

        trim_prompt_cache(cache, 1)

        # The copy keeps its original length; the trimmed cache is one shorter.
        self.assertEqual(old_cache[0].offset, 11)
        self.assertEqual(cache[0].offset, 10)

        # The next update overwrites the trimmed slot only in the trimmed cache.
        z = mx.random.uniform(shape=(1, 8, 1, 4))
        cache[0].update_and_fetch(z, z)

        self.assertTrue(mx.allclose(old_cache[0].keys[..., 10:11, :], y))
        self.assertTrue(mx.allclose(cache[0].keys[..., 10:11, :], z))


if __name__ == "__main__":
    unittest.main()
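The test above also shows the mechanism that makes trimming cheap: trimming effectively rewinds the cache's write offset, so stale keys and values remain in the buffers until the next update overwrites them. That is why the deep-copied `old_cache` still holds `y` at position 10 while the trimmed cache ends up holding `z` there. A toy cache, written purely to illustrate that behavior (not the real KVCache implementation), acts the same way:

    import mlx.core as mx


    class TinyKVCache:
        # Shapes follow the test: (batch, heads, sequence, head_dim).
        def __init__(self):
            self.keys = None
            self.values = None
            self.offset = 0

        def update_and_fetch(self, keys, values):
            if self.keys is None:
                self.keys, self.values = keys, values
            else:
                # Keep only the first `offset` positions, so trimmed slots get overwritten.
                self.keys = mx.concatenate([self.keys[..., : self.offset, :], keys], axis=2)
                self.values = mx.concatenate([self.values[..., : self.offset, :], values], axis=2)
            self.offset += keys.shape[2]
            return self.keys[..., : self.offset, :], self.values[..., : self.offset, :]

        def trim(self, n):
            # Trimming is just a rewind of the offset.
            self.offset -= n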

@@ -14,6 +14,7 @@ class DummyModelProvider:
    def __init__(self):
        HF_MODEL_PATH = "mlx-community/Qwen1.5-0.5B-Chat-4bit"
        self.model, self.tokenizer = load(HF_MODEL_PATH)
        self.model_key = (HF_MODEL_PATH, None)

    def load(self, model, adapter=None):
        assert model in ["default_model", "chat_model"]
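
The `model_key` given to the test's DummyModelProvider pairs the model path with the adapter (here `None`), which gives the server a way to notice when a different model or adapter has been loaded and the cached prompt no longer applies. A hedged sketch of that invalidation check follows; `make_prompt_cache` is assumed importable from `mlx_lm.models.cache`, and the class and attribute names are illustrative rather than the server's actual ones.

    from mlx_lm.models.cache import make_prompt_cache


    class PromptCacheState:
        def __init__(self):
            self.model_key = None
            self.cache = None
            self.tokens = []

        def for_model(self, model, model_key):
            # A cache built for one (model path, adapter) pair is useless for
            # another, so reset whenever the key changes.
            if model_key != self.model_key:
                self.model_key = model_key
                self.cache = make_prompt_cache(model)
                self.tokens = []
            return self.cache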