docs

2025-08-31 20:04:38 +08:00 · 2024-10-05 18:34:50 -07:00
parent 4dc3cc0300
commit daf79f37cf
2 changed files with 13 additions and 1 deletions
--- a/llms/README.md
+++ b/llms/README.md
@@ -170,6 +170,12 @@ The cached prompt is treated as a prefix to the supplied prompt. Also notice
 when using a cached prompt, the model to use is read from the cache and need
 not be supplied explicitly.

+Prompt caching can also be used in the Python API in order to to avoid
+recomputing the prompt. This is useful in multi-turn dialogues or across
+requests that use the same context. See the
+[example](https://github.com/ml-explore/mlx-examples/blob/main/llms/mlx_lm/examples/chat.py)
+for more usage details.
+
 ### Supported Models

 MLX LM supports thousands of Hugging Face format LLMs. If the model you want to
--- a/llms/mlx_lm/examples/chat.py
+++ b/llms/mlx_lm/examples/chat.py
@@ -5,7 +5,7 @@ An example of a multi-turn chat with prompt caching.
 """

 from mlx_lm import generate, load
-from mlx_lm.models.cache import make_prompt_cache
+from mlx_lm.models.cache import load_prompt_cache, make_prompt_cache, save_prompt_cache

 model, tokenizer = load("mlx-community/Mistral-7B-Instruct-v0.3-4bit")

@@ -45,3 +45,9 @@ response = generate(
    temp=0.0,
    prompt_cache=prompt_cache,
 )
+
+# Save the prompt cache to disk to reuse it at a later time
+save_prompt_cache("mistral_prompt.safetensors", prompt_cache)
+
+# Load the prompt cache from disk
+prompt_cache = load_prompt_cache("mistral_prompt.safetensors")