From 4c60cb8ef978765ca4ff4607842c66e5a52ddfef Mon Sep 17 00:00:00 2001
From: Jeonghyun Lee
Date: Thu, 13 Mar 2025 17:24:58 +0900
Subject: [PATCH] Fix prompt cache issue in server.py

The old code did not reset prompt_cache.tokens when the cache itself was
re-created, so the token list accumulated stale entries and no longer
matched the state of the newly created cache. This change clears the token
list whenever the cache is rebuilt and processes the full prompt in that
case, while still slicing off the cached prefix when the cached tokens
match the prompt. With this change the prompt cache behaves as expected.
---
 llms/mlx_lm/server.py | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/llms/mlx_lm/server.py b/llms/mlx_lm/server.py
index de02704d..1a39ab8b 100644
--- a/llms/mlx_lm/server.py
+++ b/llms/mlx_lm/server.py
@@ -452,17 +452,24 @@ class APIHandler(BaseHTTPRequestHandler):
 
     def get_prompt_cache(self, prompt):
         cache_len = len(self.prompt_cache.tokens)
+        # Check if the cache is valid for the current prompt
         if (
             self.prompt_cache.model_key != self.model_provider.model_key
             or cache_len >= len(prompt)
             or self.prompt_cache.tokens != prompt[:cache_len]
         ):
+            # Reinitialize the cache entirely
             self.prompt_cache.model_key = self.model_provider.model_key
             self.prompt_cache.cache = make_prompt_cache(self.model_provider.model)
+            # Reset the cache tokens to be empty because the cache was re-created
+            self.prompt_cache.tokens = []
+            new_prompt = prompt
         else:
-            prompt = prompt[cache_len:]
-        self.prompt_cache.tokens.extend(prompt)
-        return prompt
+            # Use the already cached tokens; only process the tail of the prompt
+            new_prompt = prompt[cache_len:]
+        # Update the cache tokens with the new tokens being processed
+        self.prompt_cache.tokens.extend(new_prompt)
+        return new_prompt
 
     def handle_completion(
         self,
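
For reviewers, below is a minimal, self-contained sketch of the prefix-cache
bookkeeping this patch fixes. The PromptCache dataclass and the standalone
get_prompt_cache function here are simplified stand-ins for the server's real
objects (they track only the token list, not the KV cache returned by
make_prompt_cache), so this is an illustration of the logic, not the server
code itself.

# sketch_prompt_cache.py -- illustrative only
from dataclasses import dataclass, field


@dataclass
class PromptCache:
    model_key: str = ""
    tokens: list = field(default_factory=list)


def get_prompt_cache(cache: PromptCache, model_key: str, prompt: list) -> list:
    """Return the suffix of `prompt` that still needs to be processed."""
    cache_len = len(cache.tokens)
    if (
        cache.model_key != model_key
        or cache_len >= len(prompt)
        or cache.tokens != prompt[:cache_len]
    ):
        # Cache is stale: re-key it and, crucially, drop the old tokens so the
        # bookkeeping matches a freshly created (empty) cache.
        cache.model_key = model_key
        cache.tokens = []
        new_prompt = prompt
    else:
        # Cached prefix is valid: only the tail of the prompt must be processed.
        new_prompt = prompt[cache_len:]
    cache.tokens.extend(new_prompt)
    return new_prompt


if __name__ == "__main__":
    cache = PromptCache()
    # First request processes the full prompt.
    assert get_prompt_cache(cache, "m1", [1, 2, 3]) == [1, 2, 3]
    # A follow-up request that extends the prompt only processes the new tail.
    assert get_prompt_cache(cache, "m1", [1, 2, 3, 4, 5]) == [4, 5]
    # A mismatched prompt invalidates the cache and processes everything again;
    # without the tokens reset, cache.tokens would still hold stale entries here.
    assert get_prompt_cache(cache, "m1", [9, 9]) == [9, 9]
    assert cache.tokens == [9, 9]
    print("prefix-cache bookkeeping behaves as expected")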