From 1ac75759ee00e7de47bc16ef4fba07a9e45af664 Mon Sep 17 00:00:00 2001
From: Awni Hannun <awni@apple.com>
Date: Wed, 9 Oct 2024 14:04:42 -0700
Subject: [PATCH] clear cache during prompt processing

---
 llms/mlx_lm/utils.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llms/mlx_lm/utils.py b/llms/mlx_lm/utils.py
index cfbcf29e..1e07546e 100644
--- a/llms/mlx_lm/utils.py
+++ b/llms/mlx_lm/utils.py
@@ -242,6 +242,7 @@ def generate_step(
         model(y[:prefill_step_size][None], cache=prompt_cache)
         mx.eval([c.state for c in prompt_cache])
         y = y[prefill_step_size:]
+        mx.metal.clear_cache()
 
     y, logprobs = _step(y)