From 0793fd070e5c116fa677f7a5f9608875976e9b52 Mon Sep 17 00:00:00 2001
From: Awni Hannun <awni@apple.com>
Date: Tue, 29 Oct 2024 17:38:53 -0700
Subject: [PATCH] comment + nit

---
 llms/README.md       | 50 ++++++++++++++++++++++----------------------
 llms/mlx_lm/utils.py | 22 ++++++++++++++++-----
 2 files changed, 42 insertions(+), 30 deletions(-)

diff --git a/llms/README.md b/llms/README.md
index fd625879..f539988a 100644
--- a/llms/README.md
+++ b/llms/README.md
@@ -201,31 +201,6 @@ requests that use the same context. See the
 [example](https://github.com/ml-explore/mlx-examples/blob/main/llms/mlx_lm/examples/chat.py)
 for more usage details.
 
-### Slow Speed with Large Models
-
-> [!NOTE]
-  This requires macOS 15.0 or higher to work.
-
-Models which are large relative to the total RAM available on the machine can
-be slow. `mlx-lm` will attempt to make them faster by keeping the memory
-occupied by the model and cache wired. This requires macOS 15 or higher to
-work.
-
-If you see the following warning message:
-
-> [WARNING] Generating with a model that requires ...
-
-then the model will likely be very slow on the given machine. These cases can
-be sped up for models which fit in RAM with some room to spare. To increase the
-maximum wired limit, set the following `sysctl`:
-
-```bash
-sudo sysctl iogpu.wired_limit_mb=N
-```
-
-The value `N` should be larger than the size of the model in megabytes but
-smaller than the memory size of the machine.
-
 ### Supported Models
 
 `mlx-lm` supports thousands of Hugging Face format LLMs. If the model you want to
@@ -273,3 +248,28 @@ model, tokenizer = load(
     tokenizer_config={"eos_token": "<|endoftext|>", "trust_remote_code": True},
 )
 ```
+
+### Large Models
+
+> [!NOTE]
+> This requires macOS 15.0 or higher to work.
+
+Models which are large relative to the total RAM available on the machine can
+be slow. `mlx-lm` will attempt to make them faster by wiring the memory
+occupied by the model and cache. This requires macOS 15 or higher to
+work.
+
+If you see the following warning message:
+
+> [WARNING] Generating with a model that requires ...
+
+then the model will likely be slow on the given machine. If the model fits in
+RAM then it can often be sped up by increasing the system wired memory limit.
+To increase the limit, set the following `sysctl`:
+
+```bash
+sudo sysctl iogpu.wired_limit_mb=N
+```
+
+The value `N` should be larger than the size of the model in megabytes but
+smaller than the memory size of the machine.
diff --git a/llms/mlx_lm/utils.py b/llms/mlx_lm/utils.py
index a7a998f4..5b437c98 100644
--- a/llms/mlx_lm/utils.py
+++ b/llms/mlx_lm/utils.py
@@ -41,7 +41,14 @@ class ModelNotFoundError(Exception):
 
 
 @contextlib.contextmanager
-def wired_limit(model):
+def wired_limit(model: nn.Module, streams: Optional[List[mx.Stream]] = None):
+    """
+    A context manager to temporarily change the wired limit.
+
+    Note, the wired limit should not be changed during an async eval. If an
+    async eval could be running pass in the streams to synchronize with prior
+    to exiting the context manager.
+    """
     model_bytes = tree_reduce(
         lambda acc, x: acc + x.nbytes if isinstance(x, mx.array) else acc, model, 0
     )
@@ -52,13 +59,18 @@ def wired_limit(model):
         print(
-            "[WARNING] Generating with a model that requires {model_mb} MB "
-            "which is close to the maximum recommended size of {max_rec_mb} "
-            "MB. This can be very slow. See the documentation for possible work-arounds: "
+            f"[WARNING] Generating with a model that requires {model_mb} MB "
+            f"which is close to the maximum recommended size of {max_rec_mb} "
+            "MB. This can be slow. See the documentation for possible work-arounds: "
+            "https://github.com/ml-explore/mlx-examples/tree/main/llms#large-models"
         )
     old_limit = mx.metal.set_wired_limit(max_rec_size)
     try:
         yield None
     finally:
-        mx.synchronize()
+        if streams is not None:
+            for s in streams:
+                mx.synchronize(s)
+        else:
+            mx.synchronize()
         mx.metal.set_wired_limit(old_limit)
 
 