From 0793fd070e5c116fa677f7a5f9608875976e9b52 Mon Sep 17 00:00:00 2001
From: Awni Hannun <awni@apple.com>
Date: Tue, 29 Oct 2024 17:38:53 -0700
Subject: [PATCH] comment + nit

---
 llms/README.md       | 50 ++++++++++++++++++++++----------------------
 llms/mlx_lm/utils.py | 22 ++++++++++++++++-----
 2 files changed, 42 insertions(+), 30 deletions(-)

diff --git a/llms/README.md b/llms/README.md
index fd625879..f539988a 100644
--- a/llms/README.md
+++ b/llms/README.md
@@ -201,31 +201,6 @@ requests that use the same context. See the
 [example](https://github.com/ml-explore/mlx-examples/blob/main/llms/mlx_lm/examples/chat.py)
 for more usage details.
 
-### Slow Speed with Large Models
-
-> [!NOTE]
-  This requires macOS 15.0 or higher to work.
-
-Models which are large relative to the total RAM available on the machine can
-be slow. `mlx-lm` will attempt to make them faster by keeping the memory
-occupied by the model and cache wired. This requires macOS 15 or higher to
-work.
-
-If you see the following warning message:
-
-> [WARNING] Generating with a model that requires ...
-
-then the model will likely be very slow on the given machine. These cases can
-be sped up for models which fit in RAM with some room to spare. To increase the
-maximum wired limit, set the following `sysctl`:
-
-```bash
-sudo sysctl iogpu.wired_limit_mb=N
-```
-
-The value `N` should be larger than the size of the model in megabytes but
-smaller than the memory size of the machine.
-
 ### Supported Models
 
 `mlx-lm` supports thousands of Hugging Face format LLMs. If the model you want to
@@ -273,3 +248,28 @@ model, tokenizer = load(
     tokenizer_config={"eos_token": "<|endoftext|>", "trust_remote_code": True},
 )
 ```
+
+### Large Models
+
+> [!NOTE]
+> This requires macOS 15.0 or higher to work.
+
+Models which are large relative to the total RAM available on the machine can
+be slow. `mlx-lm` will attempt to make them faster by wiring the memory
+occupied by the model and cache. This requires macOS 15 or higher to
+work.
+
+If you see the following warning message:
+
+> [WARNING] Generating with a model that requires ...
+
+then the model will likely be slow on the given machine. If the model fits in
+RAM then it can often be sped up by increasing the system wired memory limit.
+To increase the limit, set the following `sysctl`:
+
+```bash
+sudo sysctl iogpu.wired_limit_mb=N
+```
+
+The value `N` should be larger than the size of the model in megabytes but
+smaller than the memory size of the machine.
diff --git a/llms/mlx_lm/utils.py b/llms/mlx_lm/utils.py
index a7a998f4..5b437c98 100644
--- a/llms/mlx_lm/utils.py
+++ b/llms/mlx_lm/utils.py
@@ -41,7 +41,14 @@ class ModelNotFoundError(Exception):
 
 
 @contextlib.contextmanager
-def wired_limit(model):
+def wired_limit(model: nn.Module, streams: Optional[List[mx.Stream]] = None):
+    """
+    A context manager to temporarily change the wired limit.
+
+    Note, the wired limit should not be changed during an async eval. If an
+    async eval could be running pass in the streams to synchronize with prior
+    to exiting the context manager.
+    """
     model_bytes = tree_reduce(
         lambda acc, x: acc + x.nbytes if isinstance(x, mx.array) else acc, model, 0
     )
@@ -52,13 +59,18 @@ def wired_limit(model):
         print(
-            "[WARNING] Generating with a model that requires {model_mb} MB "
-            "which is close to the maximum recommended size of {max_rec_mb} "
-            "MB. This can be very slow. See the documentation for possible work-arounds: "
+            f"[WARNING] Generating with a model that requires {model_mb} MB "
+            f"which is close to the maximum recommended size of {max_rec_mb} "
+            "MB. This can be slow. See the documentation for possible work-arounds: "
+            "https://github.com/ml-explore/mlx-examples/tree/main/llms#large-models"
         )
     old_limit = mx.metal.set_wired_limit(max_rec_size)
     try:
         yield None
     finally:
-        mx.synchronize()
+        if streams is not None:
+            for s in streams:
+                mx.synchronize(s)
+        else:
+            mx.synchronize()
         mx.metal.set_wired_limit(old_limit)
 
 