mirror of
https://github.com/ml-explore/mlx-examples.git
synced 2025-08-30 02:53:41 +08:00
comment + nit
This commit is contained in:
parent
2822acc98a
commit
0793fd070e
@ -201,31 +201,6 @@ requests that use the same context. See the
|
|||||||
[example](https://github.com/ml-explore/mlx-examples/blob/main/llms/mlx_lm/examples/chat.py)
|
[example](https://github.com/ml-explore/mlx-examples/blob/main/llms/mlx_lm/examples/chat.py)
|
||||||
for more usage details.
|
for more usage details.
|
||||||
|
|
||||||
### Slow Speed with Large Models
|
|
||||||
|
|
||||||
> [!NOTE]
|
|
||||||
This requires macOS 15.0 or higher to work.
|
|
||||||
|
|
||||||
Models which are large relative to the total RAM available on the machine can
|
|
||||||
be slow. `mlx-lm` will attempt to make them faster by keeping the memory
|
|
||||||
occupied by the model and cache wired. This requires macOS 15 or higher to
|
|
||||||
work.
|
|
||||||
|
|
||||||
If you see the following warning message:
|
|
||||||
|
|
||||||
> [WARNING] Generating with a model that requires ...
|
|
||||||
|
|
||||||
then the model will likely be very slow on the given machine. These cases can
|
|
||||||
be sped up for models which fit in RAM with some room to spare. To increase the
|
|
||||||
maximum wired limit, set the following `sysctl`:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
sudo sysctl iogpu.wired_limit_mb=N
|
|
||||||
```
|
|
||||||
|
|
||||||
The value `N` should be larger than the size of the model in megabytes but
|
|
||||||
smaller than the memory size of the machine.
|
|
||||||
|
|
||||||
### Supported Models
|
### Supported Models
|
||||||
|
|
||||||
`mlx-lm` supports thousands of Hugging Face format LLMs. If the model you want to
|
`mlx-lm` supports thousands of Hugging Face format LLMs. If the model you want to
|
||||||
@ -273,3 +248,28 @@ model, tokenizer = load(
|
|||||||
tokenizer_config={"eos_token": "<|endoftext|>", "trust_remote_code": True},
|
tokenizer_config={"eos_token": "<|endoftext|>", "trust_remote_code": True},
|
||||||
)
|
)
|
||||||
```
|
```
|
||||||
|
|
||||||
|
### Large Models
|
||||||
|
|
||||||
|
> [!NOTE]
|
||||||
|
This requires macOS 15.0 or higher to work.
|
||||||
|
|
||||||
|
Models which are large relative to the total RAM available on the machine can
|
||||||
|
be slow. `mlx-lm` will attempt to make them faster by wiring the memory
|
||||||
|
occupied by the model and cache. This requires macOS 15 or higher to
|
||||||
|
work.
|
||||||
|
|
||||||
|
If you see the following warning message:
|
||||||
|
|
||||||
|
> [WARNING] Generating with a model that requires ...
|
||||||
|
|
||||||
|
then the model will likely be slow on the given machine. If the model fits in
|
||||||
|
RAM, then it can often be sped up by increasing the system wired memory limit.
|
||||||
|
To increase the limit, set the following `sysctl`:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
sudo sysctl iogpu.wired_limit_mb=N
|
||||||
|
```
|
||||||
|
|
||||||
|
The value `N` should be larger than the size of the model in megabytes but
|
||||||
|
smaller than the memory size of the machine.
|
||||||
|
@ -41,7 +41,14 @@ class ModelNotFoundError(Exception):
|
|||||||
|
|
||||||
|
|
||||||
@contextlib.contextmanager
|
@contextlib.contextmanager
|
||||||
def wired_limit(model):
|
def wired_limit(model: nn.Module, streams: Optional[List[mx.Stream]] = None):
|
||||||
|
"""
|
||||||
|
A context manager to temporarily change the wired limit.
|
||||||
|
|
||||||
|
Note, the wired limit should not be changed during an async eval. If an
|
||||||
|
async eval could be running, pass in the streams to synchronize with prior
|
||||||
|
to exiting the context manager.
|
||||||
|
"""
|
||||||
model_bytes = tree_reduce(
|
model_bytes = tree_reduce(
|
||||||
lambda acc, x: acc + x.nbytes if isinstance(x, mx.array) else acc, model, 0
|
lambda acc, x: acc + x.nbytes if isinstance(x, mx.array) else acc, model, 0
|
||||||
)
|
)
|
||||||
@ -52,13 +59,18 @@ def wired_limit(model):
|
|||||||
print(
|
print(
|
||||||
"[WARNING] Generating with a model that requires {model_mb} MB "
|
"[WARNING] Generating with a model that requires {model_mb} MB "
|
||||||
"which is close to the maximum recommended size of {max_rec_mb} "
|
"which is close to the maximum recommended size of {max_rec_mb} "
|
||||||
"MB. This can be very slow. See the documentation for possible work-arounds: "
|
"MB. This can be slow. See the documentation for possible work-arounds: "
|
||||||
|
"https://github.com/ml-explore/mlx-examples/tree/main/llms#large-models"
|
||||||
)
|
)
|
||||||
old_limit = mx.metal.set_wired_limit(max_rec_size)
|
old_limit = mx.metal.set_wired_limit(max_rec_size)
|
||||||
try:
|
try:
|
||||||
yield None
|
yield None
|
||||||
finally:
|
finally:
|
||||||
mx.synchronize()
|
if streams is not None:
|
||||||
|
for s in streams:
|
||||||
|
mx.synchronize(s)
|
||||||
|
else:
|
||||||
|
mx.synchronize()
|
||||||
mx.metal.set_wired_limit(old_limit)
|
mx.metal.set_wired_limit(old_limit)
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user