add option to wire model

2025-08-30 02:53:41 +08:00 · 2024-10-21 13:35:10 -07:00 · 2024-10-21 13:35:10 -07:00 · 645423946a
commit 645423946a
parent 743763bc2e
1 changed file with 16 additions and 0 deletions
--- a/llms/mlx_lm/generate.py
+++ b/llms/mlx_lm/generate.py
@@ -107,6 +107,14 @@ def setup_arg_parser():
        default=None,
        help="A file containing saved KV caches to avoid recomputing them",
    )
    parser.add_argument(
        "--wire-model",
        "-w",
        action="store_true",
        help=("Keep the model resident in memory. This can substantially "
         "speedup generation for models large relative to the machine's RAM.")
    )
    return parser
@@ -216,6 +224,14 @@ def main():
        raise ValueError("Cannot use --colorize with --verbose=False")
    formatter = colorprint_by_t0 if args.colorize else None
    if args.wire_model:
        wired_bytes = mx.metal.wire(model)
        if wired_bytes >= mx.metal.device_info()["max_recommended_working_set_size"]:
            raise ValueError(
                "Cannot wire a model larger than the available RAM. You may "
                "be able to increase the available RAM by setting "
                "`sudo sysctl iogpu.wired_limit_mb=N` to a larger value")
    response = generate(
        model,
        tokenizer,