Reduction moved to the CPU in the case of distributed training

This commit is contained in:
ivanfioravanti 2025-01-11 00:32:54 +01:00
parent 514502da22
commit ff1719afc3

View File

@ -160,7 +160,8 @@ def evaluate(
     mx.eval(all_losses, ntokens)
     all_losses = mx.distributed.all_sum(all_losses)
-    ntokens = mx.distributed.all_sum(ntokens)
+    stream = mx.cpu if mx.distributed.init().size() > 1 else None
+    ntokens = mx.distributed.all_sum(ntokens, stream=stream)
     return (all_losses / ntokens).item()