Faster synchronization Fence primitive (#1773)

* try faster synchronization move event fixes update bench fix fix
* non-functioning kernel
* try alternative fence
* cleanup barrier
* get rid of event_fence
* update benchmarks
* doc string in metal fence
2025-10-17 14:58:13 +08:00 · 2025-01-17 18:42:19 -08:00
parent 0c259961ac
commit a4667da1eb
11 changed files with 362 additions and 31 deletions
--- a/benchmarks/python/synchronize_bench.py
+++ b/benchmarks/python/synchronize_bench.py
@@ -0,0 +1,55 @@
+import time
+
+import mlx.core as mx
+
+rank = mx.distributed.init().rank()  # only rank 0 prints the benchmark results
+
+
+def timeit(fn, a):
+    """Return the average time of one ``mx.eval(fn(a))`` call in milliseconds."""
+    # Warmup evaluations, excluded from the measurement.
+    for _ in range(5):
+        mx.eval(fn(a))
+
+    timed_runs = 10
+    start = time.perf_counter()
+    for _ in range(timed_runs):
+        mx.eval(fn(a))
+    elapsed_s = time.perf_counter() - start
+    # perf_counter reports seconds; scale to milliseconds per timed run.
+    return 1000 * elapsed_s / timed_runs
+
+
+def all_reduce_benchmark():
+    """Time chained ``mx.distributed.all_sum`` calls; rank 0 prints ms per call."""
+    chain_length = 100
+
+    def fn(x):
+        # Each all_sum consumes the previous step's output, so the calls chain.
+        for _ in range(chain_length):
+            reduced = mx.distributed.all_sum(x)
+            x = reduced - 1
+        return x
+
+    ms = timeit(fn, mx.ones((5, 5), mx.int32)) / chain_length
+    if rank == 0:
+        print(f"All Reduce: time per iteration {ms:.6f} (ms)")
+
+
+def all_gather_benchmark():
+    """Time chained ``mx.distributed.all_gather`` calls; rank 0 prints ms per call."""
+    chain_length = 100
+
+    def fn(x):
+        for _ in range(chain_length):
+            x = mx.distributed.all_gather(x)[0]
+        return x
+
+    ms = timeit(fn, mx.ones((5, 5), mx.int32)) / chain_length
+    if rank == 0:
+        print(f"All gather: time per iteration {ms:.6f} (ms)")
+
+
+if __name__ == "__main__":
+    for benchmark in (all_reduce_benchmark, all_gather_benchmark):
+        benchmark()