Data parallel helper (#1407)

This commit is contained in:
Angelos Katharopoulos
2024-09-16 18:17:21 -07:00
committed by GitHub
parent 8d68a3e805
commit 914409fef9
3 changed files with 213 additions and 7 deletions

View File

@@ -4,6 +4,7 @@ import unittest
import mlx.core as mx
import mlx_tests
from mlx.nn.utils import average_gradients
class TestDistributed(mlx_tests.MLXTestCase):
@@ -110,6 +111,59 @@ class TestDistributed(mlx_tests.MLXTestCase):
self.assertTrue(mx.all(x == (1024 if pairs.rank() == 0 else 512)))
def test_average_gradients(self):
    """Check how ``average_gradients`` batches its ``all_sum`` calls.

    ``mx.distributed.all_sum`` is temporarily swapped for a counting
    wrapper so the test can assert how many reductions are issued for
    several ``all_reduce_size`` settings and, when ``communication_type``
    is passed, which dtype reaches the reduction. The original function is
    restored in the ``finally`` clause regardless of the outcome.
    """
    real_all_sum = mx.distributed.all_sum
    call_count = 0
    # When not None, every array handed to all_sum must have this dtype.
    wire_dtype = None

    def counting_all_sum(x, **kwargs):
        # Count the invocation, optionally check the dtype, then delegate.
        nonlocal call_count
        call_count += 1
        if wire_dtype is not None:
            self.assertEqual(wire_dtype, x.dtype)
        return real_all_sum(x, **kwargs)

    def verify(averaged, expected_calls, result_dtype=None):
        # Shared checks for one scenario; resets the counter afterwards so
        # the next scenario starts from zero.
        nonlocal call_count
        mx.eval(averaged)
        self.assertEqual(len(averaged), 10)
        if result_dtype is not None:
            self.assertTrue(all(g.dtype == result_dtype for g in averaged))
        self.assertTrue(all(mx.all(g == 1) for g in averaged))
        self.assertEqual(call_count, expected_calls)
        call_count = 0

    mx.distributed.all_sum = counting_all_sum
    try:
        grads = [mx.ones(10) for _ in range(10)]

        # Default all_reduce_size: a single all_sum call is expected.
        verify(average_gradients(grads), expected_calls=1)

        # NOTE(review): 4 * 50 presumably means 50 float32 elements in
        # bytes -- confirm against average_gradients.
        verify(
            average_gradients(grads, all_reduce_size=4 * 50),
            expected_calls=2,
        )

        # all_reduce_size=0: one all_sum call per gradient is expected.
        verify(
            average_gradients(grads, all_reduce_size=0),
            expected_calls=10,
        )

        # With a float16 communication type the reduction must see float16
        # inputs while the returned gradients are still float32.
        wire_dtype = mx.float16
        verify(
            average_gradients(
                grads, all_reduce_size=2 * 50, communication_type=mx.float16
            ),
            expected_calls=2,
            result_dtype=mx.float32,
        )
    finally:
        mx.distributed.all_sum = real_all_sum
if __name__ == "__main__":
unittest.main()