Comms (#1097)
* Start the communications branch using MPI
* Add ops and primitives
* Add python bindings for distributed
This commit is contained in:
parent 0189ab6ab6
commit 50dfb664db
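In short: this commit adds an MPI-backed mlx.core.distributed namespace with a process Group abstraction, all_reduce_sum and all_gather ops, and Python bindings. As a hedged end-to-end sketch of the new Python surface (names taken from the bindings and tests below; launch the script under mpirun the way the CI invocation below does):

import mlx.core as mx

world = mx.distributed.init()          # global group of all launched processes
print(world.rank(), "/", world.size())

x = mx.ones((10,))
y = mx.distributed.all_reduce_sum(x)   # elementwise sum across all ranks
z = mx.distributed.all_gather(x)       # concatenation along the first axis
print(y, z.shape)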
@ -71,6 +71,7 @@ jobs:
name: Install dependencies
command: |
brew install python@3.8
brew install openmpi
python3.8 -m venv env
source env/bin/activate
pip install --upgrade pip
@ -96,6 +97,7 @@ jobs:
source env/bin/activate
LOW_MEMORY=1 DEVICE=cpu python -m xmlrunner discover -v python/tests -o test-results/cpu
LOW_MEMORY=1 DEVICE=gpu METAL_DEVICE_WRAPPER_TYPE=1 METAL_DEBUG_ERROR_MODE=0 python -m xmlrunner discover -v python/tests -o test-results/gpu
mpirun -host localhost:8 -np 8 -x DYLD_LIBRARY_PATH=/opt/homebrew/lib/ python python/tests/mpi_test_distributed.py
- run:
name: Build example extension
command: |
@ -167,6 +167,11 @@ else()
set(MLX_BUILD_ACCELERATE OFF)
endif()

find_package(MPI)
if (MPI_FOUND)
  target_include_directories(mlx PRIVATE ${MPI_INCLUDE_PATH})
endif()

add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/mlx)

target_include_directories(
@ -9,3 +9,4 @@ build_example(tutorial.cpp)
build_example(linear_regression.cpp)
build_example(logistic_regression.cpp)
build_example(metal_capture.cpp)
build_example(distributed.cpp)
22
examples/cpp/distributed.cpp
Normal file
@ -0,0 +1,22 @@
// Copyright © 2024 Apple Inc.

#include <iostream>

#include "mlx/mlx.h"

using namespace mlx::core;

int main() {
  if (!distributed::is_available()) {
    std::cout << "No communication backend found" << std::endl;
    return 1;
  }

  auto global_group = distributed::init();
  std::cout << global_group.rank() << " / " << global_group.size() << std::endl;

  array x = ones({10});
  array out = distributed::all_reduce_sum(x, global_group);

  std::cout << out << std::endl;
}
@ -25,6 +25,7 @@ else()
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backend/no_cpu)
endif()

add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/distributed)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/io)
if (MLX_BUILD_ACCELERATE)
  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backend/accelerate)
16
mlx/distributed/CMakeLists.txt
Normal file
@ -0,0 +1,16 @@
target_sources(
  mlx
  PRIVATE
  ${CMAKE_CURRENT_SOURCE_DIR}/primitives.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/ops.cpp
)

if (MPI_FOUND AND MLX_BUILD_CPU)
  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/mpi)
else()
  target_sources(
    mlx
    PRIVATE
    ${CMAKE_CURRENT_SOURCE_DIR}/no_distributed.cpp
  )
endif()
62
mlx/distributed/distributed.h
Normal file
@ -0,0 +1,62 @@
// Copyright © 2024 Apple Inc.

#pragma once

#include <memory>

#include "mlx/array.h"

namespace mlx::core::distributed {

/* Check if a communication backend is available */
bool is_available();

/**
 * A distributed::Group represents a group of independent mlx processes that
 * can communicate. We must also be able to create sub-groups from a group in
 * order to define more granular communication.
 */
struct Group {
  Group(std::shared_ptr<void> group) : group_(group) {}

  int rank();
  int size();

  /**
   * Split the group according to the provided color. Namely, processes that
   * use the same color will go to the same group.
   *
   * The key defines the rank of the processes in the new group. The smaller
   * the key the smaller the rank. If the provided key is negative, then the
   * rank in the current group is used.
   */
  Group split(int color, int key = -1);

  const std::shared_ptr<void>& raw_group() {
    return group_;
  }

 private:
  std::shared_ptr<void> group_{nullptr};
};

/**
 * Initialize the distributed backend and return the group containing all
 * discoverable processes.
 */
Group init();

namespace detail {

/* Return the communication stream. */
Stream communication_stream();

/* Perform an all reduce sum operation */
void all_reduce_sum(Group group, const array& input, array& output);

/* Perform an all gather operation */
void all_gather(Group group, const array& input, array& output);

} // namespace detail

} // namespace mlx::core::distributed
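The color/key semantics of Group::split are easiest to see concretely. A small sketch through the Python bindings added later in this diff, assuming 8 launched processes (the same setup the tests below use):

import mlx.core as mx

world = mx.distributed.init()
# Ranks with equal color land in the same subgroup:
row = world.split(world.rank() // 2)  # colors 0 0 1 1 2 2 3 3 -> four groups of 2
col = world.split(world.rank() % 2)   # colors 0 1 0 1 0 1 0 1 -> two groups of 4
# key < 0 (the default) keeps each process's relative order from the old group.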
5
mlx/distributed/mpi/CMakeLists.txt
Normal file
@ -0,0 +1,5 @@
target_sources(
  mlx
  PRIVATE
  ${CMAKE_CURRENT_SOURCE_DIR}/mpi.cpp
)
283
mlx/distributed/mpi/mpi.cpp
Normal file
@ -0,0 +1,283 @@
// Copyright © 2024 Apple Inc.

#include <dlfcn.h>
#include <mpi.h>

#include "mlx/backend/common/copy.h"
#include "mlx/distributed/distributed.h"
#include "mlx/scheduler.h"

#define LOAD_SYMBOL(symbol, variable)                              \
  {                                                                \
    variable = (decltype(variable))dlsym(libmpi_handle_, #symbol); \
    char* error = dlerror();                                       \
    if (error != nullptr) {                                        \
      libmpi_handle_ = nullptr;                                    \
      return;                                                      \
    }                                                              \
  }

namespace mlx::core::distributed {

namespace {

array ensure_row_contiguous(const array& arr) {
  if (arr.flags().row_contiguous) {
    return arr;
  } else {
    array arr_copy(arr.shape(), arr.dtype(), nullptr, {});
    copy(arr, arr_copy, CopyType::General);
    return arr_copy;
  }
}

struct MPIWrapper {
  MPIWrapper() {
    libmpi_handle_ = dlopen("libmpi.dylib", RTLD_NOW | RTLD_GLOBAL);
    if (libmpi_handle_ == nullptr) {
      return;
    }

    // API
    LOAD_SYMBOL(MPI_Init, init);
    LOAD_SYMBOL(MPI_Finalize, finalize);
    LOAD_SYMBOL(MPI_Comm_rank, rank);
    LOAD_SYMBOL(MPI_Comm_size, size);
    LOAD_SYMBOL(MPI_Comm_split, comm_split);
    LOAD_SYMBOL(MPI_Comm_free, comm_free);
    LOAD_SYMBOL(MPI_Allreduce, all_reduce);
    LOAD_SYMBOL(MPI_Allgather, all_gather);

    // Objects
    LOAD_SYMBOL(ompi_mpi_comm_world, comm_world_);

    // Ops
    LOAD_SYMBOL(ompi_mpi_op_sum, op_sum_);

    // Datatypes
    LOAD_SYMBOL(ompi_mpi_c_bool, mpi_bool_);
    LOAD_SYMBOL(ompi_mpi_int8_t, mpi_int8_);
    LOAD_SYMBOL(ompi_mpi_uint8_t, mpi_uint8_);
    LOAD_SYMBOL(ompi_mpi_int16_t, mpi_int16_);
    LOAD_SYMBOL(ompi_mpi_uint16_t, mpi_uint16_);
    LOAD_SYMBOL(ompi_mpi_int32_t, mpi_int32_);
    LOAD_SYMBOL(ompi_mpi_uint32_t, mpi_uint32_);
    LOAD_SYMBOL(ompi_mpi_int64_t, mpi_int64_);
    LOAD_SYMBOL(ompi_mpi_uint64_t, mpi_uint64_);
    LOAD_SYMBOL(ompi_mpi_float, mpi_float_);
    LOAD_SYMBOL(ompi_mpi_c_complex, mpi_complex_);
  }

  bool is_available() {
    return libmpi_handle_ != nullptr;
  }

  bool init_safe() {
    if (!is_available()) {
      return false;
    }
    return init(nullptr, nullptr) == MPI_SUCCESS;
  }

  void finalize_safe() {
    if (is_available()) {
      finalize();
    }
  }

  MPI_Comm world() {
    return comm_world_;
  }

  MPI_Datatype datatype(const array& arr) {
    switch (arr.dtype()) {
      case bool_:
        return mpi_bool_;
      case int8:
        return mpi_int8_;
      case uint8:
        return mpi_uint8_;
      case int16:
        return mpi_int16_;
      case uint16:
        return mpi_uint16_;
      case int32:
        return mpi_int32_;
      case uint32:
        return mpi_uint32_;
      case int64:
        return mpi_int64_;
      case uint64:
        return mpi_uint64_;
      case float32:
        return mpi_float_;
      case complex64:
        return mpi_complex_;
      case float16:
      case bfloat16:
        throw std::runtime_error("MPI doesn't support 16-bit floats");
    }
  }

  MPI_Op op_sum() {
    return op_sum_;
  }

  void* libmpi_handle_;

  // API
  int (*init)(int*, char***);
  int (*finalize)();
  int (*rank)(MPI_Comm, int*);
  int (*size)(MPI_Comm, int*);
  int (*all_reduce)(const void*, void*, int, MPI_Datatype, MPI_Op, MPI_Comm);
  int (*all_gather)(
      const void*,
      int,
      MPI_Datatype,
      void*,
      int,
      MPI_Datatype,
      MPI_Comm);
  int (*comm_split)(MPI_Comm, int, int, MPI_Comm*);
  int (*comm_free)(MPI_Comm*);

  // Objects
  MPI_Comm comm_world_;

  // Ops
  MPI_Op op_sum_;

  // Datatypes
  MPI_Datatype mpi_bool_;
  MPI_Datatype mpi_int8_;
  MPI_Datatype mpi_uint8_;
  MPI_Datatype mpi_int16_;
  MPI_Datatype mpi_uint16_;
  MPI_Datatype mpi_int32_;
  MPI_Datatype mpi_uint32_;
  MPI_Datatype mpi_int64_;
  MPI_Datatype mpi_uint64_;
  MPI_Datatype mpi_float_;
  MPI_Datatype mpi_complex_;
};

MPIWrapper& mpi() {
  static MPIWrapper wrapper;
  return wrapper;
}

struct MPIGroupImpl {
  MPIGroupImpl(MPI_Comm comm, bool global)
      : comm_(comm), global_(global), rank_(-1), size_(-1) {}
  ~MPIGroupImpl() {
    if (global_) {
      mpi().finalize_safe();
    } else {
      mpi().comm_free(&comm_);
    }
  }

  MPI_Comm comm() {
    return comm_;
  }

  int rank() {
    if (rank_ < 0) {
      mpi().rank(comm_, &rank_);
    }
    return rank_;
  }

  int size() {
    if (size_ < 0) {
      mpi().size(comm_, &size_);
    }
    return size_;
  }

 private:
  MPI_Comm comm_;
  bool global_;
  int rank_;
  int size_;
};

MPI_Comm to_comm(Group& group) {
  return std::static_pointer_cast<MPIGroupImpl>(group.raw_group())->comm();
}

} // namespace

int Group::rank() {
  return std::static_pointer_cast<MPIGroupImpl>(group_)->rank();
}

int Group::size() {
  return std::static_pointer_cast<MPIGroupImpl>(group_)->size();
}

Group Group::split(int color, int key) {
  auto mpi_group = std::static_pointer_cast<MPIGroupImpl>(group_);

  key = (key < 0) ? rank() : key;

  MPI_Comm new_comm;
  int result = mpi().comm_split(mpi_group->comm(), color, key, &new_comm);
  if (result != MPI_SUCCESS) {
    throw std::runtime_error("MPI could not split this group");
  }

  return Group(std::make_shared<MPIGroupImpl>(new_comm, false));
}

bool is_available() {
  return mpi().is_available();
}

Group init() {
  static std::shared_ptr<MPIGroupImpl> global_group = nullptr;

  if (global_group == nullptr) {
    if (!mpi().init_safe()) {
      throw std::runtime_error("Cannot initialize MPI");
    }
    global_group = std::make_shared<MPIGroupImpl>(mpi().world(), true);
  }

  return Group(global_group);
}

namespace detail {

Stream communication_stream() {
  static Stream comm_stream = new_stream(Device::cpu);
  return comm_stream;
}

void all_reduce_sum(Group group, const array& input_, array& output) {
  array input = ensure_row_contiguous(input_);
  mpi().all_reduce(
      input.data<void>(),
      output.data<void>(),
      input.size(),
      mpi().datatype(input),
      mpi().op_sum(),
      to_comm(group));
}

void all_gather(Group group, const array& input_, array& output) {
  array input = ensure_row_contiguous(input_);
  mpi().all_gather(
      input.data<void>(),
      input.size(),
      mpi().datatype(input),
      output.data<void>(),
      input.size(),
      mpi().datatype(output),
      to_comm(group));
}

} // namespace detail

} // namespace mlx::core::distributed
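Design note: the wrapper dlopens libmpi.dylib and pulls every symbol with dlsym instead of linking MPI at build time, so an MLX build with MPI support still loads cleanly on machines without Open MPI; the backend just reports itself unavailable. A quick check of that behavior through the Python bindings (a sketch):

import mlx.core as mx

# False when libmpi.dylib could not be dlopen'd; no import-time failure.
print(mx.distributed.is_available())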
39
mlx/distributed/no_distributed.cpp
Normal file
@ -0,0 +1,39 @@
// Copyright © 2024 Apple Inc.

#include "mlx/distributed/distributed.h"

namespace mlx::core::distributed {

int Group::rank() {
  return 0;
}

int Group::size() {
  return 1;
}

Group Group::split(int color, int key) {
  throw std::runtime_error("Cannot split the distributed group further");
}

bool is_available() {
  return false;
}

Group init() {
  return Group(nullptr);
}

namespace detail {

Stream communication_stream() {
  static Stream comm_stream = new_stream(Device::cpu);
  return comm_stream;
}

void all_reduce_sum(Group group, const array& input, array& output) {}
void all_gather(Group group, const array& input, array& output) {}

} // namespace detail

} // namespace mlx::core::distributed
54
mlx/distributed/ops.cpp
Normal file
@ -0,0 +1,54 @@
// Copyright © 2024 Apple Inc.

#include "mlx/distributed/ops.h"
#include "mlx/distributed/primitives.h"

namespace mlx::core::distributed {

namespace {

Group to_group(std::optional<Group> group) {
  if (group.has_value()) {
    return group.value();
  } else {
    return distributed::init();
  }
}

} // namespace

array all_reduce_sum(const array& x, std::optional<Group> group_) {
  auto group = to_group(group_);

  if (group.size() == 1) {
    return x;
  }

  return array(
      x.shape(),
      x.dtype(),
      std::make_shared<AllReduce>(group, AllReduce::Sum),
      {x});
}

array all_gather(const array& x, std::optional<Group> group_) {
  auto group = to_group(group_);

  if (group.size() == 1) {
    return x;
  }

  auto result_shape = x.shape();
  if (result_shape.size() == 0) {
    result_shape.push_back(group.size());
  } else {
    result_shape[0] *= group.size();
  }
  return array(
      std::move(result_shape),
      x.dtype(),
      std::make_shared<AllGather>(group),
      {x});
}

} // namespace mlx::core::distributed
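The all_gather shape bookkeeping above is worth spelling out: the first axis is multiplied by the group size, and a scalar input grows a new first axis of length group.size(). A sketch assuming a launch with 4 ranks:

import mlx.core as mx

world = mx.distributed.init()      # assume 4 ranks for this illustration
x = mx.ones((2, 3))
y = mx.distributed.all_gather(x)   # (2, 3) -> (8, 3)
assert y.shape == (2 * world.size(), 3)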
14
mlx/distributed/ops.h
Normal file
@ -0,0 +1,14 @@
// Copyright © 2024 Apple Inc.

#pragma once

#include <optional>

#include "mlx/distributed/distributed.h"

namespace mlx::core::distributed {

array all_reduce_sum(const array& x, std::optional<Group> group = std::nullopt);
array all_gather(const array& x, std::optional<Group> group = std::nullopt);

} // namespace mlx::core::distributed
98
mlx/distributed/primitives.cpp
Normal file
@ -0,0 +1,98 @@
// Copyright © 2024 Apple Inc.

#include <cassert>

#include "mlx/allocator.h"
#include "mlx/backend/common/copy.h"
#include "mlx/distributed/ops.h"
#include "mlx/distributed/primitives.h"
#include "mlx/ops.h"

namespace mlx::core::distributed {

void AllReduce::eval_cpu(
    const std::vector<array>& inputs,
    std::vector<array>& outputs) {
  assert(inputs.size() == 1);
  assert(outputs.size() == 1);

  outputs[0].set_data(allocator::malloc_or_wait(outputs[0].nbytes()));

  switch (reduce_type_) {
    case Sum:
      distributed::detail::all_reduce_sum(group(), inputs[0], outputs[0]);
      break;
    default:
      throw std::runtime_error("Only all reduce sum is supported for now");
  }
}

std::pair<std::vector<array>, std::vector<int>> AllReduce::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  switch (reduce_type_) {
    case Sum:
      return {{all_reduce_sum(inputs[0], group())}, axes};
    default:
      throw std::runtime_error("Only all reduce sum is supported for now");
  }
}

std::vector<array> AllReduce::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  switch (reduce_type_) {
    case Sum:
      return {all_reduce_sum(tangents[0], group())};
    default:
      throw std::runtime_error("Only all reduce sum is supported for now");
  }
}

std::vector<array> AllReduce::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>& outputs) {
  return cotangents;
}

void AllGather::eval_cpu(
    const std::vector<array>& inputs,
    std::vector<array>& outputs) {
  assert(inputs.size() == 1);
  assert(outputs.size() == 1);

  outputs[0].set_data(allocator::malloc_or_wait(outputs[0].nbytes()));

  distributed::detail::all_gather(group(), inputs[0], outputs[0]);
}

std::pair<std::vector<array>, std::vector<int>> AllGather::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  return {{all_gather(inputs[0], group())}, axes};
}

std::vector<array> AllGather::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  return {all_gather(tangents[0], group())};
}

std::vector<array> AllGather::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>& outputs) {
  auto g = group();
  std::vector<int> starts(primals[0].ndim(), 0);
  auto stops = primals[0].shape();
  starts[0] = g.rank() * stops[0];
  stops[0] += starts[0];
  return {slice(cotangents[0], starts, stops)};
}

} // namespace mlx::core::distributed
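AllGather::vjp above says: the gradient that flows back to each rank is just its own slice of the gathered cotangent. Written out in Python (a sketch; all_gather_vjp and local_rows are illustrative names, not part of the API):

def all_gather_vjp(cotangent, rank, local_rows):
    # Rank r contributed rows [r * local_rows, (r + 1) * local_rows) of the
    # gathered output, so it keeps exactly those rows of the cotangent.
    start = rank * local_rows
    return cotangent[start : start + local_rows]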
100
mlx/distributed/primitives.h
Normal file
@ -0,0 +1,100 @@
// Copyright © 2024 Apple Inc.

#pragma once

#include "mlx/distributed/distributed.h"
#include "mlx/primitives.h"

namespace mlx::core::distributed {

class DistPrimitive : public Primitive {
 public:
  DistPrimitive(Group group)
      : Primitive(detail::communication_stream()), group_(group) {}

  void eval_gpu(const std::vector<array>& inputs, std::vector<array>& outputs)
      override {
    throw std::runtime_error(
        "Communication primitives cannot be run on the GPU");
  }

  const Group& group() const {
    return group_;
  }

 private:
  Group group_;
};

class AllReduce : public DistPrimitive {
 public:
  enum ReduceType { And, Or, Sum, Prod, Min, Max };

  AllReduce(Group group, ReduceType reduce_type)
      : DistPrimitive(group), reduce_type_(reduce_type) {}

  void eval_cpu(const std::vector<array>& inputs, std::vector<array>& outputs)
      override;
  std::pair<std::vector<array>, std::vector<int>> vmap(
      const std::vector<array>& inputs,
      const std::vector<int>& axes) override;
  std::vector<array> jvp(
      const std::vector<array>& primals,
      const std::vector<array>& tangents,
      const std::vector<int>& argnums) override;
  std::vector<array> vjp(
      const std::vector<array>& primals,
      const std::vector<array>& cotangents,
      const std::vector<int>& argnums,
      const std::vector<array>& outputs) override;

  void print(std::ostream& os) override {
    switch (reduce_type_) {
      case And:
        os << "And";
        break;
      case Or:
        os << "Or";
        break;
      case Sum:
        os << "Sum";
        break;
      case Prod:
        os << "Prod";
        break;
      case Min:
        os << "Min";
        break;
      case Max:
        os << "Max";
        break;
    }
    os << " AllReduce";
  }

 private:
  ReduceType reduce_type_;
};

class AllGather : public DistPrimitive {
 public:
  AllGather(Group group) : DistPrimitive(group) {}

  void eval_cpu(const std::vector<array>& inputs, std::vector<array>& outputs)
      override;
  std::pair<std::vector<array>, std::vector<int>> vmap(
      const std::vector<array>& inputs,
      const std::vector<int>& axes) override;
  std::vector<array> jvp(
      const std::vector<array>& primals,
      const std::vector<array>& tangents,
      const std::vector<int>& argnums) override;
  std::vector<array> vjp(
      const std::vector<array>& primals,
      const std::vector<array>& cotangents,
      const std::vector<int>& argnums,
      const std::vector<array>& outputs) override;

  DEFINE_PRINT(AllGather);
};

} // namespace mlx::core::distributed
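Note the design choice here: every communication primitive is constructed on detail::communication_stream(), a dedicated CPU stream, and eval_gpu always throws. So, assuming the cross-stream scheduling behaves as described above, distributed ops should remain usable even when the default device is the GPU (a sketch, not verified in this diff):

import mlx.core as mx

mx.set_default_device(mx.gpu)  # communication still runs on the CPU comm stream
world = mx.distributed.init()
y = mx.distributed.all_reduce_sum(mx.ones((4,)))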
@ -6,6 +6,8 @@
#include "mlx/backend/metal/metal.h"
#include "mlx/compile.h"
#include "mlx/device.h"
#include "mlx/distributed/distributed.h"
#include "mlx/distributed/ops.h"
#include "mlx/fast.h"
#include "mlx/fft.h"
#include "mlx/io.h"
@ -6,6 +6,7 @@ nanobind_add_module(
${CMAKE_CURRENT_SOURCE_DIR}/array.cpp
${CMAKE_CURRENT_SOURCE_DIR}/convert.cpp
${CMAKE_CURRENT_SOURCE_DIR}/device.cpp
${CMAKE_CURRENT_SOURCE_DIR}/distributed.cpp
${CMAKE_CURRENT_SOURCE_DIR}/fast.cpp
${CMAKE_CURRENT_SOURCE_DIR}/fft.cpp
${CMAKE_CURRENT_SOURCE_DIR}/indexing.cpp
107
python/src/distributed.cpp
Normal file
@ -0,0 +1,107 @@
// Copyright © 2024 Apple Inc.

#include <nanobind/nanobind.h>
#include <nanobind/stl/optional.h>
#include <nanobind/stl/shared_ptr.h>

#include "mlx/distributed/distributed.h"
#include "mlx/distributed/ops.h"

namespace nb = nanobind;
using namespace nb::literals;
using namespace mlx::core;

void init_distributed(nb::module_& parent_module) {
  auto m = parent_module.def_submodule(
      "distributed", "mlx.core.distributed: Communication operations");

  nb::class_<distributed::Group>(
      m,
      "Group",
      R"pbdoc(
      An :class:`mlx.core.distributed.Group` represents a group of independent mlx
      processes that can communicate.
      )pbdoc")
      .def("rank", &distributed::Group::rank, "Get the rank of this process")
      .def("size", &distributed::Group::size, "Get the size of the group")
      .def(
          "split",
          &distributed::Group::split,
          "color"_a,
          "key"_a = -1,
          nb::sig("def split(self, color: int, key: int = -1) -> Group"),
          R"pbdoc(
          Split the group into subgroups based on the provided color.

          Processes that use the same color go to the same group. The ``key``
          argument defines the rank in the new group. The smaller the key the
          smaller the rank. If the key is negative then the rank in the
          current group is used.

          Args:
            color (int): A value to group processes into subgroups.
            key (int, optional): A key to optionally change the rank ordering
              of the processes.
          )pbdoc");

  m.def(
      "is_available",
      &distributed::is_available,
      R"pbdoc(
      Check if a communication backend is available.
      )pbdoc");

  m.def(
      "init",
      &distributed::init,
      R"pbdoc(
      Initialize the communication backend and create the global communication group.
      )pbdoc");

  m.def(
      "all_reduce_sum",
      &distributed::all_reduce_sum,
      "x"_a,
      nb::kw_only(),
      "group"_a = nb::none(),
      nb::sig(
          "def all_reduce_sum(x: array, *, group: Optional[Group] = None) -> array"),
      R"pbdoc(
      All reduce sum.

      Sum the ``x`` arrays from all processes in the group.

      Args:
        x (array): Input array.
        group (Group): The group of processes that will participate in the
          reduction. If set to ``None`` the global group is used. Default:
          ``None``.

      Returns:
        array: The sum of all ``x`` arrays.
      )pbdoc");

  m.def(
      "all_gather",
      &distributed::all_gather,
      "x"_a,
      nb::kw_only(),
      "group"_a = nb::none(),
      nb::sig(
          "def all_gather(x: array, *, group: Optional[Group] = None) -> array"),
      R"pbdoc(
      Gather arrays from all processes.

      Gather the ``x`` arrays from all processes in the group and concatenate
      them along the first axis. The arrays should all have the same shape.

      Args:
        x (array): Input array.
        group (Group): The group of processes that will participate in the
          gather. If set to ``None`` the global group is used. Default:
          ``None``.

      Returns:
        array: The concatenation of all ``x`` arrays.
      )pbdoc");
}
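Putting the bindings together with split for subgroup communication, mirroring the tests below (a sketch, assuming 8 processes):

import mlx.core as mx

world = mx.distributed.init()
sub = world.split(world.rank() % 2)  # even ranks in one group, odd in the other
y = mx.distributed.all_reduce_sum(mx.ones((2, 2)), group=sub)
# every entry of y equals sub.size(), here 4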
@ -1,4 +1,4 @@
// Conbright © 2023-2024 Apple Inc.
// Copyright © 2023-2024 Apple Inc.

#include <nanobind/nanobind.h>

@ -18,6 +18,7 @@ void init_fft(nb::module_&);
void init_linalg(nb::module_&);
void init_constants(nb::module_&);
void init_fast(nb::module_&);
void init_distributed(nb::module_&);

NB_MODULE(core, m) {
  m.doc() = "mlx: A framework for machine learning on Apple silicon.";
@ -37,6 +38,7 @@ NB_MODULE(core, m) {
  init_linalg(m);
  init_constants(m);
  init_fast(m);
  init_distributed(m);

  m.attr("__version__") = TOSTRING(_VERSION_);
}
98
python/tests/mpi_test_distributed.py
Normal file
@ -0,0 +1,98 @@
# Copyright © 2024 Apple Inc.

import unittest

import mlx.core as mx
import mlx_tests


class TestDistributed(mlx_tests.MLXTestCase):
    def test_groups(self):
        world = mx.distributed.init()
        self.assertEqual(world.size(), 8)
        self.assertTrue(0 <= world.rank() < 8)

        world2 = mx.distributed.init()
        self.assertEqual(world.size(), world2.size())
        self.assertEqual(world.rank(), world2.rank())

        sub = world.split(world.rank() % 2)
        self.assertEqual(sub.size(), 4)
        self.assertEqual(sub.rank(), world.rank() // 2)

        sub = world.split(world.rank() // 2)
        self.assertEqual(sub.size(), 2)

    def test_all_reduce(self):
        world = mx.distributed.init()
        dtypes = [
            mx.int8,
            mx.uint8,
            mx.int16,
            mx.uint16,
            mx.int32,
            mx.uint32,
            mx.float32,
            mx.complex64,
        ]
        for dt in dtypes:
            x = mx.ones((2, 2, 4), dtype=dt)
            y = mx.distributed.all_reduce_sum(x)
            self.assertTrue(mx.all(y == world.size()))

        sub = world.split(world.rank() % 2)
        for dt in dtypes:
            x = mx.ones((2, 2, 4), dtype=dt)
            y = mx.distributed.all_reduce_sum(x, group=sub)
            self.assertTrue(mx.all(y == sub.size()))

    def test_all_gather(self):
        world = mx.distributed.init()
        dtypes = [
            mx.int8,
            mx.uint8,
            mx.int16,
            mx.uint16,
            mx.int32,
            mx.uint32,
            mx.float32,
            mx.complex64,
        ]
        for dt in dtypes:
            x = mx.ones((2, 2, 4), dtype=dt)
            y = mx.distributed.all_gather(x)
            self.assertEqual(y.shape, (world.size() * 2, 2, 4))
            self.assertTrue(mx.all(y == 1))

        sub = world.split(world.rank() % 2)
        for dt in dtypes:
            x = mx.ones((2, 2, 4), dtype=dt)
            y = mx.distributed.all_gather(x, group=sub)
            self.assertEqual(y.shape, (sub.size() * 2, 2, 4))
            self.assertTrue(mx.all(y == 1))

    def test_mixed(self):
        # Make the following groups:
        # - world: 0 1 2 3 4 5 6 7
        # - sub_1: 0 1 0 1 0 1 0 1
        # - sub_2: 0 0 1 1 2 2 3 3
        #
        # The corresponding colors to make them are
        # - world: N/A
        # - sub_1: 0 0 1 1 2 2 3 3
        # - sub_2: 0 1 0 1 0 1 0 1

        world = mx.distributed.init()
        sub_1 = world.split(world.rank() // 2)
        sub_2 = world.split(world.rank() % 2)

        x = mx.ones((1, 8)) * world.rank()
        y = mx.distributed.all_reduce_sum(x, group=sub_1)
        z = mx.distributed.all_gather(y, group=sub_2)
        z_target = mx.arange(8).reshape(4, 2).sum(-1, keepdims=True)

        self.assertTrue(mx.all(z == z_target))


if __name__ == "__main__":
    unittest.main()