Change the name to a fun pun

Add headers for gcc
Expose per-backend availability in C++ and python
2025-12-16 01:49:05 +08:00 · 2025-11-20 17:48:23 -08:00 · 2025-11-20 17:31:02 -08:00 · 2025-11-20 15:26:59 -08:00 · 2025-11-20 12:52:35 -08:00 · 2025-11-20 12:36:19 -08:00
19 changed files with 1603 additions and 224 deletions
--- a/.github/actions/build-macos-release/action.yml
+++ b/.github/actions/build-macos-release/action.yml
@@ -17,6 +17,8 @@ runs:
  steps:
    - name: Build Python package
      shell: bash -l {0}
+      env:
+        MACOSX_DEPLOYMENT_TARGET: ${{ inputs.macos-target }}
      run: |
        pip install build
        python setup.py clean --all
@@ -25,6 +27,8 @@ runs:
    - name: Build backend package
      if: ${{ inputs.build-backend }}
      shell: bash -l {0}
+      env:
+        MACOSX_DEPLOYMENT_TARGET: ${{ inputs.macos-target }}
      run: |
        python setup.py clean --all
        MLX_BUILD_STAGE=2 python -m build -w
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -23,7 +23,7 @@ jobs:

  build_documentation:
    if: github.repository == 'ml-explore/mlx'
-    runs-on: [self-hosted, macos]
+    runs-on: ubuntu-22.04
    steps:
      - uses: actions/checkout@v5
      - uses: ./.github/actions/build-docs
@@ -65,14 +65,14 @@ jobs:
        uses: actions/upload-artifact@v5
        with:
          overwrite: true
-          name: linux-wheels-${{ matrix.python_version }}
+          name: linux-wheels-${{ matrix.python_version }}-${{ matrix.arch }}
          path: wheelhouse/mlx-*.whl
      - name: Upload CPU artifacts
        if: matrix.python_version == '3.10'
        uses: actions/upload-artifact@v5
        with:
          overwrite: true
-          name: mlx-cpu
+          name: mlx-cpu-${{ matrix.arch }}
          path: wheelhouse/mlx_cpu-*.whl
  
  build_mac_release:
@@ -208,7 +208,8 @@ jobs:
    steps:
      - uses: actions/download-artifact@v6
        with:
-          name: mlx-cpu
+          pattern: mlx-cpu-*
+          merge-multiple: true
          path: dist
      - name: Display structure of downloaded files
        run: ls -R dist
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -119,6 +119,10 @@ if(MLX_BUILD_METAL)
    COMMAND zsh "-c" "/usr/bin/xcrun -sdk macosx --show-sdk-version"
    OUTPUT_VARIABLE MACOS_SDK_VERSION
    OUTPUT_STRIP_TRAILING_WHITESPACE COMMAND_ERROR_IS_FATAL ANY)
+  execute_process(
+    COMMAND zsh "-c" "/usr/bin/xcrun -sdk macosx --show-sdk-path"
+    OUTPUT_VARIABLE CMAKE_OSX_SYSROOT
+    OUTPUT_STRIP_TRAILING_WHITESPACE COMMAND_ERROR_IS_FATAL ANY)

  if(${MACOS_SDK_VERSION} LESS 14.0)
    message(
--- a/mlx/backend/cuda/allocator.cpp
+++ b/mlx/backend/cuda/allocator.cpp
@@ -92,7 +92,7 @@ CudaAllocator::CudaAllocator()
          [this](CudaBuffer* buf) { cuda_free(buf); }) {
  size_t free, total;
  CHECK_CUDA_ERROR(cudaMemGetInfo(&free, &total));
-  memory_limit_ = total * 0.95;
+  memory_limit_ = total * 0.9;
  max_pool_size_ = memory_limit_;

  int device_count = 0;
@@ -176,7 +176,7 @@ CudaAllocator::malloc_async(size_t size, int device, cudaStream_t stream) {
    buffer_cache_.release_cached_buffers(get_cache_memory() - max_pool_size_);
  }
  // Copy to managed here if the buffer is not on the right device
-  if (buf->device != device) {
+  if (buf->device >= 0 && buf->device != device) {
    copy_to_managed(*buf);
  }
  return Buffer{buf};
@@ -219,9 +219,9 @@ void CudaAllocator::cuda_free(CudaBuffer* buf) {
    scalar_pool_.free(buf);
  } else {
    if (buf->device >= 0) {
-      cudaFreeAsync(buf->data, free_streams_[buf->device]);
+      CHECK_CUDA_ERROR(cudaFreeAsync(buf->data, free_streams_[buf->device]));
    } else {
-      cudaFree(buf->data);
+      CHECK_CUDA_ERROR(cudaFree(buf->data));
    }
    delete buf;
  }
--- a/mlx/backend/cuda/random.cu
+++ b/mlx/backend/cuda/random.cu
@@ -139,10 +139,10 @@ void RandomBits::eval_gpu(const std::vector<array>& inputs, array& out) {
  // keys has shape (N1, ..., NK, 2)
  // out has shape (N1, ..., NK, M1, M2, ...)
  auto& keys = inputs[0];
-  uint32_t num_keys = keys.size() / 2;
+  size_t num_keys = keys.size() / 2;

-  uint32_t elems_per_key = out.size() / num_keys;
-  uint32_t bytes_per_key = out.itemsize() * elems_per_key;
+  size_t elems_per_key = out.size() / num_keys;
+  size_t bytes_per_key = out.itemsize() * elems_per_key;
  auto& s = stream();
  auto& encoder = cu::get_command_encoder(s);
  out.set_data(cu::malloc_async(out.nbytes(), encoder));
@@ -150,19 +150,25 @@ void RandomBits::eval_gpu(const std::vector<array>& inputs, array& out) {
    return;
  }

-  uint32_t out_per_key = (bytes_per_key + 4 - 1) / 4;
-  uint32_t half_size = out_per_key / 2;
+  size_t out_per_key = (bytes_per_key + 4 - 1) / 4;
+  size_t half_size = out_per_key / 2;
+
  bool odd = out_per_key % 2;
+  if ((half_size + odd) >= UINT32_MAX || num_keys >= UINT32_MAX) {
+    throw std::runtime_error("[RandomBits::eval_gpu] Large size unsupported");
+  }

  encoder.set_input_array(keys);
  encoder.set_output_array(out);
-  dim3 grid_dims{num_keys, half_size + odd};
-  int64_t total = grid_dims.x * grid_dims.y;
-  int32_t threads_y = 1;
-  while ((total / threads_y) >= (1U << 31)) {
+  int64_t total = num_keys * (half_size + odd);
+  uint32_t threads_y = 1;
+  while ((total / threads_y) >= UINT_MAX) {
    threads_y *= 2;
  }
-  int32_t threads_x = cuda::ceil_div(total, threads_y);
+  uint32_t threads_x = cuda::ceil_div(total, threads_y);
+
+  dim3 grid_dims{
+      static_cast<uint32_t>(num_keys), static_cast<uint32_t>(half_size + odd)};
  auto [grid, block] = get_grid_and_block(threads_x, threads_y, 1);
  auto& stream = encoder.stream();
  if (keys.flags().row_contiguous) {
--- a/mlx/distributed/CMakeLists.txt
+++ b/mlx/distributed/CMakeLists.txt
@@ -4,6 +4,11 @@ target_sources(
          ${CMAKE_CURRENT_SOURCE_DIR}/ops.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/distributed.cpp)

+if(MLX_BUILD_CPU AND NOT WIN32)
+  target_sources(mlx PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/utils.cpp)
+endif()
+
 add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/mpi)
 add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/ring)
 add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/nccl)
+add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/jaccl)
--- a/mlx/distributed/distributed.cpp
+++ b/mlx/distributed/distributed.cpp
@@ -5,6 +5,7 @@
 #include "mlx/backend/cuda/cuda.h"
 #include "mlx/distributed/distributed.h"
 #include "mlx/distributed/distributed_impl.h"
+#include "mlx/distributed/jaccl/jaccl.h"
 #include "mlx/distributed/mpi/mpi.h"
 #include "mlx/distributed/nccl/nccl.h"
 #include "mlx/distributed/ring/ring.h"
@@ -102,7 +103,27 @@ class EmptyGroup : public GroupImpl {
 } // namespace detail

 bool is_available() {
-  return mpi::is_available() || ring::is_available() || nccl::is_available();
+  return mpi::is_available() || ring::is_available() || nccl::is_available() ||
+      jaccl::is_available();
+}
+
+bool is_available(const std::string& bk) {
+  if (bk == "any") {
+    return is_available();
+  }
+  if (bk == "mpi") {
+    return mpi::is_available();
+  }
+  if (bk == "ring") {
+    return ring::is_available();
+  }
+  if (bk == "nccl") {
+    return nccl::is_available();
+  }
+  if (bk == "jaccl") {
+    return jaccl::is_available();
+  }
+  return false;
 }

 int Group::rank() const {
@@ -135,6 +156,8 @@ Group init(bool strict /* = false */, const std::string& bk /* = "any" */) {
    group = ring::init(strict);
  } else if (bk == "nccl") {
    group = nccl::init(strict);
+  } else if (bk == "jaccl") {
+    group = jaccl::init(strict);
  } else if (bk == "any") {
    if (mlx::core::cu::is_available()) {
      group = nccl::init(false);
@@ -148,13 +171,17 @@ Group init(bool strict /* = false */, const std::string& bk /* = "any" */) {
      group = mpi::init(false);
      bk_ = "mpi";
    }
+    if (group == nullptr) {
+      group = jaccl::init(false);
+      bk_ = "jaccl";
+    }
    if (group == nullptr && strict) {
      throw std::runtime_error("[distributed] Couldn't initialize any backend");
    }
  } else {
    std::ostringstream msg;
-    msg << "[distributed] The only valid values for backend are 'any', 'mpi' "
-        << "and 'ring' but '" << bk << "' was provided.";
+    msg << "[distributed] The only valid values for backend are 'any', 'mpi', 'nccl', "
+        << "'jaccl' and 'ring' but '" << bk << "' was provided.";
    throw std::invalid_argument(msg.str());
  }

--- a/mlx/distributed/distributed.h
+++ b/mlx/distributed/distributed.h
@@ -16,6 +16,7 @@ class GroupImpl;

 /* Check if a communication backend is available */
 bool is_available();
+bool is_available(const std::string& bk);

 /**
 * A distributed::Group represents a group of independent mlx processes that
--- a/mlx/distributed/jaccl/CMakeLists.txt
+++ b/mlx/distributed/jaccl/CMakeLists.txt
@@ -0,0 +1,8 @@
+if(MLX_BUILD_CPU
+   AND ${CMAKE_SYSTEM_NAME} MATCHES "Darwin"
+   AND MACOS_SDK_VERSION GREATER_EQUAL 26.2)
+  target_sources(mlx PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/jaccl.cpp)
+  target_link_libraries(mlx PRIVATE rdma)
+else()
+  target_sources(mlx PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/no_jaccl.cpp)
+endif()
--- a/mlx/distributed/jaccl/jaccl.cpp
+++ b/mlx/distributed/jaccl/jaccl.cpp
--- a/mlx/distributed/jaccl/jaccl.h
+++ b/mlx/distributed/jaccl/jaccl.h
@@ -0,0 +1,12 @@
+// Copyright © 2025 Apple Inc.
+
+#include "mlx/distributed/distributed.h"
+
+namespace mlx::core::distributed::jaccl {
+
+using GroupImpl = mlx::core::distributed::detail::GroupImpl;
+
+bool is_available();
+std::shared_ptr<GroupImpl> init(bool strict = false);
+
+} // namespace mlx::core::distributed::jaccl
--- a/mlx/distributed/jaccl/no_jaccl.cpp
+++ b/mlx/distributed/jaccl/no_jaccl.cpp
@@ -0,0 +1,20 @@
+// Copyright © 2025 Apple Inc.
+
+#include "mlx/distributed/jaccl/jaccl.h"
+
+namespace mlx::core::distributed::jaccl {
+
+using GroupImpl = mlx::core::distributed::detail::GroupImpl;
+
+bool is_available() {
+  return false;
+}
+
+std::shared_ptr<GroupImpl> init(bool strict /* = false */) {
+  if (strict) {
+    throw std::runtime_error("Cannot initialize jaccl distributed backend.");
+  }
+  return nullptr;
+}
+
+} // namespace mlx::core::distributed::jaccl
--- a/mlx/distributed/reduction_ops.h
+++ b/mlx/distributed/reduction_ops.h
@@ -0,0 +1,38 @@
+// Copyright © 2025 Apple Inc.
+
+namespace mlx::core::distributed::detail {
+
+template <typename T>
+struct SumOp {
+  void operator()(const T* input, T* output, size_t N) const {
+    while (N-- > 0) {
+      *output += *input;
+      input++;
+      output++;
+    }
+  }
+};
+
+template <typename T>
+struct MaxOp {
+  void operator()(const T* input, T* output, size_t N) const {
+    while (N-- > 0) {
+      *output = std::max(*output, *input);
+      input++;
+      output++;
+    }
+  }
+};
+
+template <typename T>
+struct MinOp {
+  void operator()(const T* input, T* output, size_t N) const {
+    while (N-- > 0) {
+      *output = std::min(*output, *input);
+      input++;
+      output++;
+    }
+  }
+};
+
+} // namespace mlx::core::distributed::detail
--- a/mlx/distributed/ring/ring.cpp
+++ b/mlx/distributed/ring/ring.cpp
@@ -1,9 +1,6 @@
 // Copyright © 2024 Apple Inc.

-#include <arpa/inet.h>
 #include <fcntl.h>
-#include <netdb.h>
-#include <netinet/in.h>
 #include <netinet/tcp.h>
 #include <sys/socket.h>
 #include <unistd.h>
@@ -22,6 +19,8 @@
 #include "mlx/backend/cpu/encoder.h"
 #include "mlx/distributed/distributed.h"
 #include "mlx/distributed/distributed_impl.h"
+#include "mlx/distributed/reduction_ops.h"
+#include "mlx/distributed/utils.h"
 #include "mlx/threadpool.h"

 #ifndef SOL_TCP
@@ -94,6 +93,7 @@ constexpr const size_t ALL_SUM_SIZE = 8 * 1024 * 1024;
 constexpr const size_t ALL_SUM_BUFFERS = 2;
 constexpr const int CONN_ATTEMPTS = 5;
 constexpr const int CONN_WAIT = 1000;
+constexpr const char* RING_TAG = "[ring]";

 using GroupImpl = mlx::core::distributed::detail::GroupImpl;
 using json = nlohmann::json;
@@ -296,55 +296,6 @@ class CommunicationThreads {
  std::unordered_map<int, SocketThread> threads_;
 };

-struct address_t {
-  sockaddr_storage addr;
-  socklen_t len;
-
-  const sockaddr* get() const {
-    return (struct sockaddr*)&addr;
-  }
-};
-
-/**
- * Parse a sockaddr from an ip and port provided as strings.
- */
-address_t parse_address(const std::string& ip, const std::string& port) {
-  struct addrinfo hints, *res;
-  memset(&hints, 0, sizeof(hints));
-  hints.ai_family = AF_UNSPEC;
-  hints.ai_socktype = SOCK_STREAM;
-
-  int status = getaddrinfo(ip.c_str(), port.c_str(), &hints, &res);
-  if (status != 0) {
-    std::ostringstream msg;
-    msg << "Can't parse address " << ip << ":" << port;
-    throw std::runtime_error(msg.str());
-  }
-
-  address_t result;
-  memcpy(&result.addr, res->ai_addr, res->ai_addrlen);
-  result.len = res->ai_addrlen;
-  freeaddrinfo(res);
-
-  return result;
-}
-
-/**
- * Parse a sockaddr provided as an <ip>:<port> string.
- */
-address_t parse_address(const std::string& ip_port) {
-  auto colon = ip_port.find(":");
-  if (colon == std::string::npos) {
-    std::ostringstream msg;
-    msg << "Can't parse address " << ip_port;
-    throw std::runtime_error(msg.str());
-  }
-  std::string ip(ip_port.begin(), ip_port.begin() + colon);
-  std::string port(ip_port.begin() + colon + 1, ip_port.end());
-
-  return parse_address(ip, port);
-}
-
 /**
 * Load all addresses from the json hostfile. The hostfile is a list of
 * addresses in order of rank. For each rank there can be many addresses so
@@ -357,15 +308,15 @@ address_t parse_address(const std::string& ip_port) {
 *    ["ip3:5000", "ip3:5001"],
 *  ]
 */
-std::vector<std::vector<address_t>> load_nodes(const char* hostfile) {
-  std::vector<std::vector<address_t>> nodes;
+std::vector<std::vector<detail::address_t>> load_nodes(const char* hostfile) {
+  std::vector<std::vector<detail::address_t>> nodes;
  std::ifstream f(hostfile);

  json hosts = json::parse(f);
  for (auto& h : hosts) {
-    std::vector<address_t> host;
+    std::vector<detail::address_t> host;
    for (auto& ips : h) {
-      host.push_back(parse_address(ips.get<std::string>()));
+      host.push_back(std::move(detail::parse_address(ips.get<std::string>())));
    }
    nodes.push_back(std::move(host));
  }
@@ -377,73 +328,15 @@ std::vector<std::vector<address_t>> load_nodes(const char* hostfile) {
 * Create a socket and accept one connection for each of the provided
 * addresses.
 */
-std::vector<int> accept_connections(const std::vector<address_t>& addresses) {
+std::vector<int> accept_connections(
+    const std::vector<detail::address_t>& addresses) {
  std::vector<int> sockets;
  int success;

  for (auto& address : addresses) {
-    // Create the socket to wait for connections from the peers
-    int sock = socket(AF_INET, SOCK_STREAM, 0);
-    if (sock < 0) {
-      std::ostringstream msg;
-      msg << "[ring] Couldn't create socket (error: " << errno << ")";
-      throw std::runtime_error(msg.str());
-    }
-
-    // Make sure we can launch immediately after shutdown by setting the
-    // reuseaddr option so that we don't get address already in use errors
-    int enable = 1;
-    success = setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, &enable, sizeof(int));
-    if (success < 0) {
-      shutdown(sock, 2);
-      close(sock);
-      std::ostringstream msg;
-      msg << "[ring] Couldn't enable reuseaddr (error: " << errno << ")";
-      throw std::runtime_error(msg.str());
-    }
-    success = setsockopt(sock, SOL_SOCKET, SO_REUSEPORT, &enable, sizeof(int));
-    if (success < 0) {
-      shutdown(sock, 2);
-      close(sock);
-      std::ostringstream msg;
-      msg << "[ring] Couldn't enable reuseport (error: " << errno << ")";
-      throw std::runtime_error(msg.str());
-    }
-
-    // Bind the socket to the address and port
-    success = bind(sock, address.get(), address.len);
-    if (success < 0) {
-      shutdown(sock, 2);
-      close(sock);
-      std::ostringstream msg;
-      msg << "[ring] Couldn't bind socket (error: " << errno << ")";
-      throw std::runtime_error(msg.str());
-    }
-
-    // Wait for connections
-    success = listen(sock, 0);
-    if (success < 0) {
-      shutdown(sock, 2);
-      close(sock);
-      std::ostringstream msg;
-      msg << "[ring] Couldn't listen (error: " << errno << ")";
-      throw std::runtime_error(msg.str());
-    }
-
-    int peer_socket = accept(sock, nullptr, nullptr);
-    if (peer_socket < 0) {
-      shutdown(sock, 2);
-      close(sock);
-      std::ostringstream msg;
-      msg << "[ring] Accept failed (error: " << errno << ")";
-      throw std::runtime_error(msg.str());
-    }
-
-    // Close the listening socket
-    shutdown(sock, 2);
-    close(sock);
-
-    sockets.push_back(peer_socket);
+    detail::TCPSocket socket(RING_TAG);
+    socket.listen(RING_TAG, address);
+    sockets.push_back(socket.accept(RING_TAG).detach());
  }

  return sockets;
@@ -454,93 +347,42 @@ std::vector<int> accept_connections(const std::vector<address_t>& addresses) {
 * provided addresses.
 */
 std::vector<int> make_connections(
-    const std::vector<address_t>& addresses,
+    const std::vector<detail::address_t>& addresses,
    bool verbose) {
  std::vector<int> sockets;
  int success;

  for (auto& address : addresses) {
-    int sock;
-
-    // Attempt to connect to the peer CONN_ATTEMPTS times with exponential
-    // backoff. TODO: Do we need that?
-    for (int attempt = 0; attempt < CONN_ATTEMPTS; attempt++) {
-      // Create the socket
-      sock = socket(AF_INET, SOCK_STREAM, 0);
-      if (sock < 0) {
-        std::ostringstream msg;
-        msg << "[ring] Couldn't create socket (error: " << errno << ")";
-        throw std::runtime_error(msg.str());
-      }
-
-      if (attempt > 0) {
-        int wait = (1 << (attempt - 1)) * CONN_WAIT;
-        log_info(
-            verbose,
-            "Attempt",
-            attempt,
-            "wait",
-            wait,
-            "ms (error:",
-            errno,
-            ")");
-        std::this_thread::sleep_for(std::chrono::milliseconds(wait));
-      }
-
-      success = connect(sock, address.get(), address.len);
-      if (success == 0) {
-        break;
-      }
-    }
-    if (success < 0) {
-      std::ostringstream msg;
-      msg << "[ring] Couldn't connect (error: " << errno << ")";
-      throw std::runtime_error(msg.str());
-    }
-
-    sockets.push_back(sock);
+    sockets.push_back(detail::TCPSocket::connect(
+                          RING_TAG,
+                          address,
+                          CONN_ATTEMPTS,
+                          CONN_WAIT,
+                          [verbose](int attempt, int wait) {
+                            log_info(
+                                verbose,
+                                "Attempt",
+                                attempt,
+                                "waiting",
+                                wait,
+                                "ms (error:",
+                                errno,
+                                ")");
+                          })
+                          .detach());
  }

  return sockets;
 }
-template <typename T>
-struct SumOp {
-  void operator()(const T* input, T* output, size_t N) {
-    while (N-- > 0) {
-      *output += *input;
-      input++;
-      output++;
-    }
-  }
-};
-
-template <typename T>
-struct MaxOp {
-  void operator()(const T* input, T* output, size_t N) {
-    while (N-- > 0) {
-      *output = std::max(*output, *input);
-      input++;
-      output++;
-    }
-  }
-};
-
-template <typename T>
-struct MinOp {
-  void operator()(const T* input, T* output, size_t N) {
-    while (N-- > 0) {
-      *output = std::min(*output, *input);
-      input++;
-      output++;
-    }
-  }
-};

 } // namespace

 class RingGroup : public GroupImpl {
 public:
-  RingGroup(int rank, std::vector<std::vector<address_t>> nodes, bool verbose)
+  RingGroup(
+      int rank,
+      std::vector<std::vector<detail::address_t>> nodes,
+      bool verbose)
      : rank_(rank), verbose_(verbose), pool_(0) {
    if (rank_ > 0 && rank_ >= nodes.size()) {
      throw std::runtime_error(
@@ -633,17 +475,17 @@ class RingGroup : public GroupImpl {

  void all_sum(const array& input, array& output, Stream stream) override {
    SWITCH_TYPE(
-        output, all_reduce<T, SumOp<T>>(input, output, stream, SumOp<T>()));
+        output, all_reduce<T>(input, output, stream, detail::SumOp<T>()));
  }

  void all_max(const array& input, array& output, Stream stream) override {
    SWITCH_TYPE(
-        output, all_reduce<T, MaxOp<T>>(input, output, stream, MaxOp<T>()));
+        output, all_reduce<T>(input, output, stream, detail::MaxOp<T>()));
  }

  void all_min(const array& input, array& output, Stream stream) override {
    SWITCH_TYPE(
-        output, all_reduce<T, MinOp<T>>(input, output, stream, MinOp<T>()));
+        output, all_reduce<T>(input, output, stream, detail::MinOp<T>()));
  }

  std::shared_ptr<GroupImpl> split(int color, int key = -1) override {
--- a/mlx/distributed/utils.cpp
+++ b/mlx/distributed/utils.cpp
@@ -0,0 +1,204 @@
+// Copyright © 2025 Apple Inc.
+
+#include <netdb.h>
+#include <unistd.h>
+#include <cstring>
+#include <sstream>
+#include <thread>
+
+#include "mlx/distributed/utils.h"
+
+namespace mlx::core::distributed::detail {
+
+/**
+ * Parse a sockaddr from an ip and port provided as strings.
+ */
+address_t parse_address(const std::string& ip, const std::string& port) {
+  struct addrinfo hints, *res;
+  std::memset(&hints, 0, sizeof(hints));
+  hints.ai_family = AF_UNSPEC;
+  hints.ai_socktype = SOCK_STREAM;
+
+  int status = getaddrinfo(ip.c_str(), port.c_str(), &hints, &res);
+  if (status != 0) {
+    std::ostringstream msg;
+    msg << "Can't parse address " << ip << ":" << port;
+    throw std::runtime_error(msg.str());
+  }
+
+  address_t result;
+  memcpy(&result.addr, res->ai_addr, res->ai_addrlen);
+  result.len = res->ai_addrlen;
+  freeaddrinfo(res);
+
+  return result;
+}
+
+/**
+ * Parse a sockaddr provided as an <ip>:<port> string.
+ */
+address_t parse_address(const std::string& ip_port) {
+  auto colon = ip_port.find(":");
+  if (colon == std::string::npos) {
+    std::ostringstream msg;
+    msg << "Can't parse address " << ip_port;
+    throw std::runtime_error(msg.str());
+  }
+  std::string ip(ip_port.begin(), ip_port.begin() + colon);
+  std::string port(ip_port.begin() + colon + 1, ip_port.end());
+
+  return parse_address(ip, port);
+}
+
+TCPSocket::TCPSocket(const char* tag) {
+  sock_ = socket(AF_INET, SOCK_STREAM, 0);
+  if (sock_ < 0) {
+    std::ostringstream msg;
+    msg << tag << " Couldn't create socket (error: " << errno << ")";
+    throw std::runtime_error(msg.str());
+  }
+}
+
+TCPSocket::TCPSocket(TCPSocket&& s) {
+  sock_ = s.sock_;
+  s.sock_ = -1;
+}
+
+TCPSocket& TCPSocket::operator=(TCPSocket&& s) {
+  if (this != &s) {
+    sock_ = s.sock_;
+    s.sock_ = -1;
+  }
+  return *this;
+}
+
+TCPSocket::TCPSocket(int s) : sock_(s) {}
+
+TCPSocket::~TCPSocket() {
+  if (sock_ > 0) {
+    shutdown(sock_, 2);
+    close(sock_);
+  }
+}
+
+int TCPSocket::detach() {
+  int s = sock_;
+  sock_ = -1;
+  return s;
+}
+
+void TCPSocket::listen(const char* tag, const address_t& addr) {
+  int success;
+
+  // Make sure we can launch immediately after shutdown by setting the
+  // reuseaddr option so that we don't get address already in use errors
+  int enable = 1;
+  success = setsockopt(sock_, SOL_SOCKET, SO_REUSEADDR, &enable, sizeof(int));
+  if (success < 0) {
+    std::ostringstream msg;
+    msg << tag << " Couldn't enable reuseaddr (error: " << errno << ")";
+    throw std::runtime_error(msg.str());
+  }
+  success = setsockopt(sock_, SOL_SOCKET, SO_REUSEPORT, &enable, sizeof(int));
+  if (success < 0) {
+    std::ostringstream msg;
+    msg << tag << " Couldn't enable reuseport (error: " << errno << ")";
+    throw std::runtime_error(msg.str());
+  }
+
+  // Bind the socket to the address and port
+  success = bind(sock_, addr.get(), addr.len);
+  if (success < 0) {
+    std::ostringstream msg;
+    msg << tag << " Couldn't bind socket (error: " << errno << ")";
+    throw std::runtime_error(msg.str());
+  }
+
+  // Prepare waiting for connections
+  success = ::listen(sock_, 0);
+  if (success < 0) {
+    std::ostringstream msg;
+    msg << tag << " Couldn't listen (error: " << errno << ")";
+    throw std::runtime_error(msg.str());
+  }
+}
+
+TCPSocket TCPSocket::accept(const char* tag) {
+  int peer = ::accept(sock_, nullptr, nullptr);
+  if (peer < 0) {
+    std::ostringstream msg;
+    msg << tag << " Accept failed (error: " << errno << ")";
+    throw std::runtime_error(msg.str());
+  }
+
+  return TCPSocket(peer);
+}
+
+void TCPSocket::send(const char* tag, const void* data, size_t len) {
+  while (len > 0) {
+    auto n = ::send(sock_, data, len, 0);
+    if (n <= 0) {
+      std::ostringstream msg;
+      msg << tag << " Send failed with errno=" << errno;
+      throw std::runtime_error(msg.str());
+    }
+    len -= n;
+    data = static_cast<const char*>(data) + n;
+  }
+}
+
+void TCPSocket::recv(const char* tag, void* data, size_t len) {
+  while (len > 0) {
+    auto n = ::recv(sock_, data, len, 0);
+    if (n <= 0) {
+      std::ostringstream msg;
+      msg << tag << " Recv failed with errno=" << errno;
+      throw std::runtime_error(msg.str());
+    }
+    len -= n;
+    data = static_cast<char*>(data) + n;
+  }
+}
+
+TCPSocket TCPSocket::connect(
+    const char* tag,
+    const address_t& addr,
+    int num_retries,
+    int wait,
+    std::function<void(int, int)> cb) {
+  int sock, success;
+
+  // Attempt to connect `num_retries` times with exponential backoff.
+  for (int attempt = 0; attempt < num_retries; attempt++) {
+    // Create the socket
+    sock = socket(AF_INET, SOCK_STREAM, 0);
+    if (sock < 0) {
+      std::ostringstream msg;
+      msg << tag << " Couldn't create socket to connect (error: " << errno
+          << ")";
+      throw std::runtime_error(msg.str());
+    }
+
+    success = ::connect(sock, addr.get(), addr.len);
+    if (success == 0) {
+      break;
+    }
+
+    cb(attempt, wait);
+    if (wait > 0) {
+      std::this_thread::sleep_for(std::chrono::milliseconds(wait));
+    }
+
+    wait <<= 1;
+  }
+
+  if (success < 0) {
+    std::ostringstream msg;
+    msg << tag << " Couldn't connect (error: " << errno << ")";
+    throw std::runtime_error(msg.str());
+  }
+
+  return TCPSocket(sock);
+}
+
+} // namespace mlx::core::distributed::detail
--- a/mlx/distributed/utils.h
+++ b/mlx/distributed/utils.h
@@ -0,0 +1,67 @@
+// Copyright © 2025 Apple Inc.
+
+#pragma once
+
+#include <sys/socket.h>
+#include <functional>
+#include <string>
+
+namespace mlx::core::distributed::detail {
+
+struct address_t {
+  sockaddr_storage addr;
+  socklen_t len;
+
+  const sockaddr* get() const {
+    return (struct sockaddr*)&addr;
+  }
+};
+
+/**
+ * Parse a sockaddr from an ip and port provided as strings.
+ */
+address_t parse_address(const std::string& ip, const std::string& port);
+
+/**
+ * Parse a sockaddr provided as an <ip>:<port> string.
+ */
+address_t parse_address(const std::string& ip_port);
+
+/**
+ * Small wrapper over a TCP socket to simplify initiating connections.
+ */
+class TCPSocket {
+ public:
+  TCPSocket(const char* tag);
+  TCPSocket(const TCPSocket&) = delete;
+  TCPSocket& operator=(const TCPSocket&) = delete;
+  TCPSocket(TCPSocket&& s);
+  TCPSocket& operator=(TCPSocket&&);
+  ~TCPSocket();
+
+  void listen(const char* tag, const address_t& addr);
+  TCPSocket accept(const char* tag);
+
+  void send(const char* tag, const void* data, size_t len);
+  void recv(const char* tag, void* data, size_t len);
+
+  int detach();
+
+  operator int() const {
+    return sock_;
+  }
+
+  static TCPSocket connect(
+      const char* tag,
+      const address_t& addr,
+      int num_retries = 1,
+      int wait = 0,
+      std::function<void(int, int)> cb = nullptr);
+
+ private:
+  TCPSocket(int sock);
+
+  int sock_;
+};
+
+} // namespace mlx::core::distributed::detail
--- a/mlx/version.h
+++ b/mlx/version.h
@@ -4,7 +4,7 @@

 #define MLX_VERSION_MAJOR 0
 #define MLX_VERSION_MINOR 30
-#define MLX_VERSION_PATCH 0
+#define MLX_VERSION_PATCH 1
 #define MLX_VERSION_NUMERIC \
  (100000 * MLX_VERSION_MAJOR + 1000 * MLX_VERSION_MINOR + MLX_VERSION_PATCH)

--- a/python/src/distributed.cpp
+++ b/python/src/distributed.cpp
@@ -52,9 +52,25 @@ void init_distributed(nb::module_& parent_module) {

  m.def(
      "is_available",
-      &mx::distributed::is_available,
+      [](const std::string& backend) {
+        return mx::distributed::is_available(backend);
+      },
+      "backend"_a = "any",
+      nb::sig("def is_available(backend: str = 'any') -> bool"),
      R"pbdoc(
      Check if a communication backend is available.
+
+      Note, this function returns whether MLX has the capability of
+      instantiating that distributed backend not whether it is possible to
+      create a communication group. For that purpose one should use
+      ``init(strict=True)``.
+
+      Args:
+        backend (str, optional): The name of the backend to check for availability.
+          It takes the same values as ``init()``. Default: ``any``.
+
+      Returns:
+        bool: Whether the distributed backend is available.
      )pbdoc");

  m.def(
@@ -79,10 +95,10 @@ void init_distributed(nb::module_& parent_module) {
            in case ``mx.distributed.is_available()`` returns False otherwise
            it throws a runtime error. Default: ``False``
          backend (str, optional): Which distributed backend to initialize.
-            Possible values ``mpi``, ``ring``, ``nccl``, ``any``. If set to ``any`` all
-            available backends are tried and the first one that succeeds
-            becomes the global group which will be returned in subsequent
-            calls. Default: ``any``
+            Possible values ``mpi``, ``ring``, ``nccl``, ``jaccl``, ``any``. If
+            set to ``any`` all available backends are tried and the first one
+            that succeeds becomes the global group which will be returned in
+            subsequent calls. Default: ``any``

        Returns:
          Group: The group representing all the launched processes.
--- a/setup.py
+++ b/setup.py
@@ -24,8 +24,8 @@ def get_version():
            if "#define MLX_VERSION_PATCH" in l:
                patch = l.split()[-1]
    version = f"{major}.{minor}.{patch}"
-    pypi_release = os.environ.get("PYPI_RELEASE", False)
-    dev_release = os.environ.get("DEV_RELEASE", False)
+    pypi_release = int(os.environ.get("PYPI_RELEASE", 0))
+    dev_release = int(os.environ.get("DEV_RELEASE", 0))
    if not pypi_release or dev_release:
        today = datetime.date.today()
        version = f"{version}.dev{today.year}{today.month:02d}{today.day:02d}"
Author	SHA1	Message	Date
Angelos Katharopoulos	bf9456f6cc	Change the name to a fun pun	2025-11-20 17:48:23 -08:00
Angelos Katharopoulos	704f81c03d	Add headers for gcc	2025-11-20 17:31:02 -08:00
Angelos Katharopoulos	df6b23156f	Expose per-backend availability in C++ and python	2025-11-20 15:26:59 -08:00
Angelos Katharopoulos	7a82455b35	Add a no_ibv	2025-11-20 12:52:35 -08:00
Angelos Katharopoulos	643a9a6ba6	Add empty sum_scatter	2025-11-20 12:36:19 -08:00
Angelos Katharopoulos	82097a8f85	Add send/recv	2025-11-20 12:36:19 -08:00
Angelos Katharopoulos	29d9cd836a	Make sure that there is space for work completions	2025-11-20 12:36:19 -08:00
Angelos Katharopoulos	2d10020178	Add working reduce and semi-working all gather	2025-11-20 12:36:19 -08:00
Angelos Katharopoulos	031e62539a	Fix ring	2025-11-20 12:36:19 -08:00
Angelos Katharopoulos	97f74543b1	Fix side channel initialization for more than 2 peers	2025-11-20 12:36:19 -08:00
Angelos Katharopoulos	0dbe63397d	All gather	2025-11-20 12:36:19 -08:00
Angelos Katharopoulos	873df2e0f7	Initial working all reduce	2025-11-20 12:36:16 -08:00
Awni Hannun	0d68efd461	patch bump for future version (#2804 ) Some checks failed Build and Test / check_lint (push) Has been cancelled Details Build and Test / linux_build_and_test (ubuntu-22.04) (push) Has been cancelled Details Build and Test / linux_build_and_test (ubuntu-22.04-arm) (push) Has been cancelled Details Build and Test / mac_build_and_test (14.0) (push) Has been cancelled Details Build and Test / mac_build_and_test (15.0) (push) Has been cancelled Details Build and Test / cuda_build_and_test (cuda-12.6) (push) Has been cancelled Details Build and Test / cuda_build_and_test (cuda-12.9) (push) Has been cancelled Details Build and Test / build_documentation (push) Has been cancelled Details Build and Test / Linux Fedora CPP Build (aarch64) (push) Has been cancelled Details Build and Test / Linux Fedora CPP Build (x86_64) (push) Has been cancelled Details Nightly Build / build_linux_release (3.10) (push) Has been cancelled Details Nightly Build / build_linux_release (3.14) (push) Has been cancelled Details Nightly Build / build_linux_with_tests (3.11, ubuntu-22.04) (push) Has been cancelled Details Nightly Build / build_linux_with_tests (3.11, ubuntu-22.04-arm) (push) Has been cancelled Details Nightly Build / build_linux_with_tests (3.12, ubuntu-22.04) (push) Has been cancelled Details Nightly Build / build_linux_with_tests (3.12, ubuntu-22.04-arm) (push) Has been cancelled Details Nightly Build / build_linux_with_tests (3.13, ubuntu-22.04) (push) Has been cancelled Details Nightly Build / build_linux_with_tests (3.13, ubuntu-22.04-arm) (push) Has been cancelled Details Nightly Build / build_linux_with_tests (3.14, ubuntu-22.04) (push) Has been cancelled Details Nightly Build / build_linux_with_tests (3.14, ubuntu-22.04-arm) (push) Has been cancelled Details Nightly Build / build_mac_release (3.10) (push) Has been cancelled Details Nightly Build / build_mac_release (3.13) (push) Has been cancelled Details Nightly Build / build_cuda_release (push) Has been cancelled Details	2025-11-20 09:26:20 -08:00
Awni Hannun	f9e1a14135	[CUDA] Partly fix random for large sizes (#2798 )	2025-11-20 07:27:50 -08:00
Awni Hannun	d8e9ded928	Fix cuda allocator copy condition (#2800 )	2025-11-20 07:06:55 -08:00
Awni Hannun	60939d010c	Fix macos release target and linux arm release (#2802 ) Some checks failed Build and Test / check_lint (push) Has been cancelled Details Build and Test / linux_build_and_test (ubuntu-22.04) (push) Has been cancelled Details Build and Test / linux_build_and_test (ubuntu-22.04-arm) (push) Has been cancelled Details Build and Test / mac_build_and_test (14.0) (push) Has been cancelled Details Build and Test / mac_build_and_test (15.0) (push) Has been cancelled Details Build and Test / cuda_build_and_test (cuda-12.6) (push) Has been cancelled Details Build and Test / cuda_build_and_test (cuda-12.9) (push) Has been cancelled Details Build and Test / build_documentation (push) Has been cancelled Details Build and Test / Linux Fedora CPP Build (aarch64) (push) Has been cancelled Details Build and Test / Linux Fedora CPP Build (x86_64) (push) Has been cancelled Details Nightly Build / build_linux_release (3.10) (push) Has been cancelled Details Nightly Build / build_linux_release (3.14) (push) Has been cancelled Details Nightly Build / build_linux_with_tests (3.11, ubuntu-22.04) (push) Has been cancelled Details Nightly Build / build_linux_with_tests (3.11, ubuntu-22.04-arm) (push) Has been cancelled Details Nightly Build / build_linux_with_tests (3.12, ubuntu-22.04) (push) Has been cancelled Details Nightly Build / build_linux_with_tests (3.12, ubuntu-22.04-arm) (push) Has been cancelled Details Nightly Build / build_linux_with_tests (3.13, ubuntu-22.04) (push) Has been cancelled Details Nightly Build / build_linux_with_tests (3.13, ubuntu-22.04-arm) (push) Has been cancelled Details Nightly Build / build_linux_with_tests (3.14, ubuntu-22.04) (push) Has been cancelled Details Nightly Build / build_linux_with_tests (3.14, ubuntu-22.04-arm) (push) Has been cancelled Details Nightly Build / build_mac_release (3.10) (push) Has been cancelled Details Nightly Build / build_mac_release (3.13) (push) Has been cancelled Details Nightly Build / build_cuda_release (push) Has been cancelled Details	2025-11-19 21:37:50 -08:00
Awni Hannun	fdcd2923fd	patch + fix docs build (#2799 )	2025-11-19 16:16:26 -08:00