From 0792ff02ff863106cf22076862909abc1f611c31 Mon Sep 17 00:00:00 2001
From: Angelos Katharopoulos
Date: Wed, 5 Mar 2025 13:16:19 -0800
Subject: [PATCH 1/3] Only fail when 10 consecutive socket errors occur (#1928)

---
 mlx/distributed/ring/ring.cpp | 12 ++++++++++--
 python/mlx/distributed_run.py |  7 ++++++-
 2 files changed, 16 insertions(+), 3 deletions(-)

diff --git a/mlx/distributed/ring/ring.cpp b/mlx/distributed/ring/ring.cpp
index 1f1c1b0b6..5bf08200e 100644
--- a/mlx/distributed/ring/ring.cpp
+++ b/mlx/distributed/ring/ring.cpp
@@ -199,6 +199,7 @@ class SocketThread {
   }
 
   void worker() {
+    int error_count = 0;
     bool delete_recv = false;
     bool delete_send = false;
     while (true) {
@@ -235,10 +236,11 @@ class SocketThread {
           task.buffer = static_cast<char*>(task.buffer) + r;
           task.size -= r;
           delete_recv = task.size == 0;
+          error_count = 0;
         } else if (errno != EAGAIN) {
+          error_count++;
           log_info(
               true, "Receiving from socket", fd_, "failed with errno", errno);
-          return;
         }
       }
       if (!sends_.empty()) {
@@ -248,11 +250,17 @@ class SocketThread {
           task.buffer = static_cast<char*>(task.buffer) + r;
           task.size -= r;
           delete_send = task.size == 0;
+          error_count = 0;
         } else if (errno != EAGAIN) {
+          error_count++;
           log_info(true, "Sending to socket", fd_, "failed with errno", errno);
-          return;
         }
       }
+
+      if (error_count >= 10) {
+        log_info(true, "Too many send/recv errors. Aborting...");
+        return;
+      }
     }
   }
 
diff --git a/python/mlx/distributed_run.py b/python/mlx/distributed_run.py
index 1a749beed..5d6bc4383 100644
--- a/python/mlx/distributed_run.py
+++ b/python/mlx/distributed_run.py
@@ -112,7 +112,12 @@ def extract_rings(hosts, index):
                 break
         if not ring:
             break
-        rings.append(normalize(concretize(ring, used_ports)))
+        try:
+            rings.append(normalize(concretize(ring, used_ports)))
+        except RuntimeError:
+            if len(rings) > 0:
+                return rings
+            raise
 
     return rings
 

From f599c11bc874a81b15a5c030257802e6e25f28d3 Mon Sep 17 00:00:00 2001
From: Awni Hannun
Date: Wed, 5 Mar 2025 13:16:53 -0800
Subject: [PATCH 2/3] bump (#1931)

---
 mlx/version.h | 2 +-
 setup.py      | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/mlx/version.h b/mlx/version.h
index 35b026149..f244dcb16 100644
--- a/mlx/version.h
+++ b/mlx/version.h
@@ -3,7 +3,7 @@
 #pragma once
 
 #define MLX_VERSION_MAJOR 0
-#define MLX_VERSION_MINOR 24
+#define MLX_VERSION_MINOR 23
 #define MLX_VERSION_PATCH 2
 #define MLX_VERSION_NUMERIC \
   (100000 * MLX_VERSION_MAJOR + 1000 * MLX_VERSION_MINOR + MLX_VERSION_PATCH)
diff --git a/setup.py b/setup.py
index d4b5e15dd..787e0545b 100644
--- a/setup.py
+++ b/setup.py
@@ -173,7 +173,7 @@ if __name__ == "__main__":
 
     setup(
         name="mlx",
-        version=get_version("0.23.1"),
+        version=get_version("0.23.2"),
         author="MLX Contributors",
         author_email="mlx@group.apple.com",
         description="A framework for machine learning on Apple silicon.",

From 85b34d59bcea059603cd15db7ceaceed885113a0 Mon Sep 17 00:00:00 2001
From: Chunyang Wen
Date: Thu, 6 Mar 2025 05:48:03 +0800
Subject: [PATCH 3/3] Clean unused sys (#1929)

---
 setup.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/setup.py b/setup.py
index 787e0545b..72bc2dba3 100644
--- a/setup.py
+++ b/setup.py
@@ -5,7 +5,6 @@ import os
 import platform
 import re
 import subprocess
-import sys
 from pathlib import Path
 from subprocess import run
 