mirror of
https://github.com/ml-explore/mlx.git
synced 2025-06-24 09:21:16 +08:00
Only fail when 10 consecutive socket errors occur (#1928)
This commit is contained in:
parent
fd0d63ba5b
commit
0792ff02ff
@ -199,6 +199,7 @@ class SocketThread {
|
||||
}
|
||||
|
||||
void worker() {
|
||||
int error_count = 0;
|
||||
bool delete_recv = false;
|
||||
bool delete_send = false;
|
||||
while (true) {
|
||||
@ -235,10 +236,11 @@ class SocketThread {
|
||||
task.buffer = static_cast<char*>(task.buffer) + r;
|
||||
task.size -= r;
|
||||
delete_recv = task.size == 0;
|
||||
error_count = 0;
|
||||
} else if (errno != EAGAIN) {
|
||||
error_count++;
|
||||
log_info(
|
||||
true, "Receiving from socket", fd_, "failed with errno", errno);
|
||||
return;
|
||||
}
|
||||
}
|
||||
if (!sends_.empty()) {
|
||||
@ -248,11 +250,17 @@ class SocketThread {
|
||||
task.buffer = static_cast<char*>(task.buffer) + r;
|
||||
task.size -= r;
|
||||
delete_send = task.size == 0;
|
||||
error_count = 0;
|
||||
} else if (errno != EAGAIN) {
|
||||
error_count++;
|
||||
log_info(true, "Sending to socket", fd_, "failed with errno", errno);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
if (error_count >= 10) {
|
||||
log_info(true, "Too many send/recv errors. Aborting...");
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -112,7 +112,12 @@ def extract_rings(hosts, index):
|
||||
break
|
||||
if not ring:
|
||||
break
|
||||
rings.append(normalize(concretize(ring, used_ports)))
|
||||
try:
|
||||
rings.append(normalize(concretize(ring, used_ports)))
|
||||
except RuntimeError:
|
||||
if len(rings) > 0:
|
||||
return rings
|
||||
raise
|
||||
|
||||
return rings
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user