NCCL backend (#2476)

Anastasiia Filippova
2025-08-21 20:56:15 +02:00
committed by GitHub
parent e843c4d8d5
commit 9392fc3f88
21 changed files with 897 additions and 20 deletions

View File

@@ -22,6 +22,7 @@ target_sources(
${CMAKE_CURRENT_SOURCE_DIR}/cudnn_utils.cpp
${CMAKE_CURRENT_SOURCE_DIR}/custom_kernel.cpp
${CMAKE_CURRENT_SOURCE_DIR}/device.cpp
+${CMAKE_CURRENT_SOURCE_DIR}/distributed.cu
${CMAKE_CURRENT_SOURCE_DIR}/eval.cpp
${CMAKE_CURRENT_SOURCE_DIR}/event.cu
${CMAKE_CURRENT_SOURCE_DIR}/fence.cpp

View File

@@ -0,0 +1,51 @@
// Copyright © 2025 Apple Inc.

#include "mlx/backend/cuda/device.h"
#include "mlx/backend/cuda/kernel_utils.cuh"
#include "mlx/distributed/primitives.h"
#include "mlx/primitives.h"

#include <cassert>

namespace mlx::core {
namespace distributed {

void AllReduce::eval_gpu(
    const std::vector<array>& inputs,
    std::vector<array>& outputs) {
  assert(inputs.size() == 1);
  assert(outputs.size() == 1);

  auto& input = inputs[0];
  auto& output = outputs[0];

  auto& encoder = cu::get_command_encoder(stream());

  // Donate the input buffer to the output when possible, otherwise allocate.
  if (input.is_donatable()) {
    output.copy_shared_buffer(input);
  } else {
    output.set_data(allocator::malloc(output.nbytes()));
  }

  encoder.set_input_array(input);
  encoder.set_output_array(output);

  // Capture the work the collective enqueues on the stream so the encoder
  // tracks it, then dispatch to the distributed backend's reduction.
  auto capture = encoder.capture_context();
  auto& s = stream();
  switch (reduce_type_) {
    case Sum:
      distributed::detail::all_sum(group(), input, output, s);
      break;
    case Max:
      distributed::detail::all_max(group(), input, output, s);
      break;
    case Min:
      distributed::detail::all_min(group(), input, output, s);
      break;
    default:
      throw std::runtime_error(
          "Only all reduce sum, max, and min are supported.");
  }
}

} // namespace distributed
} // namespace mlx::core
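
For context on where the detail::all_sum/all_max/all_min calls end up: the NCCL backend added by this PR (not shown in this hunk) issues the corresponding NCCL collectives on the CUDA stream. Below is a minimal sketch of an NCCL-backed all-sum, not the PR's actual wrapper; the function name all_sum_sketch, the float32 buffers, and the pre-initialized ncclComm_t are assumptions for illustration.

#include <cuda_runtime.h>
#include <nccl.h>

// Hypothetical sketch, not code from this commit: sum `count` floats across
// every rank of `comm`, leaving the result in `recv` on all ranks. The call
// is enqueued on `stream` and completes asynchronously.
void all_sum_sketch(
    const float* send,
    float* recv,
    size_t count,
    ncclComm_t comm,
    cudaStream_t stream) {
  ncclAllReduce(send, recv, count, ncclFloat32, ncclSum, comm, stream);
}

Swapping ncclSum for ncclMax or ncclMin covers the other two branches of the switch above.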

View File

@@ -42,7 +42,6 @@ NO_GPU_MULTI(Eig)
NO_GPU_MULTI(Eigh)
namespace distributed {
-NO_GPU_MULTI(AllReduce)
NO_GPU_MULTI(AllGather)
NO_GPU_MULTI(Send)
NO_GPU_MULTI(Recv)
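
The removed NO_GPU_MULTI(AllReduce) line is what hands AllReduce over to the new CUDA implementation above: the macro stubs out eval_gpu for primitives a backend does not support. Its exact definition is not part of this hunk; it plausibly has a shape like the following sketch.

#include <stdexcept>
#include <vector>

// Assumed shape of the stub macro; the real definition lives elsewhere in the
// backend and is not shown in this commit. `array` is mlx::core::array.
#define NO_GPU_MULTI(func)                                             \
  void func::eval_gpu(                                                 \
      const std::vector<array>& inputs, std::vector<array>& outputs) { \
    throw std::runtime_error(#func " has no GPU implementation.");     \
  }

With the stub gone, the AllReduce::eval_gpu defined in distributed.cu is the single definition the CUDA build links against.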