Compare commits

...

3 Commits

Author SHA1 Message Date
Andrey Portnoy
5722c147de [CUDA] Update calls to cudaMemAdvise and cudaGraphAddDependencies for CUDA 13 (#2525)
* [CUDA] Update cudaMemAdvise and cudaGraphAddDependencies for CUDA 13

These functions' signatures changed in CUDA 13, so we differentiate
between CUDA 13 and preceding releases at compile time.

* Mention NVIDIA in ACKNOWLEDGMENTS.md
2025-08-21 19:57:20 -07:00
Cheng
f6819a1f26 Fix warning 186-D from nvcc (#2527) 2025-08-22 10:29:55 +09:00
Awni Hannun
f93f87c802 nccl dep + default for cuda (#2526) 2025-08-21 17:57:49 -07:00
7 changed files with 33 additions and 9 deletions

View File

@@ -25,6 +25,11 @@ MLX was developed with contributions from the following individuals:
<img class="dark-light" src="https://contrib.rocks/image?repo=ml-explore/mlx&anon=0&columns=20&max=100&r=true" />
</a>
+ # Organizations
+ MLX has received contributions from the following companies:
+ - NVIDIA Corporation & Affiliates
# Third-Party Software
MLX leverages several third-party software, listed here together with

View File

@@ -30,8 +30,15 @@ SmallSizePool::SmallSizePool() {
next_free_ = buffer_;
CHECK_CUDA_ERROR(cudaMallocManaged(&data_, small_pool_size));
+ #if CUDART_VERSION >= 13000
+   cudaMemLocation loc;
+   loc.type = cudaMemLocationTypeDevice;
+   loc.id = 0;
+ #else
+   int loc = 0;
+ #endif // CUDART_VERSION >= 13000
CHECK_CUDA_ERROR(
-   cudaMemAdvise(data_, small_pool_size, cudaMemAdviseSetReadMostly, 0));
+   cudaMemAdvise(data_, small_pool_size, cudaMemAdviseSetReadMostly, loc));
auto curr = next_free_;
for (size_t i = 1; i < num_blocks; ++i) {
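For reference, the CUDA 13 change distilled into a standalone sketch (a minimal program, not MLX's allocator; the buffer size and device 0 are placeholders): cudaMemAdvise now takes a cudaMemLocation rather than a bare device ordinal, so the advice target is built differently depending on CUDART_VERSION.

```cpp
#include <cuda_runtime.h>
#include <cstdio>

int main() {
  void* data = nullptr;
  size_t size = 1 << 20;
  cudaMallocManaged(&data, size);

#if CUDART_VERSION >= 13000
  // CUDA 13: the advice target is a cudaMemLocation struct.
  cudaMemLocation loc;
  loc.type = cudaMemLocationTypeDevice;
  loc.id = 0;
#else
  // CUDA 12 and earlier: the advice target is a plain device ordinal.
  int loc = 0;
#endif

  cudaError_t err =
      cudaMemAdvise(data, size, cudaMemAdviseSetReadMostly, loc);
  if (err != cudaSuccess) {
    std::printf("cudaMemAdvise failed: %s\n", cudaGetErrorString(err));
  }
  cudaFree(data);
  return 0;
}
```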

View File

@@ -269,7 +269,13 @@ void CommandEncoder::commit() {
if (node_count_ > 0) {
if (!from_nodes_.empty()) {
CHECK_CUDA_ERROR(cudaGraphAddDependencies(
- graph_, from_nodes_.data(), to_nodes_.data(), from_nodes_.size()));
+ graph_,
+ from_nodes_.data(),
+ to_nodes_.data(),
+ #if CUDART_VERSION >= 13000
+ nullptr, // edgeData
+ #endif // CUDART_VERSION >= 13000
+ from_nodes_.size()));
}
graph_key_ += ".";
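The same pattern applies to cudaGraphAddDependencies, which gained a cudaGraphEdgeData* parameter in CUDA 13; passing nullptr keeps the default edge behavior. A minimal sketch assuming a hypothetical helper named add_dependencies (MLX does this inline in CommandEncoder::commit):

```cpp
#include <cuda_runtime.h>
#include <vector>

// Illustrative helper: add edges between two node lists, handling the
// CUDA 13 signature change (an extra cudaGraphEdgeData* parameter).
inline cudaError_t add_dependencies(
    cudaGraph_t graph,
    const std::vector<cudaGraphNode_t>& from,
    const std::vector<cudaGraphNode_t>& to) {
  return cudaGraphAddDependencies(
      graph,
      from.data(),
      to.data(),
#if CUDART_VERSION >= 13000
      nullptr, // edgeData: default edges, same behavior as before
#endif
      from.size());
}
```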

View File

@@ -205,8 +205,10 @@ struct Power {
if constexpr (cuda::std::is_integral_v<T>) {
T res = 1;
// Raising an integer to a negative power is undefined
- if (exp < 0) {
-   return 0;
- }
+ if constexpr (cuda::std::is_signed_v<T>) {
+   if (exp < 0) {
+     return 0;
+   }
+ }
while (exp) {
if (exp & 1) {
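The warning being silenced is nvcc's 186-D (a pointless comparison of an unsigned value with zero), which fires when `exp < 0` is instantiated with an unsigned T. A standalone sketch of the fixed integer-power branch, with an illustrative function name rather than the exact MLX functor:

```cpp
#include <cuda/std/type_traits>

// Sketch of the integer branch of the Power op after the fix
// (the function name is illustrative, not MLX's operator).
template <typename T>
__host__ __device__ T int_pow(T base, T exp) {
  T res = 1;
  // Raising an integer to a negative power is undefined; guarding the
  // comparison with a compile-time signedness check means unsigned T
  // never instantiates the `exp < 0` test that nvcc flags as 186-D.
  if constexpr (cuda::std::is_signed_v<T>) {
    if (exp < 0) {
      return 0;
    }
  }
  // Exponentiation by squaring.
  while (exp) {
    if (exp & 1) {
      res *= base;
    }
    exp >>= 1;
    base *= base;
  }
  return res;
}
```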

View File

@@ -20,6 +20,8 @@ from select import select
from subprocess import PIPE, Popen, run
from typing import Optional
+ import mlx.core as mx
@dataclass
class Host:
@@ -437,8 +439,8 @@ def launch_nccl(parser, hosts, args, command):
try:
for rank in range(world_size):
env = base_env.copy()
env["MLX_RANK"] = str(rank)
env["CUDA_VISIBLE_DEVICES"] = str(rank % args.nproc_per_node)
env["MLX_RANK"] = str(rank % args.repeat_hosts)
env["CUDA_VISIBLE_DEVICES"] = str(rank % args.repeat_hosts)
p = Popen(command, env=env)
procs.append(p)
@@ -708,7 +710,7 @@ def distributed_config():
parser.add_argument(
"--backend",
choices=["ring", "mpi", "nccl"],
default="ring",
default="nccl" if mx.cuda.is_available() else "ring",
help="Which distributed backend to configure",
)
parser.add_argument(
@@ -780,7 +782,7 @@ def main():
parser.add_argument(
"--backend",
choices=["ring", "mpi", "nccl"],
default="ring",
default="nccl" if mx.cuda.is_available() else "ring",
help="Which distributed backend to launch",
)
parser.add_argument(

View File

@@ -6,6 +6,7 @@ auditwheel repair dist/* \
--exclude libnvrtc* \
--exclude libcuda* \
--exclude libcudnn* \
+ --exclude libnccl* \
-w wheel_tmp
@@ -17,7 +18,7 @@ rm "${repaired_wheel}"
mlx_so="mlx/lib/libmlx.so"
rpath=$(patchelf --print-rpath "${mlx_so}")
base="\$ORIGIN/../../nvidia"
- rpath=$rpath:${base}/cublas/lib:${base}/cuda_nvrtc/lib:${base}/cudnn/lib
+ rpath=$rpath:${base}/cublas/lib:${base}/cuda_nvrtc/lib:${base}/cudnn/lib:${base}/nccl/lib
patchelf --force-rpath --set-rpath "$rpath" "$mlx_so"
python ../python/scripts/repair_record.py ${mlx_so}

View File

@@ -297,6 +297,7 @@ if __name__ == "__main__":
"nvidia-cublas-cu12==12.9.*",
"nvidia-cuda-nvrtc-cu12==12.9.*",
"nvidia-cudnn-cu12==9.*",
"nvidia-nccl-cu12",
]
else:
name = "mlx-cpu"