From c35f4d089abac69de5e337aa041887d767f6553a Mon Sep 17 00:00:00 2001
From: Awni Hannun <awni@apple.com>
Date: Tue, 10 Jun 2025 21:19:47 -0700
Subject: [PATCH] start cuda circle config (#2256)

* rebase

* fix metal kernel linking issue on cuda

* start cuda circle config
---
 .circleci/config.yml              | 26 ++++++++++++++++++++++++++
 mlx/CMakeLists.txt                |  3 +++
 mlx/backend/cuda/CMakeLists.txt   |  1 +
 mlx/backend/cuda/cuda.cpp         | 11 +++++++++++
 mlx/backend/cuda/cuda.h           | 10 ++++++++++
 mlx/backend/cuda/no_cuda.cpp      | 11 +++++++++++
 mlx/backend/metal/no_metal.cpp    | 24 ++++++++++++++++++++++--
 mlx/backend/no_gpu/primitives.cpp | 13 -------------
 mlx/mlx.h                         |  1 +
 python/src/array.cpp              | 10 +++++-----
 python/src/device.cpp             |  5 +++++
 python/tests/test_array.py        |  2 +-
 python/tests/test_device.py       |  8 ++++----
 python/tests/test_optimizers.py   |  2 +-
 14 files changed, 101 insertions(+), 26 deletions(-)
 create mode 100644 mlx/backend/cuda/cuda.cpp
 create mode 100644 mlx/backend/cuda/cuda.h
 create mode 100644 mlx/backend/cuda/no_cuda.cpp

diff --git a/.circleci/config.yml b/.circleci/config.yml
index 6dc7ec4df..808242f9b 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -212,6 +212,29 @@ jobs:
               METAL_DEBUG_ERROR_MODE=0 \
               python -m xmlrunner discover -v python/tests -o test-results/gpu_jit
 
+  cuda_build_and_test:
+    machine:
+      image: linux-cuda-12:default
+      resource_class: gpu.nvidia.small.gen2
+    steps:
+      - checkout
+      - run:
+          name: Install Python package
+          command: |
+            sudo apt-get update
+            sudo apt-get install libblas-dev liblapack-dev liblapacke-dev
+            sudo apt-get install openmpi-bin openmpi-common libopenmpi-dev
+            python -m venv env
+            source env/bin/activate
+            CMAKE_BUILD_PARALLEL_LEVEL=`nproc` \
+              CMAKE_ARGS="-DMLX_BUILD_CUDA=ON -DCMAKE_CUDA_COMPILER=`which nvcc`" \
+              pip install -e ".[dev]"
+      - run:
+          name: Run Python tests
+          command: |
+            source env/bin/activate
+            LOW_MEMORY=1 DEVICE=cpu python -m unittest discover python/tests -v
+
   build_release:
     parameters:
       python_version:
@@ -348,6 +371,7 @@ workflows:
             parameters:
               macosx_deployment_target: ["13.5", "14.0"]
       - linux_build_and_test
+      - cuda_build_and_test 
       - build_documentation 
 
   build_pypi_release:
@@ -455,6 +479,8 @@ workflows:
               macosx_deployment_target: ["13.5", "14.0"]
       - linux_build_and_test:
           requires: [ hold ]
+      - cuda_build_and_test:
+          requires: [ hold ]
   nightly_build:
     when:
       and:
diff --git a/mlx/CMakeLists.txt b/mlx/CMakeLists.txt
index ce921b276..7aa648533 100644
--- a/mlx/CMakeLists.txt
+++ b/mlx/CMakeLists.txt
@@ -55,6 +55,9 @@ endif()
 
 if(MLX_BUILD_CUDA)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backend/cuda)
+else()
+  target_sources(mlx
+                 PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/backend/cuda/no_cuda.cpp)
 endif()
 
 if(MLX_BUILD_METAL OR MLX_BUILD_CUDA)
diff --git a/mlx/backend/cuda/CMakeLists.txt b/mlx/backend/cuda/CMakeLists.txt
index 7ffbcb2d3..9d9657e1f 100644
--- a/mlx/backend/cuda/CMakeLists.txt
+++ b/mlx/backend/cuda/CMakeLists.txt
@@ -12,6 +12,7 @@ target_sources(
           ${CMAKE_CURRENT_SOURCE_DIR}/copy/copy_general.cu
           ${CMAKE_CURRENT_SOURCE_DIR}/copy/copy_general_dynamic.cu
           ${CMAKE_CURRENT_SOURCE_DIR}/copy/copy_general_input.cu
+          ${CMAKE_CURRENT_SOURCE_DIR}/cuda.cpp
           ${CMAKE_CURRENT_SOURCE_DIR}/device.cpp
           ${CMAKE_CURRENT_SOURCE_DIR}/eval.cpp
           ${CMAKE_CURRENT_SOURCE_DIR}/event.cu
diff --git a/mlx/backend/cuda/cuda.cpp b/mlx/backend/cuda/cuda.cpp
new file mode 100644
index 000000000..ceb4d7dfe
--- /dev/null
+++ b/mlx/backend/cuda/cuda.cpp
@@ -0,0 +1,11 @@
+// Copyright © 2025 Apple Inc.
+
+#include "mlx/backend/cuda/cuda.h"
+
+namespace mlx::core::cu {
+
+bool is_available() {
+  return true;
+}
+
+} // namespace mlx::core::cu
diff --git a/mlx/backend/cuda/cuda.h b/mlx/backend/cuda/cuda.h
new file mode 100644
index 000000000..2c6a5c724
--- /dev/null
+++ b/mlx/backend/cuda/cuda.h
@@ -0,0 +1,10 @@
+// Copyright © 2025 Apple Inc.
+
+#pragma once
+
+namespace mlx::core::cu {
+
+/* Check if the CUDA backend is available. */
+bool is_available();
+
+} // namespace mlx::core::cu
diff --git a/mlx/backend/cuda/no_cuda.cpp b/mlx/backend/cuda/no_cuda.cpp
new file mode 100644
index 000000000..8a394c9e3
--- /dev/null
+++ b/mlx/backend/cuda/no_cuda.cpp
@@ -0,0 +1,11 @@
+// Copyright © 2025 Apple Inc.
+
+#include "mlx/backend/cuda/cuda.h"
+
+namespace mlx::core::cu {
+
+bool is_available() {
+  return false;
+}
+
+} // namespace mlx::core::cu
diff --git a/mlx/backend/metal/no_metal.cpp b/mlx/backend/metal/no_metal.cpp
index b6142b280..9785e07c2 100644
--- a/mlx/backend/metal/no_metal.cpp
+++ b/mlx/backend/metal/no_metal.cpp
@@ -3,8 +3,11 @@
 #include <stdexcept>
 
 #include "mlx/backend/metal/metal.h"
+#include "mlx/fast.h"
 
-namespace mlx::core::metal {
+namespace mlx::core {
+
+namespace metal {
 
 bool is_available() {
   return false;
@@ -19,4 +22,21 @@ device_info() {
       "[metal::device_info] Cannot get device info without metal backend");
 };
 
-} // namespace mlx::core::metal
+} // namespace metal
+
+namespace fast {
+
+MetalKernelFunction metal_kernel(
+    const std::string&,
+    const std::vector<std::string>&,
+    const std::vector<std::string>&,
+    const std::string&,
+    const std::string&,
+    bool ensure_row_contiguous,
+    bool atomic_outputs) {
+  throw std::runtime_error("[metal_kernel] No GPU back-end.");
+}
+
+} // namespace fast
+
+} // namespace mlx::core
diff --git a/mlx/backend/no_gpu/primitives.cpp b/mlx/backend/no_gpu/primitives.cpp
index 849cbf83e..409aa2c89 100644
--- a/mlx/backend/no_gpu/primitives.cpp
+++ b/mlx/backend/no_gpu/primitives.cpp
@@ -2,7 +2,6 @@
 
 #include "mlx/primitives.h"
 #include "mlx/distributed/primitives.h"
-#include "mlx/fast.h"
 #include "mlx/fast_primitives.h"
 
 #define NO_GPU_MULTI(func)                                             \
@@ -156,18 +155,6 @@ NO_GPU_USE_FALLBACK(RoPE)
 NO_GPU(ScaledDotProductAttention)
 NO_GPU_MULTI(AffineQuantize)
 NO_GPU_MULTI(CustomKernel)
-
-MetalKernelFunction metal_kernel(
-    const std::string&,
-    const std::vector<std::string>&,
-    const std::vector<std::string>&,
-    const std::string&,
-    const std::string&,
-    bool ensure_row_contiguous,
-    bool atomic_outputs) {
-  throw std::runtime_error("[metal_kernel] No GPU back-end.");
-}
-
 } // namespace fast
 
 namespace distributed {
diff --git a/mlx/mlx.h b/mlx/mlx.h
index cef8d806d..de3ee392a 100644
--- a/mlx/mlx.h
+++ b/mlx/mlx.h
@@ -3,6 +3,7 @@
 #pragma once
 
 #include "mlx/array.h"
+#include "mlx/backend/cuda/cuda.h"
 #include "mlx/backend/metal/metal.h"
 #include "mlx/compile.h"
 #include "mlx/device.h"
diff --git a/python/src/array.cpp b/python/src/array.cpp
index 5ba0aaedc..25889d775 100644
--- a/python/src/array.cpp
+++ b/python/src/array.cpp
@@ -17,10 +17,7 @@
 #include "python/src/indexing.h"
 #include "python/src/utils.h"
 
-#include "mlx/device.h"
-#include "mlx/ops.h"
-#include "mlx/transforms.h"
-#include "mlx/utils.h"
+#include "mlx/mlx.h"
 
 namespace mx = mlx::core;
 namespace nb = nanobind;
@@ -461,9 +458,12 @@ void init_array(nb::module_& m) {
       .def(
           "__dlpack_device__",
           [](const mx::array& a) {
+            // See
+            // https://github.com/dmlc/dlpack/blob/5c210da409e7f1e51ddf445134a4376fdbd70d7d/include/dlpack/dlpack.h#L74
             if (mx::metal::is_available()) {
-              // Metal device is available
               return nb::make_tuple(8, 0);
+            } else if (mx::cu::is_available()) {
+              return nb::make_tuple(13, 0);
             } else {
               // CPU device
               return nb::make_tuple(1, 0);
diff --git a/python/src/device.cpp b/python/src/device.cpp
index 85b15dd4d..006a05dc0 100644
--- a/python/src/device.cpp
+++ b/python/src/device.cpp
@@ -58,4 +58,9 @@ void init_device(nb::module_& m) {
       &mx::set_default_device,
       "device"_a,
       R"pbdoc(Set the default device.)pbdoc");
+  m.def(
+      "is_available",
+      &mx::is_available,
+      "device"_a,
+      R"pbdoc(Check if a back-end is available for the given device.)pbdoc");
 }
diff --git a/python/tests/test_array.py b/python/tests/test_array.py
index e63da17df..c22e0a38f 100644
--- a/python/tests/test_array.py
+++ b/python/tests/test_array.py
@@ -198,7 +198,7 @@ class TestInequality(mlx_tests.MLXTestCase):
     def test_dlx_device_type(self):
         a = mx.array([1, 2, 3])
         device_type, device_id = a.__dlpack_device__()
-        self.assertIn(device_type, [1, 8])
+        self.assertIn(device_type, [1, 8, 13])
         self.assertEqual(device_id, 0)
 
         if device_type == 8:
diff --git a/python/tests/test_device.py b/python/tests/test_device.py
index 53826cad7..6793c98d1 100644
--- a/python/tests/test_device.py
+++ b/python/tests/test_device.py
@@ -10,7 +10,7 @@ import mlx_tests
 class TestDefaultDevice(unittest.TestCase):
     def test_mlx_default_device(self):
         device = mx.default_device()
-        if mx.metal.is_available():
+        if mx.is_available(mx.gpu):
             self.assertEqual(device, mx.Device(mx.gpu))
             self.assertEqual(str(device), "Device(gpu, 0)")
             self.assertEqual(device, mx.gpu)
@@ -73,7 +73,7 @@ class TestStream(mlx_tests.MLXTestCase):
         self.assertEqual(s2.device, mx.default_device())
         self.assertNotEqual(s1, s2)
 
-        if mx.metal.is_available():
+        if mx.is_available(mx.gpu):
             s_gpu = mx.default_stream(mx.gpu)
             self.assertEqual(s_gpu.device, mx.gpu)
         else:
@@ -86,7 +86,7 @@ class TestStream(mlx_tests.MLXTestCase):
         s_cpu = mx.new_stream(mx.cpu)
         self.assertEqual(s_cpu.device, mx.cpu)
 
-        if mx.metal.is_available():
+        if mx.is_available(mx.gpu):
             s_gpu = mx.new_stream(mx.gpu)
             self.assertEqual(s_gpu.device, mx.gpu)
         else:
@@ -99,7 +99,7 @@ class TestStream(mlx_tests.MLXTestCase):
 
         a = mx.add(x, y, stream=mx.default_stream(mx.default_device()))
 
-        if mx.metal.is_available():
+        if mx.is_available(mx.gpu):
             b = mx.add(x, y, stream=mx.default_stream(mx.gpu))
             self.assertEqual(a.item(), b.item())
             s_gpu = mx.new_stream(mx.gpu)
diff --git a/python/tests/test_optimizers.py b/python/tests/test_optimizers.py
index ebfe97d80..4943fe662 100644
--- a/python/tests/test_optimizers.py
+++ b/python/tests/test_optimizers.py
@@ -353,7 +353,7 @@ class TestOptimizers(mlx_tests.MLXTestCase):
         self.assertTrue(mx.allclose(result["w"], mx.full((5, 5), 3.0)))
 
 
-class TestSchedulers(unittest.TestCase):
+class TestSchedulers(mlx_tests.MLXTestCase):
     def test_decay_lr(self):
         for optim_class in optimizers_dict.values():
             lr_schedule = opt.step_decay(1e-1, 0.9, 1)