From c9f4dc851f4baa88f713cfc0215bc7b8fe2a87dc Mon Sep 17 00:00:00 2001
From: Cheng <zcbenz@gmail.com>
Date: Tue, 25 Nov 2025 20:06:42 +0900
Subject: [PATCH] Merge build-cuda and build-linux actions (#2783)

---
 .github/actions/build-cuda-release/action.yml |  7 +--
 .github/actions/build-cuda/action.yml         | 26 ---------
 .github/actions/build-linux/action.yml        | 30 +++++++---
 .github/actions/setup-linux/action.yml        |  7 ++-
 .github/actions/test-linux/action.yml         | 12 ++--
 .../{pull_request.yml => build_and_test.yml}  | 55 ++++++++++---------
 .github/workflows/nightly.yml                 |  2 -
 .github/workflows/release.yml                 |  2 -
 mlx/backend/cuda/CMakeLists.txt               | 15 +++--
 mlx/backend/cuda/detect_cuda_arch.sh          | 13 -----
 mlx/distributed/nccl/nccl_stub/nccl_stubs.cpp | 45 +++++++++++++++
 11 files changed, 120 insertions(+), 94 deletions(-)
 delete mode 100644 .github/actions/build-cuda/action.yml
 rename .github/workflows/{pull_request.yml => build_and_test.yml} (79%)
 delete mode 100644 mlx/backend/cuda/detect_cuda_arch.sh

diff --git a/.github/actions/build-cuda-release/action.yml b/.github/actions/build-cuda-release/action.yml
index 2976fa231..d3fe4c301 100644
--- a/.github/actions/build-cuda-release/action.yml
+++ b/.github/actions/build-cuda-release/action.yml
@@ -1,18 +1,13 @@
 name: 'Build CUDA wheel'
 description: 'Build CUDA wheel'
 
-inputs:
-  toolkit:
-    description: 'The CUDA toolkit'
-    required: true
-
 runs:
   using: "composite"
   steps:
     - name: Build package
       shell: bash
       env:
-        CMAKE_ARGS: -DMLX_BUILD_CUDA=ON -DCMAKE_CUDA_COMPILER=/usr/local/${{ inputs.toolkit }}/bin/nvcc
+        CMAKE_ARGS: -DMLX_BUILD_CUDA=ON
       run: |
         pip install auditwheel build patchelf setuptools
         python setup.py clean --all
diff --git a/.github/actions/build-cuda/action.yml b/.github/actions/build-cuda/action.yml
deleted file mode 100644
index 2a3b39883..000000000
--- a/.github/actions/build-cuda/action.yml
+++ /dev/null
@@ -1,26 +0,0 @@
-name: 'Build and Test with CUDA'
-description: 'Build and test MLX with CUDA'
-
-inputs:
-  toolkit:
-    description: 'The CUDA toolkit'
-    required: true
-
-runs:
-  using: "composite"
-  steps:
-    - name: Install Python package
-      shell: bash
-      env:
-        DEBUG: 1
-        CMAKE_ARGS: -DMLX_BUILD_CUDA=ON -DCMAKE_COMPILE_WARNING_AS_ERROR=ON -DCMAKE_CUDA_COMPILER=/usr/local/${{ inputs.toolkit }}/bin/nvcc
-      run: pip install --no-build-isolation -e ".[dev]" -v
-
-    - name: Build CPP only
-      shell: bash
-      run: |
-        cmake . -B build \
-          -DMLX_BUILD_CUDA=ON \
-          -DCMAKE_CUDA_COMPILER=/usr/local/${{ inputs.toolkit }}/bin/nvcc \
-          -DCMAKE_BUILD_TYPE=DEBUG
-        cmake --build build -j $(nproc)
diff --git a/.github/actions/build-linux/action.yml b/.github/actions/build-linux/action.yml
index 6273ab8de..337b94a2e 100644
--- a/.github/actions/build-linux/action.yml
+++ b/.github/actions/build-linux/action.yml
@@ -1,25 +1,41 @@
 name: 'Build and Test on Linux'
-description: 'Build and test MLX on Linux'
+
+inputs:
+  toolkit:
+    description: 'The toolkit to build with'
+    required: false
+    default: 'cpu'
 
 runs:
   using: "composite"
   steps:
     - name: Install Python package
+      id: python_build
       shell: sh
       env:
-        CMAKE_ARGS: "-DCMAKE_COMPILE_WARNING_AS_ERROR=ON"
         DEBUG: 1
-      run: pip install --no-build-isolation -e ".[dev]" -v
+        CMAKE_ARGS: >-
+          -DCMAKE_COMPILE_WARNING_AS_ERROR=ON
+          -DMLX_BUILD_CUDA=${{ startsWith(inputs.toolkit, 'cuda') && 'ON' || 'OFF' }}
+      run: |
+        if ${{ startsWith(inputs.toolkit, 'cuda') && runner.arch == 'arm64' }} ; then
+          # The arm64 runner has no GPU to detect, so build for a common arch.
+          CMAKE_ARGS="$CMAKE_ARGS -DMLX_CUDA_ARCHITECTURES=90a"
+          # Tests cannot be built when the built executables cannot run.
+          CMAKE_ARGS="$CMAKE_ARGS -DMLX_BUILD_TESTS=OFF"
+        fi
+        pip install --no-build-isolation -e ".[dev]" -v
+        # Export the final CMAKE_ARGS as a step output for the following steps.
+        printf 'CMAKE_ARGS=%s\n' "$CMAKE_ARGS" >> "$GITHUB_OUTPUT"
 
     - name: Generate package stubs
       shell: sh
       run: |
         pip install typing_extensions
         python setup.py generate_stubs
-    
+
     - name: Build CPP only
       shell: bash
       run: |
-        mkdir -p build && cd build
-        cmake .. -DMLX_BUILD_METAL=OFF -DCMAKE_BUILD_TYPE=DEBUG
-        make -j $(nproc)
+        cmake . -B build -DCMAKE_BUILD_TYPE=Debug ${{ steps.python_build.outputs.CMAKE_ARGS }}
+        cmake --build build -j $(nproc)
diff --git a/.github/actions/setup-linux/action.yml b/.github/actions/setup-linux/action.yml
index f6cfe1060..721a097a3 100644
--- a/.github/actions/setup-linux/action.yml
+++ b/.github/actions/setup-linux/action.yml
@@ -51,8 +51,6 @@ runs:
         # Note: the CI machine does not meet CUDA 13's driver requirement.
         # Compatibility matrix:
         # https://docs.nvidia.com/deeplearning/cudnn/backend/latest/reference/support-matrix.html
-        # The `nvcc` is installed into `/usr/local/cuda-VERSION/bin/nvcc` - but
-        # it's *not* on the default toolkit path.
         PACKAGES: |
           {
             "cuda-12.6": "libcudnn9-dev-cuda-12 cuda-toolkit-12-6",
@@ -60,13 +58,16 @@ runs:
             "cuda-13.0": "libcudnn9-dev-cuda-13 cuda-toolkit-13-0"
           }
       run: |
-        export ARCH=${{ runner.arch == 'arm64' && 'arm64' || 'x86_64' }}
+        # The CUDA binaries are hosted in the "sbsa" repo, the "arm64" repo is
+        # Jetson specific. SBSA means Arm Server Base System Architecture.
+        ARCH=${{ runner.arch == 'arm64' && 'sbsa' || 'x86_64' }}
         wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/$ARCH/cuda-keyring_1.1-1_all.deb
         sudo dpkg -i cuda-keyring_1.1-1_all.deb
         sudo apt-get update
         sudo apt-get install -y \
             libnccl2 libnccl-dev \
             ${{ fromJson(env.PACKAGES)[inputs.toolkit] }}
+        echo "/usr/local/${{ inputs.toolkit }}/bin" >> "$GITHUB_PATH"
 
     - name: CUDA packages and driver report
       if: ${{ startsWith(inputs.toolkit, 'cuda') }}
diff --git a/.github/actions/test-linux/action.yml b/.github/actions/test-linux/action.yml
index 441da7d92..3258670ef 100644
--- a/.github/actions/test-linux/action.yml
+++ b/.github/actions/test-linux/action.yml
@@ -1,8 +1,8 @@
 name: 'Run Linux tests'
 
 inputs:
-  cpu-only:
-    description: 'Skip GPU tests'
+  has-gpu:
+    description: 'Run GPU tests'
     required: false
     default: false
 
@@ -17,7 +17,7 @@ runs:
         echo "::endgroup::"
 
     - name: Run distributed tests
-      if: ${{ inputs.cpu-only == 'true' }}
+      if: ${{ inputs.has-gpu == 'false' }}
       shell: bash
       run: |
         echo "::group::Distributed tests"
@@ -30,7 +30,7 @@ runs:
         echo "::endgroup::"
 
     - name: Run Python tests - CPU
-      if: ${{ inputs.cpu-only == 'true' }}
+      if: ${{ inputs.has-gpu == 'false' }}
       shell: bash
       env:
         DEVICE: cpu
@@ -40,7 +40,7 @@ runs:
         echo "::endgroup::"
 
     - name: Run Python tests - GPU
-      if: ${{ inputs.cpu-only == 'false' }}
+      if: ${{ inputs.has-gpu == 'true' }}
       shell: bash
       env:
         DEVICE: gpu
@@ -59,7 +59,7 @@ runs:
         echo "::endgroup::"
 
     - name: Run CPP tests - GPU
-      if: ${{ inputs.cpu-only == 'false' }}
+      if: ${{ inputs.has-gpu == 'true' }}
       shell: bash
       env:
         DEVICE: gpu
diff --git a/.github/workflows/pull_request.yml b/.github/workflows/build_and_test.yml
similarity index 79%
rename from .github/workflows/pull_request.yml
rename to .github/workflows/build_and_test.yml
index db63a9ad1..34ff55054 100644
--- a/.github/workflows/pull_request.yml
+++ b/.github/workflows/build_and_test.yml
@@ -17,29 +17,51 @@ concurrency:
 
 jobs:
   check_lint:
+    name: Check Lint
     runs-on: ubuntu-22.04
     steps:
       - uses: actions/checkout@v6
       - uses: pre-commit/action@v3.0.1
 
   linux_build_and_test:
+    name: Linux (cpu, ${{ matrix.arch }})
     needs: check_lint
     strategy:
-      matrix:
-        runner:
-          - ubuntu-22.04
-          - ubuntu-22.04-arm
       fail-fast: false
-    runs-on: ${{ matrix.runner }}
+      matrix:
+        arch: ['x86_64', 'aarch64']
+    runs-on: ${{ matrix.arch == 'x86_64' && 'ubuntu-22.04' || 'ubuntu-22.04-arm' }}
     steps:
       - uses: actions/checkout@v6
       - uses: ./.github/actions/setup-linux
       - uses: ./.github/actions/build-linux
       - uses: ./.github/actions/test-linux
+
+  cuda_build_and_test:
+    name: Linux (${{ matrix.toolkit }}, ${{ matrix.arch }})
+    if: github.repository == 'ml-explore/mlx'
+    needs: check_lint
+    strategy:
+      fail-fast: false
+      matrix:
+        arch: ['x86_64', 'aarch64']
+        toolkit: ['cuda-12.6', 'cuda-12.9']
+    runs-on: ${{ matrix.arch == 'x86_64' && 'gpu-t4-4-core' || 'ubuntu-22.04-arm' }}
+    steps:
+      - uses: actions/checkout@v6
+      - uses: ./.github/actions/setup-linux
         with:
-          cpu-only: true
+          toolkit: ${{ matrix.toolkit }}
+      - uses: ./.github/actions/build-linux
+        with:
+          toolkit: ${{ matrix.toolkit }}
+      - uses: ./.github/actions/test-linux
+        if: matrix.arch == 'x86_64'
+        with:
+          has-gpu: true
 
   mac_build_and_test:
+    name: macOS (${{ matrix.macos-target }})
     if: github.repository == 'ml-explore/mlx'
     strategy:
       matrix:
@@ -53,25 +75,8 @@ jobs:
       - uses: ./.github/actions/setup-macos
       - uses: ./.github/actions/build-macos
 
-  cuda_build_and_test:
-    if: github.repository == 'ml-explore/mlx'
-    strategy:
-      fail-fast: false
-      matrix:
-        toolkit: ['cuda-12.6', 'cuda-12.9']
-    runs-on: gpu-t4-4-core
-    needs: check_lint
-    steps:
-      - uses: actions/checkout@v6
-      - uses: ./.github/actions/setup-linux
-        with:
-          toolkit: ${{ matrix.toolkit }}
-      - uses: ./.github/actions/build-cuda
-        with:
-          toolkit: ${{ matrix.toolkit }}
-      - uses: ./.github/actions/test-linux
-
   build_documentation:
+    name: Build Documentation
     if: github.repository == 'ml-explore/mlx'
     runs-on: ubuntu-22.04
     needs: check_lint
@@ -80,7 +85,7 @@ jobs:
       - uses: ./.github/actions/build-docs
 
   linux_fedora_build_cpp:
-    name: Linux Fedora CPP Build (${{ matrix.arch }})
+    name: Linux Fedora (${{ matrix.arch }})
     needs: check_lint
     strategy:
       fail-fast: false
diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml
index c79c07ea1..479748f8a 100644
--- a/.github/workflows/nightly.yml
+++ b/.github/workflows/nightly.yml
@@ -52,8 +52,6 @@ jobs:
           python-version: ${{ matrix.python_version }}
       - uses: ./.github/actions/build-linux
       - uses: ./.github/actions/test-linux
-        with:
-          cpu-only: true
 
   build_mac_release:
     if: github.repository == 'ml-explore/mlx'
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index 27bd7081a..5cc99dac2 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -139,8 +139,6 @@ jobs:
           toolkit: 'cuda-12.9'
       - name: Build Python package
         uses: ./.github/actions/build-cuda-release
-        with:
-          toolkit: 'cuda-12.9'
       - name: Upload artifacts
         uses: actions/upload-artifact@v5
         with:
diff --git a/mlx/backend/cuda/CMakeLists.txt b/mlx/backend/cuda/CMakeLists.txt
index e11a18f95..7986c09d8 100644
--- a/mlx/backend/cuda/CMakeLists.txt
+++ b/mlx/backend/cuda/CMakeLists.txt
@@ -123,14 +123,21 @@ if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8.0)
     mlx PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:--compress-mode=size>")
 endif()
 
-# Compute capability >= 7.0 is required for synchronization between CPU/GPU with
-# managed memory.
+# Use native CUDA arch by default.
 if(NOT DEFINED MLX_CUDA_ARCHITECTURES)
   execute_process(
-    COMMAND bash detect_cuda_arch.sh
-    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+    COMMAND __nvcc_device_query
     OUTPUT_VARIABLE MLX_CUDA_ARCHITECTURES
     OUTPUT_STRIP_TRAILING_WHITESPACE)
+  set(UPGRADABLE_ARCHITECTURES "90;100;121")
+  if(MLX_CUDA_ARCHITECTURES STREQUAL "")
+    message(
+      FATAL_ERROR
+        "Can not get native CUDA arch, must set MLX_CUDA_ARCHITECTURES")
+  elseif(MLX_CUDA_ARCHITECTURES IN_LIST UPGRADABLE_ARCHITECTURES)
+    # Use arch-specific compute capability whenever possible.
+    set(MLX_CUDA_ARCHITECTURES "${MLX_CUDA_ARCHITECTURES}a")
+  endif()
 endif()
 message(STATUS "CUDA architectures: ${MLX_CUDA_ARCHITECTURES}")
 set_target_properties(mlx PROPERTIES CUDA_ARCHITECTURES
diff --git a/mlx/backend/cuda/detect_cuda_arch.sh b/mlx/backend/cuda/detect_cuda_arch.sh
deleted file mode 100644
index 9d7c01a3e..000000000
--- a/mlx/backend/cuda/detect_cuda_arch.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/bin/bash
-
-arch=`__nvcc_device_query`
-case "$arch" in
-    "90")
-        echo "90a" ;;
-    "100")
-        echo "100a" ;;
-    "121")
-        echo "121a" ;;
-    *)
-        echo "native" ;;
-esac
diff --git a/mlx/distributed/nccl/nccl_stub/nccl_stubs.cpp b/mlx/distributed/nccl/nccl_stub/nccl_stubs.cpp
index 171340f67..435936236 100644
--- a/mlx/distributed/nccl/nccl_stub/nccl_stubs.cpp
+++ b/mlx/distributed/nccl/nccl_stub/nccl_stubs.cpp
@@ -5,3 +5,48 @@
 ncclResult_t ncclGetUniqueId(ncclUniqueId*) {
   return ncclSuccess;
 }
+
+const char* ncclGetErrorString(ncclResult_t result) {
+  return "NCCL stub library: error strings are not available";
+}
+
+ncclResult_t
+ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank) {
+  return ncclSuccess;
+}
+
+ncclResult_t ncclCommDestroy(ncclComm_t comm) {
+  return ncclSuccess;
+}
+
+ncclResult_t ncclAllGather(
+    const void* sendbuff,
+    void* recvbuff,
+    size_t sendcount,
+    ncclDataType_t datatype,
+    ncclComm_t comm,
+    cudaStream_t stream) {
+  return ncclSuccess;
+}
+
+ncclResult_t ncclAllReduce(
+    const void* sendbuff,
+    void* recvbuff,
+    size_t count,
+    ncclDataType_t datatype,
+    ncclRedOp_t op,
+    ncclComm_t comm,
+    cudaStream_t stream) {
+  return ncclSuccess;
+}
+
+ncclResult_t ncclReduceScatter(
+    const void* sendbuff,
+    void* recvbuff,
+    size_t recvcount,
+    ncclDataType_t datatype,
+    ncclRedOp_t op,
+    ncclComm_t comm,
+    cudaStream_t stream) {
+  return ncclSuccess;
+}