Merge build-cuda and build-linux actions (#2783)

2025-12-16 01:49:05 +08:00 · 2025-11-25 20:06:42 +09:00
parent f8bd675655
commit c9f4dc851f
11 changed files with 120 additions and 94 deletions
--- a/.github/actions/build-cuda-release/action.yml
+++ b/.github/actions/build-cuda-release/action.yml
@@ -1,18 +1,13 @@
 name: 'Build CUDA wheel'
 description: 'Build CUDA wheel'

-inputs:
-  toolkit:
-    description: 'The CUDA toolkit'
-    required: true
-
 runs:
  using: "composite"
  steps:
    - name: Build package
      shell: bash
      env:
-        CMAKE_ARGS: -DMLX_BUILD_CUDA=ON -DCMAKE_CUDA_COMPILER=/usr/local/${{ inputs.toolkit }}/bin/nvcc
+        CMAKE_ARGS: -DMLX_BUILD_CUDA=ON
      run: |
        pip install auditwheel build patchelf setuptools
        python setup.py clean --all
--- a/.github/actions/build-cuda/action.yml
+++ b/.github/actions/build-cuda/action.yml
@@ -1,26 +0,0 @@
-name: 'Build and Test with CUDA'
-description: 'Build and test MLX with CUDA'
-
-inputs:
-  toolkit:
-    description: 'The CUDA toolkit'
-    required: true
-
-runs:
-  using: "composite"
-  steps:
-    - name: Install Python package
-      shell: bash
-      env:
-        DEBUG: 1
-        CMAKE_ARGS: -DMLX_BUILD_CUDA=ON -DCMAKE_COMPILE_WARNING_AS_ERROR=ON -DCMAKE_CUDA_COMPILER=/usr/local/${{ inputs.toolkit }}/bin/nvcc
-      run: pip install --no-build-isolation -e ".[dev]" -v
-
-    - name: Build CPP only
-      shell: bash
-      run: |
-        cmake . -B build \
-          -DMLX_BUILD_CUDA=ON \
-          -DCMAKE_CUDA_COMPILER=/usr/local/${{ inputs.toolkit }}/bin/nvcc \
-          -DCMAKE_BUILD_TYPE=DEBUG
-        cmake --build build -j $(nproc)
--- a/.github/actions/build-linux/action.yml
+++ b/.github/actions/build-linux/action.yml
@@ -1,15 +1,32 @@
 name: 'Build and Test on Linux'
-description: 'Build and test MLX on Linux'
+
+inputs:
+  toolkit:
+    description: 'The toolkit to build with'
+    required: false
+    default: 'cpu'

 runs:
  using: "composite"
  steps:
    - name: Install Python package
+      id: python_build
      shell: sh
      env:
-        CMAKE_ARGS: "-DCMAKE_COMPILE_WARNING_AS_ERROR=ON"
        DEBUG: 1
-      run: pip install --no-build-isolation -e ".[dev]" -v
+        CMAKE_ARGS: >-
+          -DCMAKE_COMPILE_WARNING_AS_ERROR=ON
+          -DMLX_BUILD_CUDA=${{ startsWith(inputs.toolkit, 'cuda') && 'ON' || 'OFF' }}
+      run: |
+        if ${{ startsWith(inputs.toolkit, 'cuda') && runner.arch == 'arm64' }} ; then
+          # The arm64 runner has no GPU, so use a common arch.
+          CMAKE_ARGS="$CMAKE_ARGS -DMLX_CUDA_ARCHITECTURES=90a"
+          # Cannot build the tests when the built executables cannot run.
+          CMAKE_ARGS="$CMAKE_ARGS -DMLX_BUILD_TESTS=OFF"
+        fi
+        pip install --no-build-isolation -e ".[dev]" -v
+        # Pass CMAKE_ARGS to the following steps.
+        echo CMAKE_ARGS="$CMAKE_ARGS" >> $GITHUB_OUTPUT

    - name: Generate package stubs
      shell: sh
@@ -20,6 +37,5 @@ runs:
    - name: Build CPP only
      shell: bash
      run: |
-        mkdir -p build && cd build
-        cmake .. -DMLX_BUILD_METAL=OFF -DCMAKE_BUILD_TYPE=DEBUG
-        make -j $(nproc)
+        cmake . -B build -DCMAKE_BUILD_TYPE=Debug ${{ steps.python_build.outputs.CMAKE_ARGS }}
+        cmake --build build -j $(nproc)
--- a/.github/actions/setup-linux/action.yml
+++ b/.github/actions/setup-linux/action.yml
@@ -51,8 +51,6 @@ runs:
        # Note: the CI machine does not meet CUDA 13's driver requirement.
        # Compatibility matrix:
        # https://docs.nvidia.com/deeplearning/cudnn/backend/latest/reference/support-matrix.html
-        # The `nvcc` is installed into `/usr/local/cuda-VERSION/bin/nvcc` - but
-        # it's *not* on the default toolkit path.
        PACKAGES: |
          {
            "cuda-12.6": "libcudnn9-dev-cuda-12 cuda-toolkit-12-6",
@@ -60,13 +58,16 @@ runs:
            "cuda-13.0": "libcudnn9-dev-cuda-13 cuda-toolkit-13-0"
          }
      run: |
-        export ARCH=${{ runner.arch == 'arm64' && 'arm64' || 'x86_64' }}
+        # The CUDA binaries are hosted in the "sbsa" repo; the "arm64" repo is
+        # Jetson-specific. SBSA stands for Arm Server Base System Architecture.
+        ARCH=${{ runner.arch == 'arm64' && 'sbsa' || 'x86_64' }}
        wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/$ARCH/cuda-keyring_1.1-1_all.deb
        sudo dpkg -i cuda-keyring_1.1-1_all.deb
        sudo apt-get update
        sudo apt-get install -y \
            libnccl2 libnccl-dev \
            ${{ fromJson(env.PACKAGES)[inputs.toolkit] }}
+        echo "/usr/local/${{ inputs.toolkit }}/bin" >> $GITHUB_PATH

    - name: CUDA packages and driver report
      if: ${{ startsWith(inputs.toolkit, 'cuda') }}
--- a/.github/actions/test-linux/action.yml
+++ b/.github/actions/test-linux/action.yml
@@ -1,8 +1,8 @@
 name: 'Run Linux tests'

 inputs:
-  cpu-only:
-    description: 'Skip GPU tests'
+  has-gpu:
+    description: 'Run GPU tests'
    required: false
    default: false

@@ -17,7 +17,7 @@ runs:
        echo "::endgroup::"

    - name: Run distributed tests
-      if: ${{ inputs.cpu-only == 'true' }}
+      if: ${{ inputs.has-gpu == 'false' }}
      shell: bash
      run: |
        echo "::group::Distributed tests"
@@ -30,7 +30,7 @@ runs:
        echo "::endgroup::"

    - name: Run Python tests - CPU
-      if: ${{ inputs.cpu-only == 'true' }}
+      if: ${{ inputs.has-gpu == 'false' }}
      shell: bash
      env:
        DEVICE: cpu
@@ -40,7 +40,7 @@ runs:
        echo "::endgroup::"

    - name: Run Python tests - GPU
-      if: ${{ inputs.cpu-only == 'false' }}
+      if: ${{ inputs.has-gpu == 'true' }}
      shell: bash
      env:
        DEVICE: gpu
@@ -59,7 +59,7 @@ runs:
        echo "::endgroup::"

    - name: Run CPP tests - GPU
-      if: ${{ inputs.cpu-only == 'false' }}
+      if: ${{ inputs.has-gpu == 'true' }}
      shell: bash
      env:
        DEVICE: gpu
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -17,29 +17,51 @@ concurrency:

 jobs:
  check_lint:
+    name: Check Lint
    runs-on: ubuntu-22.04
    steps:
      - uses: actions/checkout@v6
      - uses: pre-commit/action@v3.0.1

  linux_build_and_test:
+    name: Linux (cpu, ${{ matrix.arch }})
    needs: check_lint
    strategy:
-      matrix:
-        runner:
-          - ubuntu-22.04
-          - ubuntu-22.04-arm
      fail-fast: false
-    runs-on: ${{ matrix.runner }}
+      matrix:
+        arch: ['x86_64', 'aarch64']
+    runs-on: ${{ matrix.arch == 'x86_64' && 'ubuntu-22.04' || 'ubuntu-22.04-arm' }}
    steps:
      - uses: actions/checkout@v6
      - uses: ./.github/actions/setup-linux
      - uses: ./.github/actions/build-linux
      - uses: ./.github/actions/test-linux
+
+  cuda_build_and_test:
+    name: Linux (${{ matrix.toolkit }}, ${{ matrix.arch }})
+    if: github.repository == 'ml-explore/mlx'
+    needs: check_lint
+    strategy:
+      fail-fast: false
+      matrix:
+        arch: ['x86_64', 'aarch64']
+        toolkit: ['cuda-12.6', 'cuda-12.9']
+    runs-on: ${{ matrix.arch == 'x86_64' && 'gpu-t4-4-core' || 'ubuntu-22.04-arm' }}
+    steps:
+      - uses: actions/checkout@v6
+      - uses: ./.github/actions/setup-linux
        with:
-          cpu-only: true
+          toolkit: ${{ matrix.toolkit }}
+      - uses: ./.github/actions/build-linux
+        with:
+          toolkit: ${{ matrix.toolkit }}
+      - uses: ./.github/actions/test-linux
+        if: matrix.arch == 'x86_64'
+        with:
+          has-gpu: true

  mac_build_and_test:
+    name: macOS (${{ matrix.macos-target }})
    if: github.repository == 'ml-explore/mlx'
    strategy:
      matrix:
@@ -53,25 +75,8 @@ jobs:
      - uses: ./.github/actions/setup-macos
      - uses: ./.github/actions/build-macos

-  cuda_build_and_test:
-    if: github.repository == 'ml-explore/mlx'
-    strategy:
-      fail-fast: false
-      matrix:
-        toolkit: ['cuda-12.6', 'cuda-12.9']
-    runs-on: gpu-t4-4-core
-    needs: check_lint
-    steps:
-      - uses: actions/checkout@v6
-      - uses: ./.github/actions/setup-linux
-        with:
-          toolkit: ${{ matrix.toolkit }}
-      - uses: ./.github/actions/build-cuda
-        with:
-          toolkit: ${{ matrix.toolkit }}
-      - uses: ./.github/actions/test-linux
-
  build_documentation:
+    name: Build Documentation
    if: github.repository == 'ml-explore/mlx'
    runs-on: ubuntu-22.04
    needs: check_lint
@@ -80,7 +85,7 @@ jobs:
      - uses: ./.github/actions/build-docs

  linux_fedora_build_cpp:
-    name: Linux Fedora CPP Build (${{ matrix.arch }})
+    name: Linux Fedora (${{ matrix.arch }})
    needs: check_lint
    strategy:
      fail-fast: false
--- a/.github/workflows/nightly.yml
+++ b/.github/workflows/nightly.yml
@@ -52,8 +52,6 @@ jobs:
          python-version: ${{ matrix.python_version }}
      - uses: ./.github/actions/build-linux
      - uses: ./.github/actions/test-linux
-        with:
-          cpu-only: true

  build_mac_release:
    if: github.repository == 'ml-explore/mlx'
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -139,8 +139,6 @@ jobs:
          toolkit: 'cuda-12.9'
      - name: Build Python package
        uses: ./.github/actions/build-cuda-release
-        with:
-          toolkit: 'cuda-12.9'
      - name: Upload artifacts
        uses: actions/upload-artifact@v5
        with:
--- a/mlx/backend/cuda/CMakeLists.txt
+++ b/mlx/backend/cuda/CMakeLists.txt
@@ -123,14 +123,21 @@ if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8.0)
    mlx PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:--compress-mode=size>")
 endif()

-# Compute capability >= 7.0 is required for synchronization between CPU/GPU with
-# managed memory.
+# Use native CUDA arch by default.
 if(NOT DEFINED MLX_CUDA_ARCHITECTURES)
  execute_process(
-    COMMAND bash detect_cuda_arch.sh
-    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+    COMMAND __nvcc_device_query
    OUTPUT_VARIABLE MLX_CUDA_ARCHITECTURES
    OUTPUT_STRIP_TRAILING_WHITESPACE)
+  set(UPGRADABLE_ARCHITECTURES "90;100;121")
+  if(MLX_CUDA_ARCHITECTURES STREQUAL "")
+    message(
+      FATAL_ERROR
+        "Can not get native CUDA arch, must set MLX_CUDA_ARCHITECTURES")
+  elseif(MLX_CUDA_ARCHITECTURES IN_LIST UPGRADABLE_ARCHITECTURES)
+    # Use arch-specific compute capability whenever possible.
+    set(MLX_CUDA_ARCHITECTURES "${MLX_CUDA_ARCHITECTURES}a")
+  endif()
 endif()
 message(STATUS "CUDA architectures: ${MLX_CUDA_ARCHITECTURES}")
 set_target_properties(mlx PROPERTIES CUDA_ARCHITECTURES
--- a/mlx/backend/cuda/detect_cuda_arch.sh
+++ b/mlx/backend/cuda/detect_cuda_arch.sh
@@ -1,13 +0,0 @@
-#!/bin/bash
-
-arch=`__nvcc_device_query`
-case "$arch" in
-    "90")
-        echo "90a" ;;
-    "100")
-        echo "100a" ;;
-    "121")
-        echo "121a" ;;
-    *)
-        echo "native" ;;
-esac
--- a/mlx/distributed/nccl/nccl_stub/nccl_stubs.cpp
+++ b/mlx/distributed/nccl/nccl_stub/nccl_stubs.cpp
@@ -5,3 +5,48 @@
 ncclResult_t ncclGetUniqueId(ncclUniqueId*) {
  return ncclSuccess;
 }
+
+const char* ncclGetErrorString(ncclResult_t result) {
+  return nullptr;
+}
+
+ncclResult_t
+ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank) {
+  return ncclSuccess;
+}
+
+ncclResult_t ncclCommDestroy(ncclComm_t comm) {
+  return ncclSuccess;
+}
+
+ncclResult_t ncclAllGather(
+    const void* sendbuff,
+    void* recvbuff,
+    size_t sendcount,
+    ncclDataType_t datatype,
+    ncclComm_t comm,
+    cudaStream_t stream) {
+  return ncclSuccess;
+}
+
+ncclResult_t ncclAllReduce(
+    const void* sendbuff,
+    void* recvbuff,
+    size_t count,
+    ncclDataType_t datatype,
+    ncclRedOp_t op,
+    ncclComm_t comm,
+    cudaStream_t stream) {
+  return ncclSuccess;
+}
+
+ncclResult_t ncclReduceScatter(
+    const void* sendbuff,
+    void* recvbuff,
+    size_t recvcount,
+    ncclDataType_t datatype,
+    ncclRedOp_t op,
+    ncclComm_t comm,
+    cudaStream_t stream) {
+  return ncclSuccess;
+}