Merge build-cuda and build-linux actions (#2783)

2025-12-16 01:49:05 +08:00 · 2025-11-25 20:06:42 +09:00
parent f8bd675655
commit c9f4dc851f
11 changed files with 120 additions and 94 deletions
--- a/.github/actions/build-cuda-release/action.yml
+++ b/.github/actions/build-cuda-release/action.yml
@@ -1,18 +1,13 @@
 name: 'Build CUDA wheel'
 description: 'Build CUDA wheel'
 inputs:
  toolkit:
    description: 'The CUDA toolkit'
    required: true
 runs:
  using: "composite"
  steps:
    - name: Build package
      shell: bash
      env:
-        CMAKE_ARGS: -DMLX_BUILD_CUDA=ON -DCMAKE_CUDA_COMPILER=/usr/local/${{ inputs.toolkit }}/bin/nvcc
+        CMAKE_ARGS: -DMLX_BUILD_CUDA=ON
      run: |
        pip install auditwheel build patchelf setuptools
        python setup.py clean --all
--- a/.github/actions/build-cuda/action.yml
+++ b/.github/actions/build-cuda/action.yml
@@ -1,26 +0,0 @@
 name: 'Build and Test with CUDA'
 description: 'Build and test MLX with CUDA'
 inputs:
  toolkit:
    description: 'The CUDA toolkit'
    required: true
 runs:
  using: "composite"
  steps:
    - name: Install Python package
      shell: bash
      env:
        DEBUG: 1
        CMAKE_ARGS: -DMLX_BUILD_CUDA=ON -DCMAKE_COMPILE_WARNING_AS_ERROR=ON -DCMAKE_CUDA_COMPILER=/usr/local/${{ inputs.toolkit }}/bin/nvcc
      run: pip install --no-build-isolation -e ".[dev]" -v
    - name: Build CPP only
      shell: bash
      run: |
        cmake . -B build \
          -DMLX_BUILD_CUDA=ON \
          -DCMAKE_CUDA_COMPILER=/usr/local/${{ inputs.toolkit }}/bin/nvcc \
          -DCMAKE_BUILD_TYPE=DEBUG
        cmake --build build -j $(nproc)
--- a/.github/actions/build-linux/action.yml
+++ b/.github/actions/build-linux/action.yml
@@ -1,25 +1,41 @@
 name: 'Build and Test on Linux'
-description: 'Build and test MLX on Linux'
+
 inputs:
  toolkit:
    description: 'The toolkit to build with'
    required: false
    default: 'cpu'
 runs:
  using: "composite"
  steps:
    - name: Install Python package
      id: python_build
      shell: sh
      env:
        CMAKE_ARGS: "-DCMAKE_COMPILE_WARNING_AS_ERROR=ON"
        DEBUG: 1
-      run: pip install --no-build-isolation -e ".[dev]" -v
+        CMAKE_ARGS: >-
          -DCMAKE_COMPILE_WARNING_AS_ERROR=ON
          -DMLX_BUILD_CUDA=${{ startsWith(inputs.toolkit, 'cuda') && 'ON' || 'OFF' }}
      run: |
        if ${{ startsWith(inputs.toolkit, 'cuda') && runner.arch == 'arm64' }} ; then
          # There is no GPU in the arm64 runner, so use a common arch.
          CMAKE_ARGS="$CMAKE_ARGS -DMLX_CUDA_ARCHITECTURES=90a"
          # Cannot build tests when the built executables cannot run.
          CMAKE_ARGS="$CMAKE_ARGS -DMLX_BUILD_TESTS=OFF"
        fi
        pip install --no-build-isolation -e ".[dev]" -v
        # Pass the CMAKE_ARGS to following steps.
        echo CMAKE_ARGS="$CMAKE_ARGS" >> $GITHUB_OUTPUT
    - name: Generate package stubs
      shell: sh
      run: |
        pip install typing_extensions
        python setup.py generate_stubs
-    
+
    - name: Build CPP only
      shell: bash
      run: |
-        mkdir -p build && cd build
+        cmake . -B build -DCMAKE_BUILD_TYPE=Debug ${{ steps.python_build.outputs.CMAKE_ARGS }}
-        cmake .. -DMLX_BUILD_METAL=OFF -DCMAKE_BUILD_TYPE=DEBUG
+        cmake --build build -j $(nproc)
        make -j $(nproc)
--- a/.github/actions/setup-linux/action.yml
+++ b/.github/actions/setup-linux/action.yml
@@ -51,8 +51,6 @@ runs:
        # Note: the CI machine does not meet CUDA 13's driver requirement.
        # Compatibility matrix:
        # https://docs.nvidia.com/deeplearning/cudnn/backend/latest/reference/support-matrix.html
        # The `nvcc` is installed into `/usr/local/cuda-VERSION/bin/nvcc` - but
        # it's *not* on the default toolkit path.
        PACKAGES: |
          {
            "cuda-12.6": "libcudnn9-dev-cuda-12 cuda-toolkit-12-6",
@@ -60,13 +58,16 @@ runs:
            "cuda-13.0": "libcudnn9-dev-cuda-13 cuda-toolkit-13-0"
          }
      run: |
-        export ARCH=${{ runner.arch == 'arm64' && 'arm64' || 'x86_64' }}
+        # The CUDA binaries are hosted in the "sbsa" repo; the "arm64" repo is
        # Jetson specific. SBSA means Arm Server Base System Architecture.
        ARCH=${{ runner.arch == 'arm64' && 'sbsa' || 'x86_64' }}
        wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/$ARCH/cuda-keyring_1.1-1_all.deb
        sudo dpkg -i cuda-keyring_1.1-1_all.deb
        sudo apt-get update
        sudo apt-get install -y \
            libnccl2 libnccl-dev \
            ${{ fromJson(env.PACKAGES)[inputs.toolkit] }}
        echo "/usr/local/${{ inputs.toolkit }}/bin" >> $GITHUB_PATH
    - name: CUDA packages and driver report
      if: ${{ startsWith(inputs.toolkit, 'cuda') }}
--- a/.github/actions/test-linux/action.yml
+++ b/.github/actions/test-linux/action.yml
@@ -1,8 +1,8 @@
 name: 'Run Linux tests'
 inputs:
-  cpu-only:
+  has-gpu:
-    description: 'Skip GPU tests'
+    description: 'Run GPU tests'
    required: false
    default: false
@@ -17,7 +17,7 @@ runs:
        echo "::endgroup::"
    - name: Run distributed tests
-      if: ${{ inputs.cpu-only == 'true' }}
+      if: ${{ inputs.has-gpu == 'false' }}
      shell: bash
      run: |
        echo "::group::Distributed tests"
@@ -30,7 +30,7 @@ runs:
        echo "::endgroup::"
    - name: Run Python tests - CPU
-      if: ${{ inputs.cpu-only == 'true' }}
+      if: ${{ inputs.has-gpu == 'false' }}
      shell: bash
      env:
        DEVICE: cpu
@@ -40,7 +40,7 @@ runs:
        echo "::endgroup::"
    - name: Run Python tests - GPU
-      if: ${{ inputs.cpu-only == 'false' }}
+      if: ${{ inputs.has-gpu == 'true' }}
      shell: bash
      env:
        DEVICE: gpu
@@ -59,7 +59,7 @@ runs:
        echo "::endgroup::"
    - name: Run CPP tests - GPU
-      if: ${{ inputs.cpu-only == 'false' }}
+      if: ${{ inputs.has-gpu == 'true' }}
      shell: bash
      env:
        DEVICE: gpu
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -17,29 +17,51 @@ concurrency:
 jobs:
  check_lint:
    name: Check Lint
    runs-on: ubuntu-22.04
    steps:
      - uses: actions/checkout@v6
      - uses: pre-commit/action@v3.0.1
  linux_build_and_test:
    name: Linux (cpu, ${{ matrix.arch }})
    needs: check_lint
    strategy:
      matrix:
        runner:
          - ubuntu-22.04
          - ubuntu-22.04-arm
      fail-fast: false
-    runs-on: ${{ matrix.runner }}
+      matrix:
        arch: ['x86_64', 'aarch64']
    runs-on: ${{ matrix.arch == 'x86_64' && 'ubuntu-22.04' || 'ubuntu-22.04-arm' }}
    steps:
      - uses: actions/checkout@v6
      - uses: ./.github/actions/setup-linux
      - uses: ./.github/actions/build-linux
      - uses: ./.github/actions/test-linux
  cuda_build_and_test:
    name: Linux (${{ matrix.toolkit }}, ${{ matrix.arch }})
    if: github.repository == 'ml-explore/mlx'
    needs: check_lint
    strategy:
      fail-fast: false
      matrix:
        arch: ['x86_64', 'aarch64']
        toolkit: ['cuda-12.6', 'cuda-12.9']
    runs-on: ${{ matrix.arch == 'x86_64' && 'gpu-t4-4-core' || 'ubuntu-22.04-arm' }}
    steps:
      - uses: actions/checkout@v6
      - uses: ./.github/actions/setup-linux
        with:
-          cpu-only: true
+          toolkit: ${{ matrix.toolkit }}
      - uses: ./.github/actions/build-linux
        with:
          toolkit: ${{ matrix.toolkit }}
      - uses: ./.github/actions/test-linux
        if: matrix.arch == 'x86_64'
        with:
          has-gpu: true
  mac_build_and_test:
    name: macOS (${{ matrix.macos-target }})
    if: github.repository == 'ml-explore/mlx'
    strategy:
      matrix:
@@ -53,25 +75,8 @@ jobs:
      - uses: ./.github/actions/setup-macos
      - uses: ./.github/actions/build-macos
  cuda_build_and_test:
    if: github.repository == 'ml-explore/mlx'
    strategy:
      fail-fast: false
      matrix:
        toolkit: ['cuda-12.6', 'cuda-12.9']
    runs-on: gpu-t4-4-core
    needs: check_lint
    steps:
      - uses: actions/checkout@v6
      - uses: ./.github/actions/setup-linux
        with:
          toolkit: ${{ matrix.toolkit }}
      - uses: ./.github/actions/build-cuda
        with:
          toolkit: ${{ matrix.toolkit }}
      - uses: ./.github/actions/test-linux
  build_documentation:
    name: Build Documentation
    if: github.repository == 'ml-explore/mlx'
    runs-on: ubuntu-22.04
    needs: check_lint
@@ -80,7 +85,7 @@ jobs:
      - uses: ./.github/actions/build-docs
  linux_fedora_build_cpp:
-    name: Linux Fedora CPP Build (${{ matrix.arch }})
+    name: Linux Fedora (${{ matrix.arch }})
    needs: check_lint
    strategy:
      fail-fast: false
--- a/.github/workflows/nightly.yml
+++ b/.github/workflows/nightly.yml
@@ -52,8 +52,6 @@ jobs:
          python-version: ${{ matrix.python_version }}
      - uses: ./.github/actions/build-linux
      - uses: ./.github/actions/test-linux
        with:
          cpu-only: true
  build_mac_release:
    if: github.repository == 'ml-explore/mlx'
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -139,8 +139,6 @@ jobs:
          toolkit: 'cuda-12.9'
      - name: Build Python package
        uses: ./.github/actions/build-cuda-release
        with:
          toolkit: 'cuda-12.9'
      - name: Upload artifacts
        uses: actions/upload-artifact@v5
        with:
--- a/mlx/backend/cuda/CMakeLists.txt
+++ b/mlx/backend/cuda/CMakeLists.txt
@@ -123,14 +123,21 @@ if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8.0)
    mlx PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:--compress-mode=size>")
 endif()
-# Compute capability >= 7.0 is required for synchronization between CPU/GPU with
+# Use native CUDA arch by default.
 # managed memory.
 if(NOT DEFINED MLX_CUDA_ARCHITECTURES)
  execute_process(
-    COMMAND bash detect_cuda_arch.sh
+    COMMAND __nvcc_device_query
    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
    OUTPUT_VARIABLE MLX_CUDA_ARCHITECTURES
    OUTPUT_STRIP_TRAILING_WHITESPACE)
  set(UPGRADABLE_ARCHITECTURES "90;100;121")
  if(MLX_CUDA_ARCHITECTURES STREQUAL "")
    message(
      FATAL_ERROR
        "Can not get native CUDA arch, must set MLX_CUDA_ARCHITECTURES")
  elseif(MLX_CUDA_ARCHITECTURES IN_LIST UPGRADABLE_ARCHITECTURES)
    # Use arch-specific compute capability whenever possible.
    set(MLX_CUDA_ARCHITECTURES "${MLX_CUDA_ARCHITECTURES}a")
  endif()
 endif()
 message(STATUS "CUDA architectures: ${MLX_CUDA_ARCHITECTURES}")
 set_target_properties(mlx PROPERTIES CUDA_ARCHITECTURES
--- a/mlx/backend/cuda/detect_cuda_arch.sh
+++ b/mlx/backend/cuda/detect_cuda_arch.sh
@@ -1,13 +0,0 @@
 #!/bin/bash
 arch=`__nvcc_device_query`
 case "$arch" in
    "90")
        echo "90a" ;;
    "100")
        echo "100a" ;;
    "121")
        echo "121a" ;;
    *)
        echo "native" ;;
 esac
--- a/mlx/distributed/nccl/nccl_stub/nccl_stubs.cpp
+++ b/mlx/distributed/nccl/nccl_stub/nccl_stubs.cpp
@@ -5,3 +5,48 @@
 // Stub for ncclGetUniqueId: the id pointer is ignored (nothing is written
 // through it) and success is always reported.
 ncclResult_t ncclGetUniqueId(ncclUniqueId*) {
  return ncclSuccess;
 }
 // Stub for ncclGetErrorString: returns a fixed, statically allocated message
 // regardless of `result`.
 //
 // A non-null string is returned (instead of nullptr) so that callers which
 // stream or format the result as a C string never dereference a null pointer.
 const char* ncclGetErrorString(ncclResult_t result) {
  return "NCCL stub: no error description available";
 }
 // Stub for ncclCommInitRank: ignores every argument, leaves `*comm`
 // untouched, and always reports success.
 ncclResult_t
 ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank) {
  return ncclSuccess;
 }
 // Stub for ncclCommDestroy: does nothing with `comm` and always reports
 // success.
 ncclResult_t ncclCommDestroy(ncclComm_t comm) {
  return ncclSuccess;
 }
 // Stub for ncclAllGather: no data is read from `sendbuff` or written to
 // `recvbuff`; all arguments are ignored and success is always reported.
 ncclResult_t ncclAllGather(
    const void* sendbuff,
    void* recvbuff,
    size_t sendcount,
    ncclDataType_t datatype,
    ncclComm_t comm,
    cudaStream_t stream) {
  return ncclSuccess;
 }
 // Stub for ncclAllReduce: no reduction is performed and `recvbuff` is not
 // written; all arguments are ignored and success is always reported.
 ncclResult_t ncclAllReduce(
    const void* sendbuff,
    void* recvbuff,
    size_t count,
    ncclDataType_t datatype,
    ncclRedOp_t op,
    ncclComm_t comm,
    cudaStream_t stream) {
  return ncclSuccess;
 }
 // Stub for ncclReduceScatter: no reduction or scatter is performed and
 // `recvbuff` is not written; all arguments are ignored and success is always
 // reported.
 ncclResult_t ncclReduceScatter(
    const void* sendbuff,
    void* recvbuff,
    size_t recvcount,
    ncclDataType_t datatype,
    ncclRedOp_t op,
    ncclComm_t comm,
    cudaStream_t stream) {
  return ncclSuccess;
 }