Build and test with multiple CUDA versions (#2780)

2025-12-11 23:14:50 +08:00 · 2025-11-17 09:19:02 +09:00
parent b7214ff01e
commit 472c43a0c8
6 changed files with 52 additions and 33 deletions
--- a/.github/actions/build-cuda-release/action.yml
+++ b/.github/actions/build-cuda-release/action.yml
@@ -2,8 +2,8 @@ name: 'Build CUDA wheel'
 description: 'Build CUDA wheel'

 inputs:
-  nvcc-location:
-    description: 'Location of nvcc compiler'
+  toolkit:
+    description: 'The CUDA toolkit'
    required: true

 runs:
@@ -12,7 +12,7 @@ runs:
    - name: Build package
      shell: bash
      env:
-        CMAKE_ARGS: -DMLX_BUILD_CUDA=ON -DCMAKE_CUDA_COMPILER=${{ inputs.nvcc-location }}
+        CMAKE_ARGS: -DMLX_BUILD_CUDA=ON -DCMAKE_CUDA_COMPILER=/usr/local/${{ inputs.toolkit }}/bin/nvcc
      run: |
        pip install auditwheel build patchelf setuptools
        python setup.py clean --all
--- a/.github/actions/build-cuda/action.yml
+++ b/.github/actions/build-cuda/action.yml
@@ -2,10 +2,9 @@ name: 'Build and Test with CUDA'
 description: 'Build and test MLX with CUDA'

 inputs:
-  nvcc-location:
-    description: 'Location of nvcc compiler'
+  toolkit:
+    description: 'The CUDA toolkit'
    required: true
-    default: '/usr/local/cuda-12.9/bin/nvcc'

 runs:
  using: "composite"
@@ -14,7 +13,7 @@ runs:
      shell: bash
      env:
        DEBUG: 1
-        CMAKE_ARGS: -DMLX_BUILD_CUDA=ON -DCMAKE_COMPILE_WARNING_AS_ERROR=ON -DCMAKE_CUDA_COMPILER=${{ inputs.nvcc-location }}
+        CMAKE_ARGS: -DMLX_BUILD_CUDA=ON -DCMAKE_COMPILE_WARNING_AS_ERROR=ON -DCMAKE_CUDA_COMPILER=/usr/local/${{ inputs.toolkit }}/bin/nvcc
      run: pip install --no-build-isolation -e ".[dev]" -v

    - name: Build CPP only
@@ -22,6 +21,6 @@ runs:
      run: |
        cmake . -B build \
          -DMLX_BUILD_CUDA=ON \
-          -DCMAKE_CUDA_COMPILER=${{ inputs.nvcc-location }} \
+          -DCMAKE_CUDA_COMPILER=/usr/local/${{ inputs.toolkit }}/bin/nvcc \
          -DCMAKE_BUILD_TYPE=DEBUG
        cmake --build build -j $(nproc)
--- a/.github/actions/setup-linux/action.yml
+++ b/.github/actions/setup-linux/action.yml
@@ -2,14 +2,10 @@ name: 'Setup Linux Environment'
 description: 'Install dependencies for Linux builds'

 inputs:
-  runner-type:
-    description: 'Whether to set this up as a linux or CUDA runner'
+  toolkit:
+    description: 'Which toolkit to install'
    required: false
-    default: 'linux'
-    type: choice
-    options:
-      - linux
-      - cuda
+    default: 'cpu'
  python-version:
    description: 'Version of python to set up'
    required: false
@@ -21,7 +17,7 @@ runs:
    - name: Use ccache
      uses: hendrikmuhs/ccache-action@v1.2
      with:
-        key: ccache-${{ inputs.runner-type }}-${{ runner.arch }}-py${{ inputs.python-version }}
+        key: ccache-${{ runner.os }}-${{ runner.arch }}-${{ inputs.toolkit }}-py${{ inputs.python-version }}
        max-size: 1GB

    - name: Install common dependencies
@@ -48,21 +44,33 @@ runs:
      shell: bash
      run: sudo apt-get install -y openmpi-bin openmpi-common libopenmpi-dev

-    - name: Network CUDA installation from packages
-      if: inputs.runner-type == 'cuda'
-      shell: bash ## Specific to Ubuntu 22.04 & Architecture x86_64
+    - name: Install CUDA toolkit
+      if: ${{ startsWith(inputs.toolkit, 'cuda') }}
+      shell: bash
+      env:
+        # Note: the CI machine does not meet CUDA 13's driver requirement.
+        # Compatibility matrix:
+        # https://docs.nvidia.com/deeplearning/cudnn/backend/latest/reference/support-matrix.html
+        # `nvcc` is installed into `/usr/local/cuda-VERSION/bin/nvcc`, but that
+        # directory is *not* on the default PATH.
+        PACKAGES: |
+          {
+            "cuda-12.6": "libcudnn9-dev-cuda-12 cuda-toolkit-12-6",
+            "cuda-12.8": "libcudnn9-dev-cuda-12 cuda-toolkit-12-8",
+            "cuda-12.9": "libcudnn9-dev-cuda-12 cuda-toolkit-12-9",
+            "cuda-13.0": "libcudnn9-dev-cuda-13 cuda-toolkit-13-0"
+          }
      run: |
-        wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
+        export ARCH=${{ runner.arch == 'arm64' && 'arm64' || 'x86_64' }}
+        wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/$ARCH/cuda-keyring_1.1-1_all.deb
        sudo dpkg -i cuda-keyring_1.1-1_all.deb
        sudo apt-get update
-        sudo apt-get install -y libcudnn9-dev-cuda-12 libnccl2 libnccl-dev cuda-toolkit-12-9
-      # Note: This installs CUDA 12.9, which is the latest supported by cuDNN 9.x and works with the NVidia 570 drivers
-      # cuda-toolkit by itself installs version 13 (+) and requires updated drives (580+), which require a reboot to function properly.
-      # Compatibility matrix: https://docs.nvidia.com/deeplearning/cudnn/backend/latest/reference/support-matrix.html
-      # This also drops `nvcc` into `/usr/local/cuda-12.9/bin/nvcc` - but it's *not* on the default PATH
+        sudo apt-get install -y \
+            libnccl2 libnccl-dev \
+            ${{ fromJson(env.PACKAGES)[inputs.toolkit] }}

-    - name: Package and Driver Report
-      if: inputs.runner-type == 'cuda'
+    - name: CUDA packages and driver report
+      if: ${{ startsWith(inputs.toolkit, 'cuda') }}
      shell: bash
      run: |
        sudo apt-get install -y ubuntu-drivers-common dkms
--- a/.github/workflows/nightly.yml
+++ b/.github/workflows/nightly.yml
@@ -80,13 +80,19 @@ jobs:

  build_cuda_with_tests:
    if: github.repository == 'ml-explore/mlx'
+    strategy:
+      fail-fast: false
+      matrix:
+        toolkit: ['cuda-12.8', 'cuda-12.9']
    runs-on: gpu-t4-4-core
    steps:
      - uses: actions/checkout@v5
      - uses: ./.github/actions/setup-linux
        with:
-          runner-type: 'cuda'
+          toolkit: ${{ matrix.toolkit }}
      - uses: ./.github/actions/build-cuda
+        with:
+          toolkit: ${{ matrix.toolkit }}
      - uses: ./.github/actions/test-linux

  build_cuda_release:
@@ -96,11 +102,11 @@ jobs:
      - uses: actions/checkout@v5
      - uses: ./.github/actions/setup-linux
        with:
-          runner-type: 'cuda'
+          toolkit: 'cuda-12.9'
      - name: Build Python package
        uses: ./.github/actions/build-cuda-release
        with:
-          nvcc-location: '/usr/local/cuda-12.9/bin/nvcc'
+          toolkit: 'cuda-12.9'
      - name: Upload artifacts
        uses: actions/upload-artifact@v5
        with:
--- a/.github/workflows/pull_request.yml
+++ b/.github/workflows/pull_request.yml
@@ -49,14 +49,20 @@ jobs:

  cuda_build_and_test:
    if: github.repository == 'ml-explore/mlx'
+    strategy:
+      fail-fast: false
+      matrix:
+        toolkit: ['cuda-12.8', 'cuda-12.9']
    runs-on: gpu-t4-4-core
    needs: check_lint
    steps:
      - uses: actions/checkout@v5
      - uses: ./.github/actions/setup-linux
        with:
-          runner-type: 'cuda'
+          toolkit: ${{ matrix.toolkit }}
      - uses: ./.github/actions/build-cuda
+        with:
+          toolkit: ${{ matrix.toolkit }}
      - uses: ./.github/actions/test-linux

  build_documentation:
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -136,11 +136,11 @@ jobs:
      - uses: actions/checkout@v5
      - uses: ./.github/actions/setup-linux
        with:
-          runner-type: 'cuda'
+          toolkit: 'cuda-12.9'
      - name: Build Python package
        uses: ./.github/actions/build-cuda-release
        with:
-          nvcc-location: '/usr/local/cuda-12.9/bin/nvcc'
+          toolkit: 'cuda-12.9'
      - name: Upload artifacts
        uses: actions/upload-artifact@v5
        with: