From 472c43a0c8eb570ac266ad15b208cf4c3622109c Mon Sep 17 00:00:00 2001 From: Cheng Date: Mon, 17 Nov 2025 09:19:02 +0900 Subject: [PATCH] Build and test with multiple CUDA versions (#2780) --- .github/actions/build-cuda-release/action.yml | 6 +-- .github/actions/build-cuda/action.yml | 9 ++-- .github/actions/setup-linux/action.yml | 46 +++++++++++-------- .github/workflows/nightly.yml | 12 +++-- .github/workflows/pull_request.yml | 8 +++- .github/workflows/release.yml | 4 +- 6 files changed, 52 insertions(+), 33 deletions(-) diff --git a/.github/actions/build-cuda-release/action.yml b/.github/actions/build-cuda-release/action.yml index 22e6a7c25..2976fa231 100644 --- a/.github/actions/build-cuda-release/action.yml +++ b/.github/actions/build-cuda-release/action.yml @@ -2,8 +2,8 @@ name: 'Build CUDA wheel' description: 'Build CUDA wheel' inputs: - nvcc-location: - description: 'Location of nvcc compiler' + toolkit: + description: 'The CUDA toolkit' required: true runs: @@ -12,7 +12,7 @@ runs: - name: Build package shell: bash env: - CMAKE_ARGS: -DMLX_BUILD_CUDA=ON -DCMAKE_CUDA_COMPILER=${{ inputs.nvcc-location }} + CMAKE_ARGS: -DMLX_BUILD_CUDA=ON -DCMAKE_CUDA_COMPILER=/usr/local/${{ inputs.toolkit }}/bin/nvcc run: | pip install auditwheel build patchelf setuptools python setup.py clean --all diff --git a/.github/actions/build-cuda/action.yml b/.github/actions/build-cuda/action.yml index cb61aa880..2a3b39883 100644 --- a/.github/actions/build-cuda/action.yml +++ b/.github/actions/build-cuda/action.yml @@ -2,10 +2,9 @@ name: 'Build and Test with CUDA' description: 'Build and test MLX with CUDA' inputs: - nvcc-location: - description: 'Location of nvcc compiler' + toolkit: + description: 'The CUDA toolkit' required: true - default: '/usr/local/cuda-12.9/bin/nvcc' runs: using: "composite" @@ -14,7 +13,7 @@ runs: shell: bash env: DEBUG: 1 - CMAKE_ARGS: -DMLX_BUILD_CUDA=ON -DCMAKE_COMPILE_WARNING_AS_ERROR=ON -DCMAKE_CUDA_COMPILER=${{ inputs.nvcc-location }} + CMAKE_ARGS: -DMLX_BUILD_CUDA=ON -DCMAKE_COMPILE_WARNING_AS_ERROR=ON -DCMAKE_CUDA_COMPILER=/usr/local/${{ inputs.toolkit }}/bin/nvcc run: pip install --no-build-isolation -e ".[dev]" -v - name: Build CPP only @@ -22,6 +21,6 @@ runs: run: | cmake . -B build \ -DMLX_BUILD_CUDA=ON \ - -DCMAKE_CUDA_COMPILER=${{ inputs.nvcc-location }} \ + -DCMAKE_CUDA_COMPILER=/usr/local/${{ inputs.toolkit }}/bin/nvcc \ -DCMAKE_BUILD_TYPE=DEBUG cmake --build build -j $(nproc) diff --git a/.github/actions/setup-linux/action.yml b/.github/actions/setup-linux/action.yml index 6dc2efe99..455ea41c3 100644 --- a/.github/actions/setup-linux/action.yml +++ b/.github/actions/setup-linux/action.yml @@ -2,14 +2,10 @@ name: 'Setup Linux Environment' description: 'Install dependencies for Linux builds' inputs: - runner-type: - description: 'Whether to set this up as a linux or CUDA runner' + toolkit: + description: 'Which toolkit to install' required: false - default: 'linux' - type: choice - options: - - linux - - cuda + default: 'cpu' python-version: description: 'Version of python to set up' required: false @@ -21,7 +17,7 @@ runs: - name: Use ccache uses: hendrikmuhs/ccache-action@v1.2 with: - key: ccache-${{ inputs.runner-type }}-${{ runner.arch }}-py${{ inputs.python-version }} + key: ccache-${{ runner.os }}-${{ runner.arch }}-${{ inputs.toolkit }}-py${{ inputs.python-version }} max-size: 1GB - name: Install common dependencies @@ -48,21 +44,33 @@ runs: shell: bash run: sudo apt-get install -y openmpi-bin openmpi-common libopenmpi-dev - - name: Network CUDA installation from packages - if: inputs.runner-type == 'cuda' - shell: bash ## Specific to Ubuntu 22.04 & Architecture x86_64 + - name: Install CUDA toolkit + if: ${{ startsWith(inputs.toolkit, 'cuda') }} + shell: bash + env: + # Note: the CI machine does not meet CUDA 13's driver requirement. + # Compatibility matrix: + # https://docs.nvidia.com/deeplearning/cudnn/backend/latest/reference/support-matrix.html + # The `nvcc` is installed into `/usr/local/cuda-VERSION/bin/nvcc` - but + # it's *not* on the default toolkit path. + PACKAGES: | + { + "cuda-12.6": "libcudnn9-dev-cuda-12 cuda-toolkit-12-6", + "cuda-12.8": "libcudnn9-dev-cuda-12 cuda-toolkit-12-8", + "cuda-12.9": "libcudnn9-dev-cuda-12 cuda-toolkit-12-9", + "cuda-13.0": "libcudnn9-dev-cuda-13 cuda-toolkit-13-0" + } run: | - wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb + export ARCH=${{ runner.arch == 'arm64' && 'arm64' || 'x86_64' }} + wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/$ARCH/cuda-keyring_1.1-1_all.deb sudo dpkg -i cuda-keyring_1.1-1_all.deb sudo apt-get update - sudo apt-get install -y libcudnn9-dev-cuda-12 libnccl2 libnccl-dev cuda-toolkit-12-9 - # Note: This installs CUDA 12.9, which is the latest supported by cuDNN 9.x and works with the NVidia 570 drivers - # cuda-toolkit by itself installs version 13 (+) and requires updated drives (580+), which require a reboot to function properly. - # Compatibility matrix: https://docs.nvidia.com/deeplearning/cudnn/backend/latest/reference/support-matrix.html - # This also drops `nvcc` into `/usr/local/cuda-12.9/bin/nvcc` - but it's *not* on the default PATH + sudo apt-get install -y \ + libnccl2 libnccl-dev \ + ${{ fromJson(env.PACKAGES)[inputs.toolkit] }} - - name: Package and Driver Report - if: inputs.runner-type == 'cuda' + - name: CUDA packages and driver report + if: ${{ startsWith(inputs.toolkit, 'cuda') }} shell: bash run: | sudo apt-get install -y ubuntu-drivers-common dkms diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml index 2909e8359..799c624b7 100644 --- a/.github/workflows/nightly.yml +++ b/.github/workflows/nightly.yml @@ -80,13 +80,19 @@ jobs: build_cuda_with_tests: if: github.repository == 'ml-explore/mlx' + strategy: + fail-fast: false + matrix: + toolkit: ['cuda-12.8', 'cuda-12.9'] runs-on: gpu-t4-4-core steps: - uses: actions/checkout@v5 - uses: ./.github/actions/setup-linux with: - runner-type: 'cuda' + toolkit: ${{ matrix.toolkit }} - uses: ./.github/actions/build-cuda + with: + toolkit: ${{ matrix.toolkit }} - uses: ./.github/actions/test-linux build_cuda_release: @@ -96,11 +102,11 @@ jobs: - uses: actions/checkout@v5 - uses: ./.github/actions/setup-linux with: - runner-type: 'cuda' + toolkit: 'cuda-12.9' - name: Build Python package uses: ./.github/actions/build-cuda-release with: - nvcc-location: '/usr/local/cuda-12.9/bin/nvcc' + toolkit: 'cuda-12.9' - name: Upload artifacts uses: actions/upload-artifact@v5 with: diff --git a/.github/workflows/pull_request.yml b/.github/workflows/pull_request.yml index 3a0fadade..e658a120f 100644 --- a/.github/workflows/pull_request.yml +++ b/.github/workflows/pull_request.yml @@ -49,14 +49,20 @@ jobs: cuda_build_and_test: if: github.repository == 'ml-explore/mlx' + strategy: + fail-fast: false + matrix: + toolkit: ['cuda-12.8', 'cuda-12.9'] runs-on: gpu-t4-4-core needs: check_lint steps: - uses: actions/checkout@v5 - uses: ./.github/actions/setup-linux with: - runner-type: 'cuda' + toolkit: ${{ matrix.toolkit }} - uses: ./.github/actions/build-cuda + with: + toolkit: ${{ matrix.toolkit }} - uses: ./.github/actions/test-linux build_documentation: diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index f23dc572e..c4bb28ce1 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -136,11 +136,11 @@ jobs: - uses: actions/checkout@v5 - uses: ./.github/actions/setup-linux with: - runner-type: 'cuda' + toolkit: 'cuda-12.9' - name: Build Python package uses: ./.github/actions/build-cuda-release with: - nvcc-location: '/usr/local/cuda-12.9/bin/nvcc' + toolkit: 'cuda-12.9' - name: Upload artifacts uses: actions/upload-artifact@v5 with: