Build and test with multiple CUDA versions (#2780)

This commit is contained in:
Cheng
2025-11-17 09:19:02 +09:00
committed by GitHub
parent b7214ff01e
commit 472c43a0c8
6 changed files with 52 additions and 33 deletions

View File

@@ -2,8 +2,8 @@ name: 'Build CUDA wheel'
description: 'Build CUDA wheel'
inputs:
nvcc-location:
description: 'Location of nvcc compiler'
toolkit:
description: 'The CUDA toolkit'
required: true
runs:
@@ -12,7 +12,7 @@ runs:
- name: Build package
shell: bash
env:
CMAKE_ARGS: -DMLX_BUILD_CUDA=ON -DCMAKE_CUDA_COMPILER=${{ inputs.nvcc-location }}
CMAKE_ARGS: -DMLX_BUILD_CUDA=ON -DCMAKE_CUDA_COMPILER=/usr/local/${{ inputs.toolkit }}/bin/nvcc
run: |
pip install auditwheel build patchelf setuptools
python setup.py clean --all

View File

@@ -2,10 +2,9 @@ name: 'Build and Test with CUDA'
description: 'Build and test MLX with CUDA'
inputs:
nvcc-location:
description: 'Location of nvcc compiler'
toolkit:
description: 'The CUDA toolkit'
required: true
default: '/usr/local/cuda-12.9/bin/nvcc'
runs:
using: "composite"
@@ -14,7 +13,7 @@ runs:
shell: bash
env:
DEBUG: 1
CMAKE_ARGS: -DMLX_BUILD_CUDA=ON -DCMAKE_COMPILE_WARNING_AS_ERROR=ON -DCMAKE_CUDA_COMPILER=${{ inputs.nvcc-location }}
CMAKE_ARGS: -DMLX_BUILD_CUDA=ON -DCMAKE_COMPILE_WARNING_AS_ERROR=ON -DCMAKE_CUDA_COMPILER=/usr/local/${{ inputs.toolkit }}/bin/nvcc
run: pip install --no-build-isolation -e ".[dev]" -v
- name: Build CPP only
@@ -22,6 +21,6 @@ runs:
run: |
cmake . -B build \
-DMLX_BUILD_CUDA=ON \
-DCMAKE_CUDA_COMPILER=${{ inputs.nvcc-location }} \
-DCMAKE_CUDA_COMPILER=/usr/local/${{ inputs.toolkit }}/bin/nvcc \
-DCMAKE_BUILD_TYPE=DEBUG
cmake --build build -j $(nproc)

View File

@@ -2,14 +2,10 @@ name: 'Setup Linux Environment'
description: 'Install dependencies for Linux builds'
inputs:
runner-type:
description: 'Whether to set this up as a linux or CUDA runner'
toolkit:
description: 'Which toolkit to install'
required: false
default: 'linux'
type: choice
options:
- linux
- cuda
default: 'cpu'
python-version:
description: 'Version of python to set up'
required: false
@@ -21,7 +17,7 @@ runs:
- name: Use ccache
uses: hendrikmuhs/ccache-action@v1.2
with:
key: ccache-${{ inputs.runner-type }}-${{ runner.arch }}-py${{ inputs.python-version }}
key: ccache-${{ runner.os }}-${{ runner.arch }}-${{ inputs.toolkit }}-py${{ inputs.python-version }}
max-size: 1GB
- name: Install common dependencies
@@ -48,21 +44,33 @@ runs:
shell: bash
run: sudo apt-get install -y openmpi-bin openmpi-common libopenmpi-dev
- name: Network CUDA installation from packages
if: inputs.runner-type == 'cuda'
shell: bash ## Specific to Ubuntu 22.04 & Architecture x86_64
- name: Install CUDA toolkit
if: ${{ startsWith(inputs.toolkit, 'cuda') }}
shell: bash
env:
# Note: the CI machine does not meet CUDA 13's driver requirement.
# Compatibility matrix:
# https://docs.nvidia.com/deeplearning/cudnn/backend/latest/reference/support-matrix.html
# The `nvcc` is installed into `/usr/local/cuda-VERSION/bin/nvcc` - but
# it's *not* on the default toolkit path.
PACKAGES: |
{
"cuda-12.6": "libcudnn9-dev-cuda-12 cuda-toolkit-12-6",
"cuda-12.8": "libcudnn9-dev-cuda-12 cuda-toolkit-12-8",
"cuda-12.9": "libcudnn9-dev-cuda-12 cuda-toolkit-12-9",
"cuda-13.0": "libcudnn9-dev-cuda-13 cuda-toolkit-13-0"
}
run: |
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
export ARCH=${{ runner.arch == 'arm64' && 'arm64' || 'x86_64' }}
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/$ARCH/cuda-keyring_1.1-1_all.deb
sudo dpkg -i cuda-keyring_1.1-1_all.deb
sudo apt-get update
sudo apt-get install -y libcudnn9-dev-cuda-12 libnccl2 libnccl-dev cuda-toolkit-12-9
# Note: This installs CUDA 12.9, which is the latest supported by cuDNN 9.x and works with the NVidia 570 drivers
# cuda-toolkit by itself installs version 13 (+) and requires updated drives (580+), which require a reboot to function properly.
# Compatibility matrix: https://docs.nvidia.com/deeplearning/cudnn/backend/latest/reference/support-matrix.html
# This also drops `nvcc` into `/usr/local/cuda-12.9/bin/nvcc` - but it's *not* on the default PATH
sudo apt-get install -y \
libnccl2 libnccl-dev \
${{ fromJson(env.PACKAGES)[inputs.toolkit] }}
- name: Package and Driver Report
if: inputs.runner-type == 'cuda'
- name: CUDA packages and driver report
if: ${{ startsWith(inputs.toolkit, 'cuda') }}
shell: bash
run: |
sudo apt-get install -y ubuntu-drivers-common dkms

View File

@@ -80,13 +80,19 @@ jobs:
build_cuda_with_tests:
if: github.repository == 'ml-explore/mlx'
strategy:
fail-fast: false
matrix:
toolkit: ['cuda-12.8', 'cuda-12.9']
runs-on: gpu-t4-4-core
steps:
- uses: actions/checkout@v5
- uses: ./.github/actions/setup-linux
with:
runner-type: 'cuda'
toolkit: ${{ matrix.toolkit }}
- uses: ./.github/actions/build-cuda
with:
toolkit: ${{ matrix.toolkit }}
- uses: ./.github/actions/test-linux
build_cuda_release:
@@ -96,11 +102,11 @@ jobs:
- uses: actions/checkout@v5
- uses: ./.github/actions/setup-linux
with:
runner-type: 'cuda'
toolkit: 'cuda-12.9'
- name: Build Python package
uses: ./.github/actions/build-cuda-release
with:
nvcc-location: '/usr/local/cuda-12.9/bin/nvcc'
toolkit: 'cuda-12.9'
- name: Upload artifacts
uses: actions/upload-artifact@v5
with:

View File

@@ -49,14 +49,20 @@ jobs:
cuda_build_and_test:
if: github.repository == 'ml-explore/mlx'
strategy:
fail-fast: false
matrix:
toolkit: ['cuda-12.8', 'cuda-12.9']
runs-on: gpu-t4-4-core
needs: check_lint
steps:
- uses: actions/checkout@v5
- uses: ./.github/actions/setup-linux
with:
runner-type: 'cuda'
toolkit: ${{ matrix.toolkit }}
- uses: ./.github/actions/build-cuda
with:
toolkit: ${{ matrix.toolkit }}
- uses: ./.github/actions/test-linux
build_documentation:

View File

@@ -136,11 +136,11 @@ jobs:
- uses: actions/checkout@v5
- uses: ./.github/actions/setup-linux
with:
runner-type: 'cuda'
toolkit: 'cuda-12.9'
- name: Build Python package
uses: ./.github/actions/build-cuda-release
with:
nvcc-location: '/usr/local/cuda-12.9/bin/nvcc'
toolkit: 'cuda-12.9'
- name: Upload artifacts
uses: actions/upload-artifact@v5
with: