From c9f4dc851f4baa88f713cfc0215bc7b8fe2a87dc Mon Sep 17 00:00:00 2001 From: Cheng Date: Tue, 25 Nov 2025 20:06:42 +0900 Subject: [PATCH] Merge build-cuda and build-linux actions (#2783) --- .github/actions/build-cuda-release/action.yml | 7 +-- .github/actions/build-cuda/action.yml | 26 --------- .github/actions/build-linux/action.yml | 30 +++++++--- .github/actions/setup-linux/action.yml | 7 ++- .github/actions/test-linux/action.yml | 12 ++-- .../{pull_request.yml => build_and_test.yml} | 55 ++++++++++--------- .github/workflows/nightly.yml | 2 - .github/workflows/release.yml | 2 - mlx/backend/cuda/CMakeLists.txt | 15 +++-- mlx/backend/cuda/detect_cuda_arch.sh | 13 ----- mlx/distributed/nccl/nccl_stub/nccl_stubs.cpp | 45 +++++++++++++++ 11 files changed, 120 insertions(+), 94 deletions(-) delete mode 100644 .github/actions/build-cuda/action.yml rename .github/workflows/{pull_request.yml => build_and_test.yml} (79%) delete mode 100644 mlx/backend/cuda/detect_cuda_arch.sh diff --git a/.github/actions/build-cuda-release/action.yml b/.github/actions/build-cuda-release/action.yml index 2976fa231..d3fe4c301 100644 --- a/.github/actions/build-cuda-release/action.yml +++ b/.github/actions/build-cuda-release/action.yml @@ -1,18 +1,13 @@ name: 'Build CUDA wheel' description: 'Build CUDA wheel' -inputs: - toolkit: - description: 'The CUDA toolkit' - required: true - runs: using: "composite" steps: - name: Build package shell: bash env: - CMAKE_ARGS: -DMLX_BUILD_CUDA=ON -DCMAKE_CUDA_COMPILER=/usr/local/${{ inputs.toolkit }}/bin/nvcc + CMAKE_ARGS: -DMLX_BUILD_CUDA=ON run: | pip install auditwheel build patchelf setuptools python setup.py clean --all diff --git a/.github/actions/build-cuda/action.yml b/.github/actions/build-cuda/action.yml deleted file mode 100644 index 2a3b39883..000000000 --- a/.github/actions/build-cuda/action.yml +++ /dev/null @@ -1,26 +0,0 @@ -name: 'Build and Test with CUDA' -description: 'Build and test MLX with CUDA' - -inputs: - toolkit: - description: 'The CUDA toolkit' - required: true - -runs: - using: "composite" - steps: - - name: Install Python package - shell: bash - env: - DEBUG: 1 - CMAKE_ARGS: -DMLX_BUILD_CUDA=ON -DCMAKE_COMPILE_WARNING_AS_ERROR=ON -DCMAKE_CUDA_COMPILER=/usr/local/${{ inputs.toolkit }}/bin/nvcc - run: pip install --no-build-isolation -e ".[dev]" -v - - - name: Build CPP only - shell: bash - run: | - cmake . -B build \ - -DMLX_BUILD_CUDA=ON \ - -DCMAKE_CUDA_COMPILER=/usr/local/${{ inputs.toolkit }}/bin/nvcc \ - -DCMAKE_BUILD_TYPE=DEBUG - cmake --build build -j $(nproc) diff --git a/.github/actions/build-linux/action.yml b/.github/actions/build-linux/action.yml index 6273ab8de..337b94a2e 100644 --- a/.github/actions/build-linux/action.yml +++ b/.github/actions/build-linux/action.yml @@ -1,25 +1,41 @@ name: 'Build and Test on Linux' -description: 'Build and test MLX on Linux' + +inputs: + toolkit: + description: 'The toolkit to build with' + required: false + default: 'cpu' runs: using: "composite" steps: - name: Install Python package + id: python_build shell: sh env: - CMAKE_ARGS: "-DCMAKE_COMPILE_WARNING_AS_ERROR=ON" DEBUG: 1 - run: pip install --no-build-isolation -e ".[dev]" -v + CMAKE_ARGS: >- + -DCMAKE_COMPILE_WARNING_AS_ERROR=ON + -DMLX_BUILD_CUDA=${{ startsWith(inputs.toolkit, 'cuda') && 'ON' || 'OFF' }} + run: | + if ${{ startsWith(inputs.toolkit, 'cuda') && runner.arch == 'arm64' }} ; then + # There is no GPU in arm64 runner, use a common arch. + CMAKE_ARGS="$CMAKE_ARGS -DMLX_CUDA_ARCHITECTURES=90a" + # Can not build tests when the built executables can not run. + CMAKE_ARGS="$CMAKE_ARGS -DMLX_BUILD_TESTS=OFF" + fi + pip install --no-build-isolation -e ".[dev]" -v + # Pass the CMAKE_ARGS to following steps. + echo CMAKE_ARGS="$CMAKE_ARGS" >> $GITHUB_OUTPUT - name: Generate package stubs shell: sh run: | pip install typing_extensions python setup.py generate_stubs - + - name: Build CPP only shell: bash run: | - mkdir -p build && cd build - cmake .. -DMLX_BUILD_METAL=OFF -DCMAKE_BUILD_TYPE=DEBUG - make -j $(nproc) + cmake . -B build -DCMAKE_BUILD_TYPE=Debug ${{ steps.python_build.outputs.CMAKE_ARGS }} + cmake --build build -j $(nproc) diff --git a/.github/actions/setup-linux/action.yml b/.github/actions/setup-linux/action.yml index f6cfe1060..721a097a3 100644 --- a/.github/actions/setup-linux/action.yml +++ b/.github/actions/setup-linux/action.yml @@ -51,8 +51,6 @@ runs: # Note: the CI machine does not meet CUDA 13's driver requirement. # Compatibility matrix: # https://docs.nvidia.com/deeplearning/cudnn/backend/latest/reference/support-matrix.html - # The `nvcc` is installed into `/usr/local/cuda-VERSION/bin/nvcc` - but - # it's *not* on the default toolkit path. PACKAGES: | { "cuda-12.6": "libcudnn9-dev-cuda-12 cuda-toolkit-12-6", @@ -60,13 +58,16 @@ runs: "cuda-13.0": "libcudnn9-dev-cuda-13 cuda-toolkit-13-0" } run: | - export ARCH=${{ runner.arch == 'arm64' && 'arm64' || 'x86_64' }} + # The CUDA binaries are hosted in the "sbsa" repo, the "arm64" repo is + # Jetson specific. SBSA means Arm Server Base System Architecture. + ARCH=${{ runner.arch == 'arm64' && 'sbsa' || 'x86_64' }} wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/$ARCH/cuda-keyring_1.1-1_all.deb sudo dpkg -i cuda-keyring_1.1-1_all.deb sudo apt-get update sudo apt-get install -y \ libnccl2 libnccl-dev \ ${{ fromJson(env.PACKAGES)[inputs.toolkit] }} + echo "/usr/local/${{ inputs.toolkit }}/bin" >> $GITHUB_PATH - name: CUDA packages and driver report if: ${{ startsWith(inputs.toolkit, 'cuda') }} diff --git a/.github/actions/test-linux/action.yml b/.github/actions/test-linux/action.yml index 441da7d92..3258670ef 100644 --- a/.github/actions/test-linux/action.yml +++ b/.github/actions/test-linux/action.yml @@ -1,8 +1,8 @@ name: 'Run Linux tests' inputs: - cpu-only: - description: 'Skip GPU tests' + has-gpu: + description: 'Run GPU tests' required: false default: false @@ -17,7 +17,7 @@ runs: echo "::endgroup::" - name: Run distributed tests - if: ${{ inputs.cpu-only == 'true' }} + if: ${{ inputs.has-gpu == 'false' }} shell: bash run: | echo "::group::Distributed tests" @@ -30,7 +30,7 @@ runs: echo "::endgroup::" - name: Run Python tests - CPU - if: ${{ inputs.cpu-only == 'true' }} + if: ${{ inputs.has-gpu == 'false' }} shell: bash env: DEVICE: cpu @@ -40,7 +40,7 @@ runs: echo "::endgroup::" - name: Run Python tests - GPU - if: ${{ inputs.cpu-only == 'false' }} + if: ${{ inputs.has-gpu == 'true' }} shell: bash env: DEVICE: gpu @@ -59,7 +59,7 @@ runs: echo "::endgroup::" - name: Run CPP tests - GPU - if: ${{ inputs.cpu-only == 'false' }} + if: ${{ inputs.has-gpu == 'true' }} shell: bash env: DEVICE: gpu diff --git a/.github/workflows/pull_request.yml b/.github/workflows/build_and_test.yml similarity index 79% rename from .github/workflows/pull_request.yml rename to .github/workflows/build_and_test.yml index db63a9ad1..34ff55054 100644 --- a/.github/workflows/pull_request.yml +++ b/.github/workflows/build_and_test.yml @@ -17,29 +17,51 @@ concurrency: jobs: check_lint: + name: Check Lint runs-on: ubuntu-22.04 steps: - uses: actions/checkout@v6 - uses: pre-commit/action@v3.0.1 linux_build_and_test: + name: Linux (cpu, ${{ matrix.arch }}) needs: check_lint strategy: - matrix: - runner: - - ubuntu-22.04 - - ubuntu-22.04-arm fail-fast: false - runs-on: ${{ matrix.runner }} + matrix: + arch: ['x86_64', 'aarch64'] + runs-on: ${{ matrix.arch == 'x86_64' && 'ubuntu-22.04' || 'ubuntu-22.04-arm' }} steps: - uses: actions/checkout@v6 - uses: ./.github/actions/setup-linux - uses: ./.github/actions/build-linux - uses: ./.github/actions/test-linux + + cuda_build_and_test: + name: Linux (${{ matrix.toolkit }}, ${{ matrix.arch }}) + if: github.repository == 'ml-explore/mlx' + needs: check_lint + strategy: + fail-fast: false + matrix: + arch: ['x86_64', 'aarch64'] + toolkit: ['cuda-12.6', 'cuda-12.9'] + runs-on: ${{ matrix.arch == 'x86_64' && 'gpu-t4-4-core' || 'ubuntu-22.04-arm' }} + steps: + - uses: actions/checkout@v6 + - uses: ./.github/actions/setup-linux with: - cpu-only: true + toolkit: ${{ matrix.toolkit }} + - uses: ./.github/actions/build-linux + with: + toolkit: ${{ matrix.toolkit }} + - uses: ./.github/actions/test-linux + if: matrix.arch == 'x86_64' + with: + has-gpu: true mac_build_and_test: + name: macOS (${{ matrix.macos-target }}) if: github.repository == 'ml-explore/mlx' strategy: matrix: @@ -53,25 +75,8 @@ jobs: - uses: ./.github/actions/setup-macos - uses: ./.github/actions/build-macos - cuda_build_and_test: - if: github.repository == 'ml-explore/mlx' - strategy: - fail-fast: false - matrix: - toolkit: ['cuda-12.6', 'cuda-12.9'] - runs-on: gpu-t4-4-core - needs: check_lint - steps: - - uses: actions/checkout@v6 - - uses: ./.github/actions/setup-linux - with: - toolkit: ${{ matrix.toolkit }} - - uses: ./.github/actions/build-cuda - with: - toolkit: ${{ matrix.toolkit }} - - uses: ./.github/actions/test-linux - build_documentation: + name: Build Documentation if: github.repository == 'ml-explore/mlx' runs-on: ubuntu-22.04 needs: check_lint @@ -80,7 +85,7 @@ jobs: - uses: ./.github/actions/build-docs linux_fedora_build_cpp: - name: Linux Fedora CPP Build (${{ matrix.arch }}) + name: Linux Fedora (${{ matrix.arch }}) needs: check_lint strategy: fail-fast: false diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml index c79c07ea1..479748f8a 100644 --- a/.github/workflows/nightly.yml +++ b/.github/workflows/nightly.yml @@ -52,8 +52,6 @@ jobs: python-version: ${{ matrix.python_version }} - uses: ./.github/actions/build-linux - uses: ./.github/actions/test-linux - with: - cpu-only: true build_mac_release: if: github.repository == 'ml-explore/mlx' diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 27bd7081a..5cc99dac2 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -139,8 +139,6 @@ jobs: toolkit: 'cuda-12.9' - name: Build Python package uses: ./.github/actions/build-cuda-release - with: - toolkit: 'cuda-12.9' - name: Upload artifacts uses: actions/upload-artifact@v5 with: diff --git a/mlx/backend/cuda/CMakeLists.txt b/mlx/backend/cuda/CMakeLists.txt index e11a18f95..7986c09d8 100644 --- a/mlx/backend/cuda/CMakeLists.txt +++ b/mlx/backend/cuda/CMakeLists.txt @@ -123,14 +123,21 @@ if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8.0) mlx PRIVATE "$<$:--compress-mode=size>") endif() -# Compute capability >= 7.0 is required for synchronization between CPU/GPU with -# managed memory. +# Use native CUDA arch by default. if(NOT DEFINED MLX_CUDA_ARCHITECTURES) execute_process( - COMMAND bash detect_cuda_arch.sh - WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + COMMAND __nvcc_device_query OUTPUT_VARIABLE MLX_CUDA_ARCHITECTURES OUTPUT_STRIP_TRAILING_WHITESPACE) + set(UPGRADABLE_ARCHITECTURES "90;100;121") + if(MLX_CUDA_ARCHITECTURES STREQUAL "") + message( + FATAL_ERROR + "Can not get native CUDA arch, must set MLX_CUDA_ARCHITECTURES") + elseif(MLX_CUDA_ARCHITECTURES IN_LIST UPGRADABLE_ARCHITECTURES) + # Use arch-specific compute capability whenever possible. + set(MLX_CUDA_ARCHITECTURES "${MLX_CUDA_ARCHITECTURES}a") + endif() endif() message(STATUS "CUDA architectures: ${MLX_CUDA_ARCHITECTURES}") set_target_properties(mlx PROPERTIES CUDA_ARCHITECTURES diff --git a/mlx/backend/cuda/detect_cuda_arch.sh b/mlx/backend/cuda/detect_cuda_arch.sh deleted file mode 100644 index 9d7c01a3e..000000000 --- a/mlx/backend/cuda/detect_cuda_arch.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -arch=`__nvcc_device_query` -case "$arch" in - "90") - echo "90a" ;; - "100") - echo "100a" ;; - "121") - echo "121a" ;; - *) - echo "native" ;; -esac diff --git a/mlx/distributed/nccl/nccl_stub/nccl_stubs.cpp b/mlx/distributed/nccl/nccl_stub/nccl_stubs.cpp index 171340f67..435936236 100644 --- a/mlx/distributed/nccl/nccl_stub/nccl_stubs.cpp +++ b/mlx/distributed/nccl/nccl_stub/nccl_stubs.cpp @@ -5,3 +5,48 @@ ncclResult_t ncclGetUniqueId(ncclUniqueId*) { return ncclSuccess; } + +const char* ncclGetErrorString(ncclResult_t result) { + return nullptr; +} + +ncclResult_t +ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank) { + return ncclSuccess; +} + +ncclResult_t ncclCommDestroy(ncclComm_t comm) { + return ncclSuccess; +} + +ncclResult_t ncclAllGather( + const void* sendbuff, + void* recvbuff, + size_t sendcount, + ncclDataType_t datatype, + ncclComm_t comm, + cudaStream_t stream) { + return ncclSuccess; +} + +ncclResult_t ncclAllReduce( + const void* sendbuff, + void* recvbuff, + size_t count, + ncclDataType_t datatype, + ncclRedOp_t op, + ncclComm_t comm, + cudaStream_t stream) { + return ncclSuccess; +} + +ncclResult_t ncclReduceScatter( + const void* sendbuff, + void* recvbuff, + size_t recvcount, + ncclDataType_t datatype, + ncclRedOp_t op, + ncclComm_t comm, + cudaStream_t stream) { + return ncclSuccess; +}