use arch specific targets when possible (#2771 )

Separate test-linux from build-linux/cuda in GitHub Actions (#2765 )
* Separate test-linux from build-linux/cuda in GitHub Actions * Prefer unittest when possible Co-authored-by: Mike Drob <mdrob@apache.org> --------- Co-authored-by: Mike Drob <mdrob@apache.org>
2025-12-16 01:49:05 +08:00 · 2025-11-14 20:04:18 -08:00 · 2025-11-15 11:14:09 +09:00
10 changed files with 105 additions and 40 deletions
--- a/.github/actions/build-cuda/action.yml
+++ b/.github/actions/build-cuda/action.yml
@@ -17,20 +17,6 @@ runs:
        CMAKE_ARGS: -DMLX_BUILD_CUDA=ON -DCMAKE_COMPILE_WARNING_AS_ERROR=ON -DCMAKE_CUDA_COMPILER=${{ inputs.nvcc-location }}
      run: pip install -e ".[dev]" -v

-    - name: Run Python tests - CPU
-      shell: bash
-      env:
-        LOW_MEMORY: 1
-        DEVICE: cpu
-      run: python -m unittest discover python/tests -v
-
-    - name: Run Python tests - GPU
-      shell: bash
-      env:
-        LOW_MEMORY: 1
-        DEVICE: gpu
-      run: python -m tests discover python/tests -v
-
    - name: Build CPP only
      shell: bash
      run: |
@@ -39,7 +25,3 @@ runs:
          -DCMAKE_CUDA_COMPILER=${{ inputs.nvcc-location }} \
          -DCMAKE_BUILD_TYPE=DEBUG
        cmake --build build -j $(nproc)
-    
-    - name: Run CPP tests
-      shell: bash
-      run: ./build/tests/tests -sfe="*fft_tests.cpp,*linalg_tests.cpp"
--- a/.github/actions/build-linux/action.yml
+++ b/.github/actions/build-linux/action.yml
@@ -17,25 +17,9 @@ runs:
        pip install typing_extensions
        python setup.py generate_stubs
    
-    - name: Run Python tests
-      shell: bash
-      run: |
-        python -m unittest discover python/tests -v
-        mpirun --bind-to none --allow-run-as-root -host localhost:8 -np 8 python python/tests/mpi_test_distributed.py
-        mlx.launch --verbose -n 8 python/tests/ring_test_distributed.py -v 2> >(tee -a stderr.log >&2)
-        if grep -Fq '[WARN]' stderr.log ; then
-          grep -F '[WARN]' stderr.log
-          echo "Distributed ring test failed";
-          exit 1;
-        fi
-    
    - name: Build CPP only
      shell: bash
      run: |
        mkdir -p build && cd build
        cmake .. -DMLX_BUILD_METAL=OFF -DCMAKE_BUILD_TYPE=DEBUG
        make -j $(nproc)
-    
-    - name: Run CPP tests
-      shell: sh
-      run: ./build/tests/tests
--- a/.github/actions/setup-linux/action.yml
+++ b/.github/actions/setup-linux/action.yml
@@ -46,7 +46,6 @@ runs:
        pip install --upgrade pip cmake

    - name: Install MPI
-      if: inputs.runner-type == 'linux'
      shell: bash
      run: sudo apt-get install -y openmpi-bin openmpi-common libopenmpi-dev

--- a/.github/actions/test-linux/action.yml
+++ b/.github/actions/test-linux/action.yml
@@ -0,0 +1,63 @@
+name: 'Run Linux tests'
+
+inputs:
+  cpu-only:
+    description: 'Skip GPU tests'
+    required: false
+    default: false  # seen by the steps below as the string 'false'
+
+runs:
+  using: "composite"
+  steps:
+    - name: Run distributed tests
+      # FIXME: This test fails with CUDA build.
+      if: ${{ inputs.cpu-only == 'true' }}
+      shell: bash
+      run: |
+        echo "::group::Distributed tests"
+        mpirun --bind-to none --allow-run-as-root -host localhost:8 -np 8 python python/tests/mpi_test_distributed.py
+        mlx.launch --verbose -n 8 python/tests/ring_test_distributed.py -v 2> >(tee -a stderr.log >&2)
+        if grep -Fq '[WARN]' stderr.log ; then
+          grep -F '[WARN]' stderr.log
+          echo "Distributed ring test failed";
+          exit 1;
+        fi
+        echo "::endgroup::"
+
+    - name: Run Python tests - CPU
+      shell: bash
+      env:
+        DEVICE: cpu
+      run: |
+        echo "::group::Python tests - CPU"
+        python -m unittest discover python/tests -v
+        echo "::endgroup::"
+
+    - name: Run Python tests - GPU
+      if: ${{ inputs.cpu-only != 'true' }}  # string compare; !'false' is false
+      shell: bash
+      env:
+        DEVICE: gpu
+      run: |
+        echo "::group::Python tests - GPU"
+        python -m tests discover python/tests -v
+        echo "::endgroup::"
+
+    - name: Run CPP tests - CPU
+      shell: bash
+      env:
+        DEVICE: cpu
+      run: |
+        echo "::group::CPP tests - CPU"
+        ./build/tests/tests
+        echo "::endgroup::"
+
+    - name: Run CPP tests - GPU
+      if: ${{ inputs.cpu-only != 'true' }}  # string compare; !'false' is false
+      shell: bash
+      env:
+        DEVICE: gpu
+      run: |
+        echo "::group::CPP tests - GPU"
+        ./build/tests/tests -sfe="*fft_tests.cpp,*linalg_tests.cpp"
+        echo "::endgroup::"
--- a/.github/workflows/nightly.yml
+++ b/.github/workflows/nightly.yml
@@ -51,6 +51,9 @@ jobs:
        with:
          python-version: ${{ matrix.python_version }}
      - uses: ./.github/actions/build-linux
+      - uses: ./.github/actions/test-linux
+        with:
+          cpu-only: true

  build_mac_release:
    if: github.repository == 'ml-explore/mlx'
@@ -85,6 +88,7 @@ jobs:
        with:
          runner-type: 'cuda'
      - uses: ./.github/actions/build-cuda
+      - uses: ./.github/actions/test-linux

  build_cuda_release:
    if: github.repository == 'ml-explore/mlx'
--- a/.github/workflows/pull_request.yml
+++ b/.github/workflows/pull_request.yml
@@ -25,6 +25,9 @@ jobs:
      - uses: actions/checkout@v5
      - uses: ./.github/actions/setup-linux
      - uses: ./.github/actions/build-linux
+      - uses: ./.github/actions/test-linux
+        with:
+          cpu-only: true

  mac_build_and_test:
    if: github.repository == 'ml-explore/mlx'
@@ -50,6 +53,7 @@ jobs:
        with:
          runner-type: 'cuda'
      - uses: ./.github/actions/build-cuda
+      - uses: ./.github/actions/test-linux

  build_documentation:
    if: github.repository == 'ml-explore/mlx'
--- a/mlx/backend/cuda/CMakeLists.txt
+++ b/mlx/backend/cuda/CMakeLists.txt
@@ -126,7 +126,17 @@ endif()
 # Compute capability >= 7.0 is required for synchronization between CPU/GPU with
 # managed memory.
 if(NOT DEFINED MLX_CUDA_ARCHITECTURES)
-  set(MLX_CUDA_ARCHITECTURES "native")
+  # Run detect_cuda_arch.sh and use whatever it prints as the architecture.
+  execute_process(
+    COMMAND bash detect_cuda_arch.sh
+    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+    OUTPUT_VARIABLE MLX_CUDA_ARCHITECTURES
+    OUTPUT_STRIP_TRAILING_WHITESPACE)
+  # If the script produced no output (e.g. bash could not be started), keep
+  # the previous default instead of passing an empty architecture list on.
+  if("${MLX_CUDA_ARCHITECTURES}" STREQUAL "")
+    set(MLX_CUDA_ARCHITECTURES "native")
+  endif()
 endif()
 message(STATUS "CUDA architectures: ${MLX_CUDA_ARCHITECTURES}")
 set_target_properties(mlx PROPERTIES CUDA_ARCHITECTURES
--- a/mlx/backend/cuda/detect_cuda_arch.sh
+++ b/mlx/backend/cuda/detect_cuda_arch.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+
+arch=`__nvcc_device_query`
+case "$arch" in
+    "90")
+        echo "90a" ;;
+    "100")
+        echo "100a" ;;
+    "121")
+        echo "121a" ;;
+    *)
+        echo "native" ;;
+esac
--- a/mlx/backend/cuda/jit_module.cpp
+++ b/mlx/backend/cuda/jit_module.cpp
@@ -279,11 +279,18 @@ void compile(
  // Compile program.
  std::vector<const char*> args;
  bool use_sass = compiler_supports_device_sass(device);
+  // The arch-specific ("a") suffix is selected by the combined capability
+  // (major * 10 + minor), e.g. 90, 100 or 121 -- not by the major alone.
+  auto cc_major = device.compute_capability_major();
+  auto cc_minor = device.compute_capability_minor();
+  auto cc = cc_major * 10 + cc_minor;
+  std::string arch_tag = (cc == 90 || cc == 100 || cc == 121) ? "a" : "";
  std::string compute = fmt::format(
-      "--gpu-architecture={}_{}{}",
+      "--gpu-architecture={}_{}{}{}",
      use_sass ? "sm" : "compute",
-      device.compute_capability_major(),
-      device.compute_capability_minor());
+      cc_major,
+      cc_minor,
+      arch_tag);
  args.push_back(compute.c_str());
  std::string cccl_include = cccl_dir();
  if (!cccl_include.empty()) {
--- a/setup.py
+++ b/setup.py
@@ -89,7 +89,16 @@ class CMakeBuild(build_ext):
        ]
        if build_stage == 2 and build_cuda:
            # Last arch is always real and virtual for forward-compatibility
-            cuda_archs = ";".join(("70-real", "80-real", "90-real", "100-real", "120"))
+            cuda_archs = ";".join(
+                (
+                    "75-real",
+                    "80-real",
+                    "90a-real",
+                    "100a-real",
+                    "120a-real",
+                    "120-virtual",
+                )
+            )
            cmake_args += [f"-DMLX_CUDA_ARCHITECTURES={cuda_archs}"]

        # Some generators require explcitly passing config when building.
Author	SHA1	Message	Date
Awni Hannun	1bf605d56d	use arch specific targets when possible (#2771 )	2025-11-14 20:04:18 -08:00
Cheng	3c622ddd1d	Separate test-linux from build-linux/cuda in GitHub Actions (#2765 ) * Separate test-linux from build-linux/cuda in GitHub Actions * Prefer unittest when possible Co-authored-by: Mike Drob <mdrob@apache.org> --------- Co-authored-by: Mike Drob <mdrob@apache.org>	2025-11-15 11:14:09 +09:00