Fix the order of hosts in the ring (#2718)

fix memory count bug (#2717)
Fix missing domain_uuid_key in thunderbolt ring setup (#2682)
2025-12-16 01:49:05 +08:00 · 2025-10-30 15:02:39 -07:00 · 2025-10-30 14:27:15 -07:00 · 2025-10-30 13:17:20 -07:00 · 2025-10-30 12:26:55 -05:00
17 changed files with 862 additions and 23 deletions
--- a/.github/actions/build-cuda-release/action.yml
+++ b/.github/actions/build-cuda-release/action.yml
@@ -0,0 +1,24 @@
+name: 'Build CUDA wheel'
+description: 'Build CUDA wheel'
+
+inputs:
+  nvcc-location:
+    description: 'Location of nvcc compiler'
+    required: true
+
+runs:
+  using: "composite"
+  steps:
+    - name: Build package
+      shell: bash
+      env:
+        MLX_BUILD_STAGE: 2
+        CMAKE_ARGS: -DMLX_BUILD_CUDA=ON -DCMAKE_CUDA_COMPILER=${{ inputs.nvcc-location }}
+      run: |
+        pip install auditwheel build patchelf setuptools
+        python setup.py clean --all
+        python -m build -w
+
+        if [ -f "python/scripts/repair_cuda.sh" ]; then
+          bash python/scripts/repair_cuda.sh
+        fi
--- a/.github/actions/build-cuda/action.yml
+++ b/.github/actions/build-cuda/action.yml
@@ -0,0 +1,68 @@
+name: 'Build and Test with CUDA'
+description: 'Build and test MLX with CUDA'
+
+inputs:
+  build-type:
+    description: 'Build type (debug, release)'
+    required: false
+    default: 'debug'
+  run-tests:
+    description: 'Whether to run tests'
+    required: false
+    default: 'true'
+  nvcc-location:
+    description: 'Location of nvcc compiler'
+    required: false  # optional: callers may omit it and receive the default below
+    default: '/usr/local/cuda-12.9/bin/nvcc'
+    # this value is dependent on the CUDA tools installed in the setup-linux workflow
+
+runs:
+  using: "composite"
+  steps:
+    - name: Install Python package
+      shell: bash
+      env:
+        DEBUG: 1
+        CMAKE_ARGS: -DMLX_BUILD_CUDA=ON -DCMAKE_COMPILE_WARNING_AS_ERROR=ON -DCMAKE_CUDA_COMPILER=${{ inputs.nvcc-location }}
+      run: pip install -e ".[dev]" -v
+
+    - name: Check if build actually worked
+      shell: bash
+      run: python -c "import mlx.core"
+
+    - name: Run Python tests - CPU
+      if: inputs.run-tests == 'true'
+      shell: bash
+      env:
+        LOW_MEMORY: 1
+        DEVICE: cpu
+      run: python -m unittest discover python/tests -v
+
+    - name: Run Python tests - GPU
+      if: inputs.run-tests == 'true'
+      shell: bash
+      env:
+        LOW_MEMORY: 1
+        DEVICE: gpu
+      run: python -m tests discover python/tests -v
+
+    - name: Build CPP only
+      if: inputs.build-type == 'debug'
+      shell: bash
+      run: |
+        cmake . -B build \
+          -DMLX_BUILD_CUDA=ON \
+          -DCMAKE_CUDA_COMPILER=${{ inputs.nvcc-location }} \
+          -DCMAKE_BUILD_TYPE=DEBUG
+        cmake --build build -j $(nproc)
+    
+    - name: Run CPP tests
+      if: ${{ inputs.build-type == 'debug' && inputs.run-tests == 'true' }}
+      shell: bash
+      run: ./build/tests/tests -sfe="*fft_tests.cpp,*linalg_tests.cpp"
+
+    - name: Build Python package
+      if: inputs.build-type == 'release'
+      uses: ./.github/actions/build-cuda-release
+      with:
+        nvcc-location: ${{ inputs.nvcc-location }}
--- a/.github/actions/build-docs/action.yml
+++ b/.github/actions/build-docs/action.yml
@@ -0,0 +1,38 @@
+name: 'Build Documentation'
+description: 'Build documentation on a mac'
+
+runs:
+  using: "composite"
+  steps:
+    - name: Setup machine
+      uses: ./.github/actions/setup-macos
+
+    - name: Install dependencies
+      shell: sh
+      run: |
+        brew install doxygen
+        uv pip install --upgrade pip cmake
+        uv pip install -r docs/requirements.txt
+        uv pip install . -v
+  
+    - name: Build documentation
+      shell: bash
+      run: |
+        source .venv/bin/activate
+        cd docs
+        doxygen
+        make html O=-W
+    
+    - name: Create artifact tar
+      shell: sh
+      run: tar -cf artifact.tar --cd docs/build/html -L .
+
+    # Do it manually because upload-pages-artifact requires gtar
+    - name: Upload artifact
+      id: upload-artifact
+      uses: actions/upload-artifact@v5
+      with:
+        name: github-pages
+        path: artifact.tar
+        retention-days: 1
+        if-no-files-found: error
--- a/.github/actions/build-linux/action.yml
+++ b/.github/actions/build-linux/action.yml
@@ -0,0 +1,78 @@
+name: 'Build and Test on Linux'
+description: 'Build and test MLX on Linux'
+
+inputs:
+  build-type:
+    description: 'Build type'
+    required: false
+    default: 'debug'
+    type: choice
+    options:
+      - debug
+      - release
+  run-tests:
+    description: 'Whether to run tests'
+    required: false
+    default: 'true'
+    type: boolean
+
+runs:
+  using: "composite"
+  steps:
+    - name: Set DEBUG
+      shell: sh
+      if: inputs.build-type == 'debug'
+      run: echo "DEBUG=1" >> $GITHUB_ENV
+
+    - name: Install Python package
+      shell: sh
+      env:
+        CMAKE_ARGS: "-DCMAKE_COMPILE_WARNING_AS_ERROR=ON"
+      run: pip install -e ".[dev]" -v
+    
+    - name: Generate package stubs
+      shell: sh
+      run: |
+        pip install typing_extensions
+        python setup.py generate_stubs
+    
+    - name: Run Python tests
+      if: inputs.run-tests == 'true'
+      shell: bash  # bash is needed for the `>(...)` process substitution used below
+      run: |
+        python -m unittest discover python/tests -v
+        mpirun --bind-to none --allow-run-as-root -host localhost:8 -np 8 python python/tests/mpi_test_distributed.py
+        mlx.launch --verbose -n 8 python/tests/ring_test_distributed.py -v 2> >(tee -a stderr.log >&2)  # copy stderr into stderr.log while still printing it
+        if grep -Fq '[WARN]' stderr.log ; then  # any literal "[WARN]" captured from stderr fails the step
+          grep -F '[WARN]' stderr.log
+          echo "Distributed ring test failed";
+          exit 1;
+        fi
+    
+    - name: Build CPP only
+      if: inputs.build-type == 'debug'
+      shell: bash
+      run: |
+        mkdir -p build && cd build
+        cmake .. -DMLX_BUILD_METAL=OFF -DCMAKE_BUILD_TYPE=DEBUG
+        make -j $(nproc)
+    
+    - name: Run CPP tests
+      if: ${{ inputs.build-type == 'debug' && inputs.run-tests == 'true' }}
+      shell: sh
+      run: ./build/tests/tests
+    
+    - name: Build Python package
+      if: inputs.build-type == 'release'
+      shell: bash
+      run: |
+        pip install auditwheel patchelf build
+        python setup.py clean --all
+        MLX_BUILD_STAGE=1 python -m build -w
+        if [ -f "python/scripts/repair_linux.sh" ]; then
+          bash python/scripts/repair_linux.sh
+        fi
+
+        python setup.py clean --all
+        MLX_BUILD_STAGE=2 python -m build -w
+        auditwheel repair dist/mlx_cpu*.whl --plat manylinux_2_35_x86_64
--- a/.github/actions/build-macos-release/action.yml
+++ b/.github/actions/build-macos-release/action.yml
@@ -0,0 +1,22 @@
+name: 'Build macOS release'
+description: 'Build MLX releases macOS'
+
+inputs:
+  macos-target:
+    description: 'macOS build target'
+    required: false
+    default: '15.0'
+
+runs:
+  using: "composite"
+  steps:
+    - name: Build Python package(s)
+      shell: bash
+      env:
+        MACOSX_DEPLOYMENT_TARGET: ${{ inputs.macos-target }}
+      run: |
+        uv pip install build
+        uv run --no-project setup.py clean --all
+        MLX_BUILD_STAGE=1 uv run -m build -w
+        uv run --no-project setup.py clean --all
+        MLX_BUILD_STAGE=2 uv run -m build -w
--- a/.github/actions/build-macos/action.yml
+++ b/.github/actions/build-macos/action.yml
@@ -0,0 +1,124 @@
+name: 'Build and Test on macOS'
+description: 'Build and test MLX on macOS'
+
+inputs:
+  build-type:
+    description: 'Build type (debug, release)'
+    required: false
+    default: 'debug'
+    type: choice
+    options:
+      - debug
+      - release
+  run-tests:
+    description: 'Whether to run tests'
+    required: false
+    default: 'true'
+  build-jit:
+    description: 'Whether to build with JIT'
+    required: false
+    default: 'true'
+
+runs:
+  using: "composite"
+  steps:
+    - name: Install dependencies
+      shell: sh
+      env:
+        DEBUG: 1
+        DEV_RELEASE: 1
+      run: |
+        uv pip install --upgrade pip cmake setuptools
+        uv pip install nanobind==2.4.0 \
+          numpy torch tensorflow unittest-xml-reporting
+        uv pip install -e . -v
+
+    - name: Generate package stubs
+      shell: bash
+      run: |
+        uv pip install typing_extensions
+        uv run --no-project setup.py generate_stubs
+
+    - name: Run Python tests
+      if: inputs.run-tests == 'true'
+      shell: bash
+      env:
+        LOW_MEMORY: 1
+      run: |
+        DEVICE=cpu uv run -m xmlrunner discover -v python/tests -o test-results/cpu
+        DEVICE=gpu METAL_DEVICE_WRAPPER_TYPE=1 METAL_DEBUG_ERROR_MODE=0 uv run -m xmlrunner discover -v python/tests -o test-results/gpu
+        mpirun --bind-to none -host localhost:8 -np 8 -x DYLD_LIBRARY_PATH=/opt/homebrew/lib/ python python/tests/mpi_test_distributed.py
+        mlx.launch --verbose -n 8 python/tests/ring_test_distributed.py -v 2> >(tee -a stderr.log >&2)  # copy stderr into stderr.log while still printing it
+        if grep -Fq '[WARN]' stderr.log; then grep -F '[WARN]' stderr.log; echo "Distributed ring test failed"; exit 1; fi  # test grep's exit status directly; wrapping it in $(...) would execute its output as a command
+    
+    - name: Build example extension
+      if: inputs.run-tests == 'true'
+      shell: bash
+      run: |
+        cd examples/extensions
+        uv pip install -r requirements.txt
+        uv run --no-project setup.py build_ext --inplace
+        uv run --no-project test.py
+    
+    - name: Build CPP only
+      if: inputs.build-type == 'debug'
+      shell: bash
+      run: |
+        mkdir -p build
+        cd build
+        cmake ..
+        make -j $(sysctl -n hw.ncpu)
+    
+    - name: Run CPP tests
+      if: ${{ inputs.build-type == 'debug' && inputs.run-tests == 'true' }}
+      shell: bash
+      env:
+        DEVICE: gpu
+        METAL_DEVICE_WRAPPER_TYPE: 1
+        METAL_DEBUG_ERROR_MODE: 0
+      run: ./build/tests/tests
+    
+    - name: Build small binary with JIT
+      if: inputs.build-jit == 'true'
+      shell: bash
+      run: |
+        mkdir -p build
+        cd build
+        cmake .. -DCMAKE_BUILD_TYPE=MinSizeRel \
+          -DBUILD_SHARED_LIBS=ON \
+          -DMLX_BUILD_CPU=OFF \
+          -DMLX_BUILD_SAFETENSORS=OFF \
+          -DMLX_BUILD_GGUF=OFF \
+          -DMLX_METAL_JIT=ON
+        make -j $(sysctl -n hw.ncpu)
+    
+    - name: Run Python tests with JIT
+      if: ${{ inputs.build-jit == 'true' && inputs.run-tests == 'true' }}
+      shell: bash
+      env:
+        LOW_MEMORY: 1
+        DEVICE: gpu
+        METAL_DEVICE_WRAPPER_TYPE: 1
+        METAL_DEBUG_ERROR_MODE: 0
+      run: |
+        CMAKE_ARGS="-DMLX_METAL_JIT=ON" \
+          uv pip install -e . -v
+        uv run -m xmlrunner discover \
+            -v python/tests \
+            -o test-results/gpu_jit
+
+    - name: Build macOS 13 package
+      if: inputs.build-type == 'release'
+      uses: ./.github/actions/build-macos-release
+      with:
+        macos-target: '13.0'  # quoted so the value is the string "13.0", not a YAML float
+    - name: Build macOS 14 package
+      if: inputs.build-type == 'release'
+      uses: ./.github/actions/build-macos-release
+      with:
+        macos-target: '14.0'  # quoted so the value is the string "14.0", not a YAML float
+    - name: Build macOS 15 package
+      if: inputs.build-type == 'release'
+      uses: ./.github/actions/build-macos-release
+      with:
+        macos-target: '15.0'  # quoted so the value is the string "15.0", not a YAML float
--- a/.github/actions/setup-linux/action.yml
+++ b/.github/actions/setup-linux/action.yml
@@ -0,0 +1,83 @@
+name: 'Setup Linux Environment'
+description: 'Install dependencies for Linux builds'
+
+inputs:
+  runner-type:
+    description: 'Whether to set this up as a linux or CUDA runner'
+    required: false
+    default: 'linux'
+    type: choice
+    options:
+      - linux
+      - cuda
+  python-version:
+    description: 'Version of python to set up'
+    required: false
+    default: '3.10'
+
+runs:
+  using: "composite"
+  steps:
+    - name: Free disk space
+      shell: sh
+      if: inputs.runner-type == 'linux'
+      run: sudo rm -rf "$AGENT_TOOLSDIRECTORY"
+
+    - name: Install common dependencies
+      env:
+        TZ: Etc/UTC
+      shell: bash
+      run: |
+        sudo apt-get update
+        sudo apt-get install -y libblas-dev liblapack-dev liblapacke-dev tzdata zip
+        sudo apt autoremove -y
+
+    - uses: actions/setup-python@v6
+      with:
+        python-version: ${{ inputs.python-version }}
+        cache: 'pip'
+
+    - name: setup python venv
+      shell: bash
+      run: |
+        python -m venv .venv
+        source .venv/bin/activate
+        echo PATH=$PATH >> $GITHUB_ENV  # export the venv-activated PATH through $GITHUB_ENV so later steps see it
+        pip install --upgrade pip cmake
+
+    - name: Install MPI
+      if: inputs.runner-type == 'linux'
+      shell: bash
+      run: sudo apt-get install -y openmpi-bin openmpi-common libopenmpi-dev
+
+    - name: Network CUDA installation from packages
+      id: install-cuda
+      if: inputs.runner-type == 'cuda'
+      env:
+        TZ: Etc/UTC
+      shell: bash ## Specific to Ubuntu 22.04 & Architecture x86_64
+      run: |
+        wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
+        sudo dpkg -i cuda-keyring_1.1-1_all.deb
+        sudo apt-get update
+        sudo apt-get install -y libcudnn9-dev-cuda-12 libnccl2 libnccl-dev cuda-toolkit-12-9
+      # Note: This installs CUDA 12.9, which is the latest supported by cuDNN 9.x and works with the NVidia 570 drivers
+      # cuda-toolkit by itself installs version 13 (+) and requires updated drives (580+), which require a reboot to function properly.
+      # Compatibility matrix: https://docs.nvidia.com/deeplearning/cudnn/backend/latest/reference/support-matrix.html
+      # This also drops `nvcc` into `/usr/local/cuda-12.9/bin/nvcc` - but it's *not* on the default PATH
+
+    - name: Package and Driver Report
+      if: inputs.runner-type == 'cuda'
+      shell: bash
+      run: |
+        sudo apt-get install -y ubuntu-drivers-common dkms
+        echo "NVIDIA Driver Packages Available:"
+        sudo ubuntu-drivers list --gpgpu
+        echo "NVIDIA Driver Version:"
+        cat /proc/driver/nvidia/version || echo "nvidia driver not found"
+        echo "Installed NVIDIA and CUDA packages:"
+        dpkg -l | egrep "cuda|nvidia" -i
+        echo "DKMS Status:"
+        dkms status || echo "dkms not found"
+        echo "NVIDIA-SMI Status:"
+        nvidia-smi || echo "nvidia-smi not found"
--- a/.github/actions/setup-macos/action.yml
+++ b/.github/actions/setup-macos/action.yml
@@ -0,0 +1,31 @@
+name: 'Setup macOS Environment'
+description: 'Install dependencies for macOS builds'
+
+inputs:
+  install-mpi:
+    description: 'Whether to install MPI'
+    required: false
+    default: 'true'
+    type: boolean
+  python-version:
+    description: 'Python version to use'
+    required: false
+    default: '3.10'
+
+runs:
+  using: "composite"
+  steps:
+    - name: Install Homebrew packages
+      shell: sh
+      if: inputs.install-mpi == 'true'
+      run: /opt/homebrew/bin/brew install openmpi
+    
+    - name: Verify MetalToolchain installed
+      shell: bash
+      run: xcodebuild -showComponent MetalToolchain
+    
+    - name: Setup uv
+      uses: astral-sh/setup-uv@v6
+      with:
+          python-version: ${{ inputs.python-version }}
+          activate-environment: true
--- a/.github/dependabot.yml
+++ b/.github/dependabot.yml
@@ -0,0 +1,6 @@
+version: 2
+updates:
+  - package-ecosystem: "github-actions"
+    directory: "/"
+    schedule:
+      interval: "weekly"
--- a/.github/workflows/documentation.yml
+++ b/.github/workflows/documentation.yml
@@ -0,0 +1,28 @@
+name: Documentation
+
+on:
+  workflow_dispatch:
+
+permissions:
+  contents: read
+
+jobs:
+  build:
+    runs-on: [self-hosted, macos]
+    steps:
+      - uses: actions/checkout@v5
+      - uses: ./.github/actions/build-docs
+      
+  deploy:
+    needs: build
+    permissions:
+      pages: write
+      id-token: write
+    runs-on: ubuntu-latest
+    environment:
+      name: github-pages
+      url: ${{ steps.deployment.outputs.page_url }}
+    steps:
+      - name: Deploy to GitHub Pages
+        id: deployment
+        uses: actions/deploy-pages@v4
--- a/.github/workflows/nightly.yml
+++ b/.github/workflows/nightly.yml
@@ -0,0 +1,96 @@
+name: Nightly Build
+
+on:
+  schedule:
+    - cron: '33 6 * * 1-5'
+  workflow_dispatch:
+
+permissions:
+  contents: read
+
+jobs:
+  build_linux_release:
+    strategy:
+      fail-fast: false
+      matrix:
+        python_version: ["3.10", "3.14"]
+    runs-on: ubuntu-22.04
+    steps:
+      - uses: actions/checkout@v5
+      - uses: ./.github/actions/setup-linux
+        with:
+          # Pass the matrix version through; otherwise every entry builds with setup-linux's default Python.
+          python-version: ${{ matrix.python_version }}
+      - uses: ./.github/actions/build-linux
+        with:
+          build-type: release
+          run-tests: false
+      - name: Upload mlx artifacts
+        uses: actions/upload-artifact@v5
+        with:
+          name: linux-wheels-${{ matrix.python_version }}
+          path: wheelhouse/mlx-*.whl
+          retention-days: 7
+      - name: Upload mlx-cpu artifacts
+        if: matrix.python_version == '3.10'
+        uses: actions/upload-artifact@v5
+        with:
+          name: mlx-cpu
+          path: wheelhouse/mlx_cpu-*.whl
+          retention-days: 7
+  
+  build_linux_with_tests:
+    strategy:
+      fail-fast: false
+      matrix:
+        python_version: ["3.10", "3.11", "3.12", "3.13", "3.14"]
+    runs-on: ubuntu-22.04
+    steps:
+      - uses: actions/checkout@v5
+      - uses: ./.github/actions/setup-linux
+        with:
+          python-version: ${{ matrix.python_version }}
+      - uses: ./.github/actions/build-linux
+
+  build_mac_release:
+    strategy:
+      matrix:
+        python-version: ["3.10", "3.13"]
+        # TODO: 3.14 had issues finding a compatible tensorflow
+    env:
+      MACOSX_DEPLOYMENT_TARGET: "15.0"
+    runs-on: [self-hosted, macos]
+    steps:
+      - uses: actions/checkout@v5
+      - uses: ./.github/actions/setup-macos
+        with:
+          python-version: ${{ matrix.python-version }}
+      - uses: ./.github/actions/build-macos
+
+  build_cuda_with_tests:
+    runs-on: gpu-t4-4-core
+    steps:
+      - uses: actions/checkout@v5
+      - uses: ./.github/actions/setup-linux
+        with:
+          runner-type: 'cuda'
+      - uses: ./.github/actions/build-cuda
+
+  build_cuda_release:
+    runs-on: ubuntu-22-large
+    steps:
+      - uses: actions/checkout@v5
+      - uses: ./.github/actions/setup-linux
+        with:
+          runner-type: 'cuda'
+      - name: Build Python package
+        uses: ./.github/actions/build-cuda-release
+        with:
+          nvcc-location: '/usr/local/cuda-12.9/bin/nvcc'
+      - name: Upload artifacts
+        uses: actions/upload-artifact@v5
+        with:
+          name: mlx-cuda
+          path: wheelhouse/mlx_cuda-*.whl
+          retention-days: 7
+
--- a/.github/workflows/pull_request.yml
+++ b/.github/workflows/pull_request.yml
@@ -1,20 +1,46 @@
-on:
-  pull_request:
-    branches:
-      - main
+name: Build and Test
+
+on: pull_request  
+
+permissions:
+  contents: read

 jobs:
  check_lint:
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-22.04
    steps:
-      - uses: actions/checkout@v4
-      - uses: actions/setup-python@v4
+      - uses: actions/checkout@v5
+      - uses: ./.github/actions/setup-linux
+      - uses: pre-commit/action@v3.0.1
+
+  linux_build_and_test:
+    runs-on: ubuntu-22.04
+    steps:
+      - uses: actions/checkout@v5
+      - uses: ./.github/actions/setup-linux
+      - uses: ./.github/actions/build-linux
+
+  mac_build_and_test:
+    runs-on: [self-hosted, macos]
+    needs: check_lint
+    steps:
+      - uses: actions/checkout@v5
+      - uses: ./.github/actions/setup-macos
+      - uses: ./.github/actions/build-macos
+
+  cuda_build_and_test:
+    runs-on: gpu-t4-4-core
+    needs: check_lint
+    steps:
+      - uses: actions/checkout@v5
+      - uses: ./.github/actions/setup-linux
        with:
-          python-version: 3.8
-      - name: Install dependencies
-        run: |
-          python -m pip install --upgrade pip
-          pip install pre-commit black isort clang-format
-      - name: Run lint
-        run: |
-          pre-commit run --all-files
+          runner-type: 'cuda'
+      - uses: ./.github/actions/build-cuda
+
+  build_documentation:
+    runs-on: [self-hosted, macos]
+    needs: check_lint
+    steps:
+      - uses: actions/checkout@v5
+      - uses: ./.github/actions/build-docs
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -0,0 +1,188 @@
+name: PyPI Release
+
+on:
+  push:
+    tags:
+      - 'v*'
+  workflow_dispatch:
+
+permissions:
+  contents: read
+
+jobs:
+  build_documentation:
+    runs-on: [self-hosted, macos]
+    steps:
+      - uses: actions/checkout@v5
+      - uses: ./.github/actions/build-docs
+    
+  deploy_documentation:
+    needs: build_documentation
+    permissions:
+      pages: write
+      id-token: write
+    runs-on: ubuntu-latest
+    environment:
+      name: github-pages
+      url: ${{ steps.deployment.outputs.page_url }}
+    steps:
+      - name: Deploy to GitHub Pages
+        id: deployment
+        uses: actions/deploy-pages@v4
+
+  build_linux_release:
+    strategy:
+      matrix:
+        python_version: ["3.10", "3.11", "3.12", "3.13", "3.14"]
+    runs-on: ubuntu-22.04
+    env:
+      PYPI_RELEASE: 1
+    steps:
+      - uses: actions/checkout@v5
+      - uses: ./.github/actions/setup-linux
+        with:
+          python-version: ${{ matrix.python_version }}
+      - uses: ./.github/actions/build-linux
+        with:
+          build-type: release
+          run-tests: false
+      - name: Upload MLX artifacts
+        uses: actions/upload-artifact@v5
+        with:
+          name: linux-wheels-${{ matrix.python_version }}
+          path: wheelhouse/mlx-*.whl
+      - name: Upload CPU artifacts
+        if: matrix.python_version == '3.10'
+        uses: actions/upload-artifact@v5
+        with:
+          name: mlx-cpu
+          path: wheelhouse/mlx_cpu-*.whl
+  
+  build_mac_release:
+    strategy:
+      matrix:
+        python-version: ["3.10", "3.11", "3.12", "3.13"]
+        # TODO: 3.14 had issues finding a compatible tensorflow
+    runs-on: [self-hosted, macos]
+    env:
+      PYPI_RELEASE: 1
+    steps:
+      - uses: actions/checkout@v5
+      - uses: ./.github/actions/setup-macos
+        with:
+          python-version: ${{ matrix.python-version }}
+      - uses: ./.github/actions/build-macos
+        with:
+          build-type: release
+      - name: Upload MLX artifacts
+        uses: actions/upload-artifact@v5
+        with:
+          name: mac-wheels-${{ matrix.python-version }}
+          path: dist/mlx-*.whl
+      - name: Upload Metal artifacts
+        if: matrix.python-version == '3.10'
+        uses: actions/upload-artifact@v5
+        with:
+          name: mlx-metal
+          path: dist/mlx_metal-*.whl
+
+  build_cuda_release:
+    runs-on: ubuntu-22-large
+    env:
+      PYPI_RELEASE: 1
+    steps:
+      - uses: actions/checkout@v5
+      - uses: ./.github/actions/setup-linux
+        with:
+          runner-type: 'cuda'
+      - name: Build Python package
+        uses: ./.github/actions/build-cuda-release
+        with:
+          nvcc-location: '/usr/local/cuda-12.9/bin/nvcc'
+      - name: Upload artifacts
+        uses: actions/upload-artifact@v5
+        with:
+          name: mlx-cuda
+          path: wheelhouse/mlx_cuda-*.whl
+
+  pypi-publish:
+    name: Upload release to PyPI
+    runs-on: ubuntu-latest
+    needs: [build_linux_release, build_mac_release]
+    permissions:
+      id-token: write
+    environment:
+      name: pypi
+      url: https://pypi.org/p/mlx
+    steps:
+      - uses: actions/download-artifact@v6
+        with:
+          pattern: linux-wheels-*
+          merge-multiples: true
+          path: artifacts
+      - uses: actions/download-artifact@v6
+        with:
+          pattern: mac-wheels-*
+          merge-multiples: true
+          path: artifacts
+      - name: Display structure of downloaded files
+        run: ls -R artifacts
+      # - name: Publish package distributions to PyPI
+      #  uses: pypa/gh-action-pypi-publish@release/v1
+  
+  pypi-publish-cuda:
+    name: Upload CUDA release to PyPI
+    runs-on: ubuntu-latest
+    needs: build_cuda_release
+    permissions:
+      id-token: write
+    environment:
+      name: pypi
+      url: https://pypi.org/p/mlx-cuda
+    steps:
+      - uses: actions/download-artifact@v6
+        with:
+          name: mlx-cuda
+          path: artifacts
+      - name: Display structure of downloaded files
+        run: ls -R artifacts
+      # - name: Publish package distributions to PyPI
+      #  uses: pypa/gh-action-pypi-publish@release/v1
+
+  pypi-publish-cpu:
+    name: Upload CPU release to PyPI
+    runs-on: ubuntu-latest
+    needs: build_linux_release
+    permissions:
+      id-token: write
+    environment:
+      name: pypi
+      url: https://pypi.org/p/mlx-cpu
+    steps:
+      - uses: actions/download-artifact@v6
+        with:
+          name: mlx-cpu
+          path: artifacts
+      - name: Display structure of downloaded files
+        run: ls -R artifacts
+      # - name: Publish package distributions to PyPI
+      #  uses: pypa/gh-action-pypi-publish@release/v1
+
+  pypi-publish-metal:
+    name: Upload Metal release to PyPI
+    runs-on: ubuntu-latest
+    needs: build_mac_release
+    permissions:
+      id-token: write
+    environment:
+      name: pypi
+      url: https://pypi.org/p/mlx-metal
+    steps:
+      - uses: actions/download-artifact@v6
+        with:
+          name: mlx-metal
+          path: artifacts
+      - name: Display structure of downloaded files
+        run: ls -R artifacts
+      # - name: Publish package distributions to PyPI
+      #  uses: pypa/gh-action-pypi-publish@release/v1
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,4 +1,10 @@
 repos:
+-   repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v6.0.0
+    hooks:
+    -   id: check-yaml
+    # -   id: end-of-file-fixer
+    # -   id: trailing-whitespace
 -   repo: https://github.com/pre-commit/mirrors-clang-format
    rev: v19.1.7
    hooks:
--- a/mlx/backend/cuda/allocator.cpp
+++ b/mlx/backend/cuda/allocator.cpp
@@ -97,7 +97,6 @@ CudaAllocator::CudaAllocator()

 Buffer CudaAllocator::malloc(size_t size) {
  // Find available buffer from cache.
-  auto orig_size = size;
  std::unique_lock lock(mutex_);
  if (size <= small_block_size) {
    size = 8;
@@ -131,7 +130,7 @@ Buffer CudaAllocator::malloc(size_t size) {
    }
    lock.lock();
  }
-  active_memory_ += size;
+  active_memory_ += buf->size;
  peak_memory_ = std::max(active_memory_, peak_memory_);

  // Maintain the cache below the requested limit.
--- a/python/mlx/distributed_run.py
+++ b/python/mlx/distributed_run.py
@@ -567,13 +567,16 @@ def prepare_tb_ring(args, hosts):
        name = ""
        ports = []
        for t in c["SPThunderboltDataType"]:
+            uuid = t.get("domain_uuid_key")
+            if uuid is None:
+                continue
            name = t["device_name_key"]
-            uuid = t["domain_uuid_key"]
            tag = t["receptacle_1_tag"]["receptacle_id_key"]
-            if items := t.get("_items", []):
-                connected_to = items[0]["domain_uuid_key"]
-            else:
-                connected_to = None
+            items = t.get("_items", [])
+            connected_items = [item for item in items if "domain_uuid_key" in item]
+            connected_to = (
+                connected_items[0]["domain_uuid_key"] if connected_items else None
+            )
            iface = iface_map[f"Thunderbolt {tag}"]
            ports.append(ThunderboltPort(iface, uuid, connected_to))
        tb_hosts.append(ThunderboltHost(name, sorted(ports, key=lambda x: x.iface)))
@@ -633,9 +636,17 @@ def prepare_tb_ring(args, hosts):
            if ip0 > 255:
                raise ValueError("Ran out of available local IPs for the ring")

+    # Extract the host order from the first ring
+    hostmap = dict((r[0][0], r[1][0]) for r in rings[0])
+    first_host = min(hostmap.keys())
+    order = [first_host]
+    while hostmap[order[-1]] != first_host:
+        order.append(hostmap[order[-1]])
+
    # Create the hostfile
    hostfile = []
-    for i, h in enumerate(hosts):
+    for i in order:
+        h = hosts[i]
        host = {
            "ssh": h.ssh_hostname,
            "ips": [
--- a/python/tests/test_memory.py
+++ b/python/tests/test_memory.py
@@ -58,6 +58,20 @@ class TestMemory(mlx_tests.MLXTestCase):
        with self.assertRaises(ValueError):
            mx.set_wired_limit(max_size + 10)

+    def test_active_memory_count(self):
+        mx.synchronize()
+        mx.clear_cache()
+        init_mem = mx.get_active_memory()
+        a = mx.zeros((128, 128))
+        mx.eval(a)
+        mx.synchronize()
+        del a
+        a = mx.zeros((90, 128))
+        mx.eval(a)
+        mx.synchronize()
+        del a
+        self.assertEqual(init_mem, mx.get_active_memory())
+

 if __name__ == "__main__":
    mlx_tests.MLXTestRunner()
Author	SHA1	Message	Date
Angelos Katharopoulos	b901a9f311	Fix the order of hosts in the ring (#2718) Some checks failed Nightly Build / build_linux_release (3.10) (push) Has been cancelled Details Nightly Build / build_linux_release (3.14) (push) Has been cancelled Details Nightly Build / build_linux_with_tests (3.10) (push) Has been cancelled Details Nightly Build / build_linux_with_tests (3.11) (push) Has been cancelled Details Nightly Build / build_linux_with_tests (3.12) (push) Has been cancelled Details Nightly Build / build_linux_with_tests (3.13) (push) Has been cancelled Details Nightly Build / build_linux_with_tests (3.14) (push) Has been cancelled Details Nightly Build / build_mac_release (3.10) (push) Has been cancelled Details Nightly Build / build_mac_release (3.13) (push) Has been cancelled Details Nightly Build / build_cuda_with_tests (push) Has been cancelled Details Nightly Build / build_cuda_release (push) Has been cancelled Details	2025-10-30 15:02:39 -07:00
Awni Hannun	68c5fa1c95	fix memory count bug (#2717)	2025-10-30 14:27:15 -07:00
Christopher Webb	793a31eeb6	Fix missing domain_uuid_key in thunderbolt ring setup (#2682)	2025-10-30 13:17:20 -07:00
Mike Drob	74c1ed25bb	Migrate CircleCI to GitHub Actions (#2716) Co-authored-by: Joseph Heck <j_heck@apple.com>	2025-10-30 12:26:55 -05:00