Compare commits


1 Commit

Author SHA1 Message Date
Awni Hannun  37e9a3a43b  fix release 2  2025-11-15 13:02:21 -08:00
169 changed files with 1636 additions and 12088 deletions

View File

@@ -2,13 +2,9 @@ name: 'Build CUDA wheel'
description: 'Build CUDA wheel' description: 'Build CUDA wheel'
inputs: inputs:
arch: nvcc-location:
description: 'Platform architecture tag' description: 'Location of nvcc compiler'
required: true required: true
type: choice
options:
- x86_64
- aarch64
runs: runs:
using: "composite" using: "composite"
@@ -16,9 +12,9 @@ runs:
- name: Build package - name: Build package
shell: bash shell: bash
env: env:
CMAKE_ARGS: -DMLX_BUILD_CUDA=ON CMAKE_ARGS: -DMLX_BUILD_CUDA=ON -DCMAKE_CUDA_COMPILER=${{ inputs.nvcc-location }}
run: | run: |
pip install auditwheel build patchelf setuptools pip install auditwheel build patchelf setuptools
python setup.py clean --all python setup.py clean --all
MLX_BUILD_STAGE=2 python -m build -w MLX_BUILD_STAGE=2 python -m build -w
bash python/scripts/repair_cuda.sh ${{ inputs.arch }} bash python/scripts/repair_cuda.sh

45  .github/actions/build-cuda/action.yml  vendored Normal file
View File

@@ -0,0 +1,45 @@
name: 'Build and Test with CUDA'
description: 'Build and test MLX with CUDA'
inputs:
nvcc-location:
description: 'Location of nvcc compiler'
required: true
default: '/usr/local/cuda-12.9/bin/nvcc'
runs:
using: "composite"
steps:
- name: Install Python package
shell: bash
env:
DEBUG: 1
CMAKE_ARGS: -DMLX_BUILD_CUDA=ON -DCMAKE_COMPILE_WARNING_AS_ERROR=ON -DCMAKE_CUDA_COMPILER=${{ inputs.nvcc-location }}
run: pip install -e ".[dev]" -v
- name: Run Python tests - CPU
shell: bash
env:
LOW_MEMORY: 1
DEVICE: cpu
run: python -m unittest discover python/tests -v
- name: Run Python tests - GPU
shell: bash
env:
LOW_MEMORY: 1
DEVICE: gpu
run: python -m tests discover python/tests -v
- name: Build CPP only
shell: bash
run: |
cmake . -B build \
-DMLX_BUILD_CUDA=ON \
-DCMAKE_CUDA_COMPILER=${{ inputs.nvcc-location }} \
-DCMAKE_BUILD_TYPE=DEBUG
cmake --build build -j $(nproc)
- name: Run CPP tests
shell: bash
run: ./build/tests/tests -sfe="*fft_tests.cpp,*linalg_tests.cpp"

View File

@@ -1,19 +1,25 @@
name: 'Build Documentation' name: 'Build Documentation'
description: 'Build documentation' description: 'Build documentation on a mac'
runs: runs:
using: "composite" using: "composite"
steps: steps:
- name: Setup machine - name: Setup machine
uses: ./.github/actions/setup-linux uses: ./.github/actions/setup-macos
- name: Setup uv
uses: astral-sh/setup-uv@v6
with:
python-version: "3.10"
activate-environment: true
- name: Install dependencies - name: Install dependencies
shell: bash shell: sh
run: | run: |
sudo apt-get install -y doxygen brew install doxygen
source .venv/bin/activate uv pip install --upgrade pip cmake
pip install -r docs/requirements.txt uv pip install -r docs/requirements.txt
pip install . -v uv pip install . -v
- name: Build documentation - name: Build documentation
shell: bash shell: bash
@@ -24,8 +30,8 @@ runs:
make html O=-W make html O=-W
- name: Create artifact tar - name: Create artifact tar
shell: bash shell: sh
run: tar -cf artifact.tar -C docs --dereference build/html index.html run: tar -cf artifact.tar --cd docs --dereference build/html index.html
# Do it manually because upload-pages-artifact requires gtar # Do it manually because upload-pages-artifact requires gtar
- name: Upload artifact - name: Upload artifact

View File

@@ -1,32 +1,15 @@
name: 'Build and Test on Linux' name: 'Build and Test on Linux'
description: 'Build and test MLX on Linux'
inputs:
toolkit:
description: 'The toolkit to build with'
required: false
default: 'cpu'
runs: runs:
using: "composite" using: "composite"
steps: steps:
- name: Install Python package - name: Install Python package
id: python_build
shell: sh shell: sh
env: env:
CMAKE_ARGS: "-DCMAKE_COMPILE_WARNING_AS_ERROR=ON"
DEBUG: 1 DEBUG: 1
CMAKE_ARGS: >- run: pip install -e ".[dev]" -v
-DCMAKE_COMPILE_WARNING_AS_ERROR=ON
-DMLX_BUILD_CUDA=${{ startsWith(inputs.toolkit, 'cuda') && 'ON' || 'OFF' }}
run: |
if ${{ startsWith(inputs.toolkit, 'cuda') && runner.arch == 'arm64' }} ; then
# There is no GPU in arm64 runner, use a common arch.
CMAKE_ARGS="$CMAKE_ARGS -DMLX_CUDA_ARCHITECTURES=90a"
# Can not build tests when the built executables can not run.
CMAKE_ARGS="$CMAKE_ARGS -DMLX_BUILD_TESTS=OFF"
fi
pip install --no-build-isolation -e ".[dev]" -v
# Pass the CMAKE_ARGS to following steps.
echo CMAKE_ARGS="$CMAKE_ARGS" >> $GITHUB_OUTPUT
- name: Generate package stubs - name: Generate package stubs
shell: sh shell: sh
@@ -34,8 +17,25 @@ runs:
pip install typing_extensions pip install typing_extensions
python setup.py generate_stubs python setup.py generate_stubs
- name: Run Python tests
shell: bash
run: |
python -m unittest discover python/tests -v
mpirun --bind-to none --allow-run-as-root -host localhost:8 -np 8 python python/tests/mpi_test_distributed.py
mlx.launch --verbose -n 8 python/tests/ring_test_distributed.py -v 2> >(tee -a stderr.log >&2)
if grep -Fq '[WARN]' stderr.log ; then
grep -F '[WARN]' stderr.log
echo "Distributed ring test failed";
exit 1;
fi
- name: Build CPP only - name: Build CPP only
shell: bash shell: bash
run: | run: |
cmake . -B build -DCMAKE_BUILD_TYPE=Debug ${{ steps.python_build.outputs.CMAKE_ARGS }} mkdir -p build && cd build
cmake --build build -j $(nproc) cmake .. -DMLX_BUILD_METAL=OFF -DCMAKE_BUILD_TYPE=DEBUG
make -j $(nproc)
- name: Run CPP tests
shell: sh
run: ./build/tests/tests

View File

@@ -17,8 +17,6 @@ runs:
steps: steps:
- name: Build Python package - name: Build Python package
shell: bash -l {0} shell: bash -l {0}
env:
MACOSX_DEPLOYMENT_TARGET: ${{ inputs.macos-target }}
run: | run: |
pip install build pip install build
python setup.py clean --all python setup.py clean --all
@@ -27,8 +25,6 @@ runs:
- name: Build backend package - name: Build backend package
if: ${{ inputs.build-backend }} if: ${{ inputs.build-backend }}
shell: bash -l {0} shell: bash -l {0}
env:
MACOSX_DEPLOYMENT_TARGET: ${{ inputs.macos-target }}
run: | run: |
python setup.py clean --all python setup.py clean --all
MLX_BUILD_STAGE=2 python -m build -w MLX_BUILD_STAGE=2 python -m build -w

View File

@@ -11,7 +11,7 @@ runs:
shell: bash -l {0} shell: bash -l {0}
run: | run: |
pip install --upgrade pip pip install --upgrade pip
pip install cmake setuptools nanobind==2.10.2 pip install cmake setuptools nanobind==2.4.0
pip install -e . -v pip install -e . -v
- name: Generate package stubs - name: Generate package stubs

View File

@@ -2,82 +2,72 @@ name: 'Setup Linux Environment'
description: 'Install dependencies for Linux builds' description: 'Install dependencies for Linux builds'
inputs: inputs:
toolkit: runner-type:
description: 'Which toolkit to install' description: 'Whether to set this up as a linux or CUDA runner'
required: false required: false
default: 'cpu' default: 'linux'
type: choice
options:
- linux
- cuda
python-version: python-version:
description: 'Version of python to set up' description: 'Version of python to set up'
required: false required: false
default: '3.10' default: '3.10'
use-ccache:
description: 'Whether to enable ccache'
required: false
default: 'true'
runs: runs:
using: "composite" using: "composite"
steps: steps:
- name: Free disk space
shell: sh
if: inputs.runner-type == 'linux'
run: sudo rm -rf "$AGENT_TOOLSDIRECTORY"
- name: Install common dependencies - name: Install common dependencies
env:
TZ: Etc/UTC
shell: bash shell: bash
run: | run: |
sudo apt-get update sudo apt-get update
sudo apt-get install -y libblas-dev liblapack-dev liblapacke-dev zip sudo apt-get install -y libblas-dev liblapack-dev liblapacke-dev tzdata zip
sudo apt autoremove -y
- name: Use ccache
if: ${{ inputs.use-ccache == 'true' }}
uses: hendrikmuhs/ccache-action@v1.2
with:
key: ccache-${{ runner.os }}-${{ runner.arch }}-${{ inputs.toolkit }}
max-size: 1GB
# ccache-action bug: running "apt-get update" fails on large arm runner.
update-package-index: false
- uses: actions/setup-python@v6 - uses: actions/setup-python@v6
with: with:
python-version: ${{ inputs.python-version }} python-version: ${{ inputs.python-version }}
cache: 'pip'
- name: Setup Python venv - name: setup python venv
shell: bash shell: bash
run: | run: |
python -m venv .venv python -m venv .venv
source .venv/bin/activate source .venv/bin/activate
pip install setuptools cmake nanobind==2.10.2
echo PATH=$PATH >> $GITHUB_ENV echo PATH=$PATH >> $GITHUB_ENV
# Make cmake search .venv for nanobind pip install --upgrade pip cmake
echo PYTHONPATH=`python -c 'import sys; print(sys.path[-1])'` >> $GITHUB_ENV
- name: Install MPI - name: Install MPI
if: inputs.runner-type == 'linux'
shell: bash shell: bash
run: sudo apt-get install -y openmpi-bin openmpi-common libopenmpi-dev run: sudo apt-get install -y openmpi-bin openmpi-common libopenmpi-dev
- name: Install CUDA toolkit - name: Network CUDA installation from packages
if: ${{ startsWith(inputs.toolkit, 'cuda') }} id: install-cuda
shell: bash if: inputs.runner-type == 'cuda'
env: env:
# Note: the CI machine does not meet CUDA 13's driver requirement. TZ: Etc/UTC
# Compatibility matrix: shell: bash ## Specific to Ubuntu 22.04 & Architecture x86_64
# https://docs.nvidia.com/deeplearning/cudnn/backend/latest/reference/support-matrix.html
PACKAGES: |
{
"cuda-12.6": "libcudnn9-dev-cuda-12 cuda-toolkit-12-6",
"cuda-12.9": "libcudnn9-dev-cuda-12 cuda-toolkit-12-9",
"cuda-13.0": "libcudnn9-dev-cuda-13 cuda-toolkit-13-0"
}
run: | run: |
# The CUDA binaries are hosted in the "sbsa" repo, the "arm64" repo is wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
# Jetson specific. SBSA means Arm Server Base System Architecture.
ARCH=${{ runner.arch == 'arm64' && 'sbsa' || 'x86_64' }}
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/$ARCH/cuda-keyring_1.1-1_all.deb
sudo dpkg -i cuda-keyring_1.1-1_all.deb sudo dpkg -i cuda-keyring_1.1-1_all.deb
sudo apt-get update sudo apt-get update
sudo apt-get install -y \ sudo apt-get install -y libcudnn9-dev-cuda-12 libnccl2 libnccl-dev cuda-toolkit-12-9
libnccl2 libnccl-dev \ # Note: This installs CUDA 12.9, which is the latest supported by cuDNN 9.x and works with the NVidia 570 drivers
${{ fromJson(env.PACKAGES)[inputs.toolkit] }} # cuda-toolkit by itself installs version 13 (+) and requires updated drives (580+), which require a reboot to function properly.
echo "/usr/local/${{ inputs.toolkit }}/bin" >> $GITHUB_PATH # Compatibility matrix: https://docs.nvidia.com/deeplearning/cudnn/backend/latest/reference/support-matrix.html
# This also drops `nvcc` into `/usr/local/cuda-12.9/bin/nvcc` - but it's *not* on the default PATH
- name: CUDA packages and driver report - name: Package and Driver Report
if: ${{ startsWith(inputs.toolkit, 'cuda') }} if: inputs.runner-type == 'cuda'
shell: bash shell: bash
run: | run: |
sudo apt-get install -y ubuntu-drivers-common dkms sudo apt-get install -y ubuntu-drivers-common dkms

View File

@@ -1,69 +0,0 @@
name: 'Run Linux tests'
inputs:
has-gpu:
description: 'Run GPU tests'
required: false
default: false
runs:
using: "composite"
steps:
- name: Run MPI tests
shell: bash
run: |
echo "::group::MPI tests"
mpirun --bind-to none --allow-run-as-root -host localhost:8 -np 8 python python/tests/mpi_test_distributed.py
echo "::endgroup::"
- name: Run distributed tests
if: ${{ inputs.has-gpu == 'false' }}
shell: bash
run: |
echo "::group::Distributed tests"
mlx.launch --verbose -n 8 python/tests/ring_test_distributed.py -v 2> >(tee -a stderr.log >&2)
if grep -Fq '[WARN]' stderr.log ; then
grep -F '[WARN]' stderr.log
echo "Distributed ring test failed";
exit 1;
fi
echo "::endgroup::"
- name: Run Python tests - CPU
if: ${{ inputs.has-gpu == 'false' }}
shell: bash
env:
DEVICE: cpu
run: |
echo "::group::Python tests - CPU"
python -m unittest discover python/tests -v
echo "::endgroup::"
- name: Run Python tests - GPU
if: ${{ inputs.has-gpu == 'true' }}
shell: bash
env:
DEVICE: gpu
run: |
echo "::group::Python tests - GPU"
python -m tests discover python/tests -v
echo "::endgroup::"
- name: Run CPP tests - CPU
shell: bash
env:
DEVICE: cpu
run: |
echo "::group::CPP tests - CPU"
./build/tests/tests
echo "::endgroup::"
- name: Run CPP tests - GPU
if: ${{ inputs.has-gpu == 'true' }}
shell: bash
env:
DEVICE: gpu
run: |
echo "::group::CPP tests - GPU"
./build/tests/tests -sfe="*fft_tests.cpp,*linalg_tests.cpp"
echo "::endgroup::"

View File

@@ -8,9 +8,9 @@ permissions:
jobs: jobs:
build: build:
runs-on: ubuntu-22.04 runs-on: [self-hosted, macos]
steps: steps:
- uses: actions/checkout@v6 - uses: actions/checkout@v5
- uses: ./.github/actions/build-docs - uses: ./.github/actions/build-docs
deploy: deploy:

View File

@@ -16,21 +16,21 @@ jobs:
python_version: ["3.10", "3.14"] python_version: ["3.10", "3.14"]
runs-on: ubuntu-22.04 runs-on: ubuntu-22.04
steps: steps:
- uses: actions/checkout@v6 - uses: actions/checkout@v5
- uses: ./.github/actions/setup-linux - uses: ./.github/actions/setup-linux
- uses: ./.github/actions/build-linux-release - uses: ./.github/actions/build-linux-release
with: with:
build-backend: ${{ matrix.python-version == '3.10' }} build-backend: ${{ matrix.python-version == '3.10' }}
arch: "x86_64" arch: "x86_64"
- name: Upload mlx artifacts - name: Upload mlx artifacts
uses: actions/upload-artifact@v6 uses: actions/upload-artifact@v5
with: with:
name: linux-wheels-${{ matrix.python_version }} name: linux-wheels-${{ matrix.python_version }}
path: wheelhouse/mlx-*.whl path: wheelhouse/mlx-*.whl
retention-days: 7 retention-days: 7
- name: Upload mlx-cpu artifacts - name: Upload mlx-cpu artifacts
if: matrix.python_version == '3.10' if: matrix.python_version == '3.10'
uses: actions/upload-artifact@v6 uses: actions/upload-artifact@v5
with: with:
name: mlx-cpu name: mlx-cpu
path: wheelhouse/mlx_cpu-*.whl path: wheelhouse/mlx_cpu-*.whl
@@ -40,18 +40,17 @@ jobs:
strategy: strategy:
fail-fast: false fail-fast: false
matrix: matrix:
python_version: ["3.11", "3.12", "3.13", "3.14"] python_version: ["3.10", "3.11", "3.12", "3.13", "3.14"]
runner: runner:
- ubuntu-22.04 - ubuntu-22.04
- ubuntu-22.04-arm - ubuntu-22.04-arm
runs-on: ${{ matrix.runner }} runs-on: ${{ matrix.runner }}
steps: steps:
- uses: actions/checkout@v6 - uses: actions/checkout@v5
- uses: ./.github/actions/setup-linux - uses: ./.github/actions/setup-linux
with: with:
python-version: ${{ matrix.python_version }} python-version: ${{ matrix.python_version }}
- uses: ./.github/actions/build-linux - uses: ./.github/actions/build-linux
- uses: ./.github/actions/test-linux
build_mac_release: build_mac_release:
if: github.repository == 'ml-explore/mlx' if: github.repository == 'ml-explore/mlx'
@@ -60,7 +59,7 @@ jobs:
python-version: ["3.10", "3.13"] python-version: ["3.10", "3.13"]
runs-on: [self-hosted, macos] runs-on: [self-hosted, macos]
steps: steps:
- uses: actions/checkout@v6 - uses: actions/checkout@v5
- uses: ./.github/actions/setup-macos - uses: ./.github/actions/setup-macos
with: with:
python-version: ${{ matrix.python-version }} python-version: ${{ matrix.python-version }}
@@ -76,21 +75,53 @@ jobs:
macos-target: 14.0 macos-target: 14.0
build-backend: ${{ matrix.python-version == '3.10' }} build-backend: ${{ matrix.python-version == '3.10' }}
build_cuda_with_tests:
if: github.repository == 'ml-explore/mlx'
runs-on: gpu-t4-4-core
steps:
- uses: actions/checkout@v5
- uses: ./.github/actions/setup-linux
with:
runner-type: 'cuda'
- uses: ./.github/actions/build-cuda
build_cuda_release: build_cuda_release:
if: github.repository == 'ml-explore/mlx' if: github.repository == 'ml-explore/mlx'
runs-on: ubuntu-22-large runs-on: ubuntu-22-large
steps: steps:
- uses: actions/checkout@v6 - uses: actions/checkout@v5
- uses: ./.github/actions/setup-linux - uses: ./.github/actions/setup-linux
with: with:
toolkit: 'cuda-12.9' runner-type: 'cuda'
- name: Build Python package - name: Build Python package
uses: ./.github/actions/build-cuda-release uses: ./.github/actions/build-cuda-release
with: with:
toolkit: 'cuda-12.9' nvcc-location: '/usr/local/cuda-12.9/bin/nvcc'
- name: Upload artifacts - name: Upload artifacts
uses: actions/upload-artifact@v6 uses: actions/upload-artifact@v5
with: with:
name: mlx-cuda name: mlx-cuda
path: wheelhouse/mlx_cuda-*.whl path: wheelhouse/mlx_cuda-*.whl
retention-days: 7 retention-days: 7
linux_fedora_build_cpp:
name: Linux Fedora CPP Build (${{ matrix.arch }})
strategy:
fail-fast: false
matrix:
include:
- host: ubuntu-22.04
arch: x86_64
- host: ubuntu-22.04-arm
arch: aarch64
runs-on: ${{ matrix.host }}
container:
image: fedora:42
steps:
- name: Checkout code
uses: actions/checkout@v5
- name: CPP Build Test - No Release
run: |
bash ./.github/scripts/setup+build-cpp-linux-fedora-container.sh

View File

@@ -1,67 +1,36 @@
name: Build and Test name: Build and Test
on: on: pull_request
pull_request:
push:
branches:
- main
# For testing CI without starting a pull request:
- test/*
permissions: permissions:
contents: read contents: read
concurrency: concurrency:
group: ${{ github.workflow }}-${{ github.ref }} group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }} cancel-in-progress: true
jobs: jobs:
check_lint: check_lint:
name: Check Lint
runs-on: ubuntu-22.04 runs-on: ubuntu-22.04
steps: steps:
- uses: actions/checkout@v6 - uses: actions/checkout@v5
- uses: ./.github/actions/setup-linux
- uses: pre-commit/action@v3.0.1 - uses: pre-commit/action@v3.0.1
linux_build_and_test: linux_build_and_test:
name: Linux (cpu, ${{ matrix.arch }})
needs: check_lint
strategy: strategy:
fail-fast: false
matrix: matrix:
arch: ['x86_64', 'aarch64'] runner:
runs-on: ${{ matrix.arch == 'x86_64' && 'ubuntu-22.04' || 'ubuntu-22.04-arm' }} - ubuntu-22.04
- ubuntu-22.04-arm
fail-fast: false
runs-on: ${{ matrix.runner }}
steps: steps:
- uses: actions/checkout@v6 - uses: actions/checkout@v5
- uses: ./.github/actions/setup-linux - uses: ./.github/actions/setup-linux
- uses: ./.github/actions/build-linux - uses: ./.github/actions/build-linux
- uses: ./.github/actions/test-linux
cuda_build_and_test:
name: Linux (${{ matrix.toolkit }}, ${{ matrix.arch }})
if: github.repository == 'ml-explore/mlx'
needs: check_lint
strategy:
fail-fast: false
matrix:
arch: ['x86_64', 'aarch64']
toolkit: ['cuda-12.6', 'cuda-12.9']
runs-on: ${{ matrix.arch == 'x86_64' && 'gpu-t4-4-core' || 'ubuntu-22.04-arm' }}
steps:
- uses: actions/checkout@v6
- uses: ./.github/actions/setup-linux
with:
toolkit: ${{ matrix.toolkit }}
- uses: ./.github/actions/build-linux
with:
toolkit: ${{ matrix.toolkit }}
- uses: ./.github/actions/test-linux
if: matrix.arch == 'x86_64'
with:
has-gpu: true
mac_build_and_test: mac_build_and_test:
name: macOS (${{ matrix.macos-target }})
if: github.repository == 'ml-explore/mlx' if: github.repository == 'ml-explore/mlx'
strategy: strategy:
matrix: matrix:
@@ -71,22 +40,31 @@ jobs:
MACOSX_DEPLOYMENT_TARGET: ${{ matrix.macos-target }} MACOSX_DEPLOYMENT_TARGET: ${{ matrix.macos-target }}
needs: check_lint needs: check_lint
steps: steps:
- uses: actions/checkout@v6 - uses: actions/checkout@v5
- uses: ./.github/actions/setup-macos - uses: ./.github/actions/setup-macos
- uses: ./.github/actions/build-macos - uses: ./.github/actions/build-macos
build_documentation: cuda_build_and_test:
name: Build Documentation
if: github.repository == 'ml-explore/mlx' if: github.repository == 'ml-explore/mlx'
runs-on: ubuntu-22.04 runs-on: gpu-t4-4-core
needs: check_lint needs: check_lint
steps: steps:
- uses: actions/checkout@v6 - uses: actions/checkout@v5
- uses: ./.github/actions/setup-linux
with:
runner-type: 'cuda'
- uses: ./.github/actions/build-cuda
build_documentation:
if: github.repository == 'ml-explore/mlx'
runs-on: [self-hosted, macos]
needs: check_lint
steps:
- uses: actions/checkout@v5
- uses: ./.github/actions/build-docs - uses: ./.github/actions/build-docs
linux_fedora_build_cpp: linux_fedora_build_cpp:
name: Linux Fedora (${{ matrix.arch }}) name: Linux Fedora CPP Build (${{ matrix.arch }})
needs: check_lint
strategy: strategy:
fail-fast: false fail-fast: false
matrix: matrix:
@@ -101,7 +79,7 @@ jobs:
image: fedora:42 image: fedora:42
steps: steps:
- name: Checkout code - name: Checkout code
uses: actions/checkout@v6 uses: actions/checkout@v5
- name: CPP Build Test - No Release - name: CPP Build Test - No Release
run: | run: |

View File

@@ -23,9 +23,9 @@ jobs:
build_documentation: build_documentation:
if: github.repository == 'ml-explore/mlx' if: github.repository == 'ml-explore/mlx'
runs-on: ubuntu-22.04 runs-on: [self-hosted, macos]
steps: steps:
- uses: actions/checkout@v6 - uses: actions/checkout@v5
- uses: ./.github/actions/build-docs - uses: ./.github/actions/build-docs
deploy_documentation: deploy_documentation:
@@ -48,32 +48,29 @@ jobs:
matrix: matrix:
python_version: ["3.10", "3.11", "3.12", "3.13", "3.14"] python_version: ["3.10", "3.11", "3.12", "3.13", "3.14"]
arch: ['x86_64', 'aarch64'] arch: ['x86_64', 'aarch64']
runs-on: ${{ matrix.arch == 'x86_64' && 'ubuntu-22.04' || 'ubuntu-22.04-arm' }} runs-on: ${{ matrix.arch == 'x86_64' && 'ubuntu-24.04' || 'ubuntu-24.04-arm' }}
env: env:
PYPI_RELEASE: 1 PYPI_RELEASE: 1
DEV_RELEASE: ${{ github.event.inputs.dev_release == 'true' && 1 || 0 }} DEV_RELEASE: ${{ github.event.inputs.dev_release == 'true' && 1 || 0 }}
steps: steps:
- uses: actions/checkout@v6 - uses: actions/checkout@v5
- uses: ./.github/actions/setup-linux - uses: ./.github/actions/setup-linux
with: with:
python-version: ${{ matrix.python_version }} python-version: ${{ matrix.python_version }}
use-ccache: false
- uses: ./.github/actions/build-linux-release - uses: ./.github/actions/build-linux-release
with: with:
build-backend: ${{ matrix.python-version == '3.10' }} build-backend: ${{ matrix.python-version == '3.10' }}
arch: ${{ matrix.arch }} arch: ${{ matrix.arch }}
- name: Upload MLX artifacts - name: Upload MLX artifacts
uses: actions/upload-artifact@v6 uses: actions/upload-artifact@v5
with: with:
overwrite: true name: linux-wheels-${{ matrix.python_version }}
name: linux-wheels-${{ matrix.python_version }}-${{ matrix.arch }}
path: wheelhouse/mlx-*.whl path: wheelhouse/mlx-*.whl
- name: Upload CPU artifacts - name: Upload CPU artifacts
if: matrix.python_version == '3.10' if: matrix.python_version == '3.10'
uses: actions/upload-artifact@v6 uses: actions/upload-artifact@v5
with: with:
overwrite: true name: mlx-cpu
name: mlx-cpu-${{ matrix.arch }}
path: wheelhouse/mlx_cpu-*.whl path: wheelhouse/mlx_cpu-*.whl
build_mac_release: build_mac_release:
@@ -87,7 +84,7 @@ jobs:
DEV_RELEASE: ${{ github.event.inputs.dev_release == 'true' && 1 || 0 }} DEV_RELEASE: ${{ github.event.inputs.dev_release == 'true' && 1 || 0 }}
steps: steps:
- uses: actions/checkout@v6 - uses: actions/checkout@v5
- uses: ./.github/actions/setup-macos - uses: ./.github/actions/setup-macos
with: with:
python-version: ${{ matrix.python-version }} python-version: ${{ matrix.python-version }}
@@ -96,7 +93,7 @@ jobs:
shell: bash -l {0} shell: bash -l {0}
run: | run: |
pip install --upgrade pip pip install --upgrade pip
pip install cmake setuptools nanobind==2.10.2 pip install cmake setuptools nanobind==2.4.0
pip install -e . -v pip install -e . -v
- name: Generate package stubs - name: Generate package stubs
shell: bash -l {0} shell: bash -l {0}
@@ -114,43 +111,35 @@ jobs:
macos-target: 15.0 macos-target: 15.0
build-backend: ${{ matrix.python-version == '3.10' }} build-backend: ${{ matrix.python-version == '3.10' }}
- name: Upload MLX artifacts - name: Upload MLX artifacts
uses: actions/upload-artifact@v6 uses: actions/upload-artifact@v5
with: with:
overwrite: true
name: mac-wheels-${{ matrix.python-version }} name: mac-wheels-${{ matrix.python-version }}
path: dist/mlx-*.whl path: dist/mlx-*.whl
- name: Upload Metal artifacts - name: Upload Metal artifacts
if: matrix.python-version == '3.10' if: matrix.python-version == '3.10'
uses: actions/upload-artifact@v6 uses: actions/upload-artifact@v5
with: with:
overwrite: true
name: mlx-metal name: mlx-metal
path: dist/mlx_metal-*.whl path: dist/mlx_metal-*.whl
build_cuda_release: build_cuda_release:
if: github.repository == 'ml-explore/mlx' if: github.repository == 'ml-explore/mlx'
strategy: runs-on: ubuntu-22-large
matrix:
arch: ['x86_64', 'aarch64']
toolkit: ['cuda-12.9', 'cuda-13.0']
runs-on: ${{ matrix.arch == 'x86_64' && 'ubuntu-22-large' || 'ubuntu-22-large-arm' }}
env: env:
PYPI_RELEASE: 1 PYPI_RELEASE: 1
DEV_RELEASE: ${{ github.event.inputs.dev_release == 'true' && 1 || 0 }} DEV_RELEASE: ${{ github.event.inputs.dev_release == 'true' && 1 || 0 }}
steps: steps:
- uses: actions/checkout@v6 - uses: actions/checkout@v5
- uses: ./.github/actions/setup-linux - uses: ./.github/actions/setup-linux
with: with:
toolkit: ${{ matrix.toolkit }} runner-type: 'cuda'
use-ccache: false
- name: Build Python package - name: Build Python package
uses: ./.github/actions/build-cuda-release uses: ./.github/actions/build-cuda-release
with: with:
arch: ${{ matrix.arch }} nvcc-location: '/usr/local/cuda-12.9/bin/nvcc'
- name: Upload artifacts - name: Upload artifacts
uses: actions/upload-artifact@v6 uses: actions/upload-artifact@v5
with: with:
overwrite: true
name: mlx-cuda name: mlx-cuda
path: wheelhouse/mlx_cuda-*.whl path: wheelhouse/mlx_cuda-*.whl
@@ -164,12 +153,12 @@ jobs:
name: pypi name: pypi
url: https://pypi.org/p/mlx url: https://pypi.org/p/mlx
steps: steps:
- uses: actions/download-artifact@v7 - uses: actions/download-artifact@v6
with: with:
pattern: linux-wheels-* pattern: linux-wheels-*
merge-multiple: true merge-multiple: true
path: dist path: dist
- uses: actions/download-artifact@v7 - uses: actions/download-artifact@v6
with: with:
pattern: mac-wheels-* pattern: mac-wheels-*
merge-multiple: true merge-multiple: true
@@ -191,7 +180,7 @@ jobs:
name: pypi name: pypi
url: https://pypi.org/p/mlx-cuda url: https://pypi.org/p/mlx-cuda
steps: steps:
- uses: actions/download-artifact@v7 - uses: actions/download-artifact@v6
with: with:
name: mlx-cuda name: mlx-cuda
path: dist path: dist
@@ -212,10 +201,9 @@ jobs:
name: pypi name: pypi
url: https://pypi.org/p/mlx-cpu url: https://pypi.org/p/mlx-cpu
steps: steps:
- uses: actions/download-artifact@v7 - uses: actions/download-artifact@v6
with: with:
pattern: mlx-cpu-* name: mlx-cpu
merge-multiple: true
path: dist path: dist
- name: Display structure of downloaded files - name: Display structure of downloaded files
run: ls -R dist run: ls -R dist
@@ -234,7 +222,7 @@ jobs:
name: pypi name: pypi
url: https://pypi.org/p/mlx-metal url: https://pypi.org/p/mlx-metal
steps: steps:
- uses: actions/download-artifact@v7 - uses: actions/download-artifact@v6
with: with:
name: mlx-metal name: mlx-metal
path: dist path: dist

View File

@@ -74,7 +74,6 @@ endif()
if(MLX_USE_CCACHE) if(MLX_USE_CCACHE)
find_program(CCACHE_PROGRAM ccache) find_program(CCACHE_PROGRAM ccache)
if(CCACHE_PROGRAM) if(CCACHE_PROGRAM)
message(STATUS "Found CCache: ${CCACHE_PROGRAM}")
set(CMAKE_C_COMPILER_LAUNCHER "${CCACHE_PROGRAM}") set(CMAKE_C_COMPILER_LAUNCHER "${CCACHE_PROGRAM}")
set(CMAKE_CXX_COMPILER_LAUNCHER "${CCACHE_PROGRAM}") set(CMAKE_CXX_COMPILER_LAUNCHER "${CCACHE_PROGRAM}")
set(CMAKE_CUDA_COMPILER_LAUNCHER "${CCACHE_PROGRAM}") set(CMAKE_CUDA_COMPILER_LAUNCHER "${CCACHE_PROGRAM}")
@@ -273,7 +272,7 @@ target_link_libraries(mlx PRIVATE $<BUILD_INTERFACE:fmt::fmt-header-only>)
if(MLX_BUILD_PYTHON_BINDINGS) if(MLX_BUILD_PYTHON_BINDINGS)
message(STATUS "Building Python bindings.") message(STATUS "Building Python bindings.")
find_package( find_package(
Python 3.10 Python 3.8
COMPONENTS Interpreter Development.Module COMPONENTS Interpreter Development.Module
REQUIRED) REQUIRED)
execute_process( execute_process(

View File

@@ -1,5 +1,6 @@
# Copyright © 2023 Apple Inc. # Copyright © 2023 Apple Inc.
import argparse
import os import os
import subprocess import subprocess
import time import time

View File

@@ -1,212 +0,0 @@
import math
import os
import subprocess
import time
from copy import copy
from functools import partial
import matplotlib.pyplot as plt
import mlx.core as mx
import numpy as np
import torch
from matplotlib.ticker import FuncFormatter
RESULTS_DIR = "./results"
if not os.path.isdir(RESULTS_DIR):
os.mkdir(RESULTS_DIR)
DEVICE_NAME = subprocess.check_output(["sysctl", "-n", "machdep.cpu.brand_string"])
DEVICE_NAME = DEVICE_NAME.decode("utf-8").strip("\n")
TORCH_DEVICE = torch.device(
"mps"
if torch.backends.mps.is_available()
else ("cuda" if torch.cuda.is_available() else "cpu")
)
N_WARMUP = 5
N_ITER_BENCH = 50
N_ITER_FUNC = 20
VECTOR_LENGTHS = [4096 * (2**i) for i in range(10)]
MASK_DENSITIES = [0.01, 0.1, 0.25, 0.5]
D_TYPES = ("float32", "float16")
def _power_of_two_formatter(value, _position):
if value <= 0:
return ""
exponent = int(round(math.log2(value)))
if abs(value - (1 << exponent)) / value > 1e-6:
return f"{value:g}"
return f"$2^{{{exponent}}}$"
def torch_sync():
if TORCH_DEVICE.type == "cuda":
torch.cuda.synchronize()
elif TORCH_DEVICE.type == "mps":
torch.mps.synchronize()
def masked_scatter_mlx(self_arr, mask_arr, src_arr):
outs = []
for _ in range(N_ITER_FUNC):
out = copy(self_arr)
out[mask_arr] = src_arr
outs.append(out)
mx.eval(outs)
return outs
@torch.no_grad()
def masked_scatter_torch(self_tensor, mask_tensor, src_tensor):
outs = []
for _ in range(N_ITER_FUNC):
out = self_tensor.clone()
out.masked_scatter_(mask_tensor, src_tensor)
outs.append(out)
torch_sync()
return outs
def measure(fn):
for _ in range(N_WARMUP):
fn()
start = time.perf_counter_ns()
for _ in range(N_ITER_BENCH):
fn()
end = time.perf_counter_ns()
return (end - start) * 1e-9
def bytes_touched(length, true_count, item_size):
mask_bytes = length
self_bytes = length * item_size * 2 # read + write
src_bytes = true_count * item_size
return (mask_bytes + self_bytes + src_bytes) * N_ITER_FUNC * N_ITER_BENCH
def build_case(length, density, np_dtype, torch_dtype):
true_count = max(1, int(round(length * density)))
rng = np.random.default_rng()
self_np = rng.normal(0.0, 1.0, length).astype(np_dtype)
mask_np = np.zeros(length, dtype=bool)
mask_np[:true_count] = True
rng.shuffle(mask_np)
src_np = rng.normal(0.0, 1.0, true_count).astype(np_dtype)
self_mlx = mx.array(self_np)
mask_mlx = mx.array(mask_np)
src_mlx = mx.array(src_np)
self_torch = torch.from_numpy(self_np).to(device=TORCH_DEVICE, dtype=torch_dtype)
mask_torch = torch.from_numpy(mask_np).to(device=TORCH_DEVICE)
src_torch = torch.from_numpy(src_np).to(device=TORCH_DEVICE, dtype=torch_dtype)
# Correctness check once per configuration
mx_out = mx.array(self_np)
mx_out[mask_mlx] = src_mlx
mx.eval(mx_out)
torch_out = self_torch.clone()
torch_out.masked_scatter_(mask_torch, src_torch)
atol = 5e-3 if np_dtype == np.float16 else 1e-5
if not np.allclose(np.array(mx_out), torch_out.cpu().numpy(), atol=atol):
raise AssertionError("masked_scatter results diverged between MLX and Torch")
return (self_mlx, mask_mlx, src_mlx, self_torch, mask_torch, src_torch, true_count)
def bench_case(length, density, dtype):
np_dtype = getattr(np, dtype)
torch_dtype = getattr(torch, dtype)
(
self_mlx,
mask_mlx,
src_mlx,
self_torch,
mask_torch,
src_torch,
true_count,
) = build_case(length, density, np_dtype, torch_dtype)
time_mlx = measure(partial(masked_scatter_mlx, self_mlx, mask_mlx, src_mlx))
time_torch = measure(
partial(masked_scatter_torch, self_torch, mask_torch, src_torch)
)
total_bytes = bytes_touched(length, true_count, np_dtype().itemsize)
bytes_per_gb = float(1024**3)
mlx_gbps = (total_bytes / bytes_per_gb) / time_mlx
torch_gbps = (total_bytes / bytes_per_gb) / time_torch
return time_mlx, time_torch, mlx_gbps, torch_gbps
def plot_density(ax_perf, ax_speedup, density, dtype):
mlx_gbps = []
torch_gbps = []
mlx_times = []
torch_times = []
for length in VECTOR_LENGTHS:
t_mlx, t_torch, gbps_mlx, gbps_torch = bench_case(length, density, dtype)
mlx_gbps.append(gbps_mlx)
torch_gbps.append(gbps_torch)
mlx_times.append(t_mlx)
torch_times.append(t_torch)
ax_perf.plot(VECTOR_LENGTHS, mlx_gbps, "tab:blue", label="MLX")
ax_perf.plot(VECTOR_LENGTHS, torch_gbps, "tab:red", label="Torch")
ax_perf.set_xscale("log", base=2)
ax_perf.set_xticks(VECTOR_LENGTHS)
formatter = FuncFormatter(_power_of_two_formatter)
ax_perf.xaxis.set_major_formatter(formatter)
ax_perf.set_title(f"density={density:.2f}")
ax_perf.set_ylabel("GB/s")
ax_perf.grid(True, which="both", linestyle=":", alpha=0.4)
ax_perf.legend()
speedup = np.array(torch_times) / np.array(mlx_times)
ax_speedup.plot(VECTOR_LENGTHS, speedup, "tab:green")
ax_speedup.axhline(1.0, color="tab:gray", linestyle="--")
ax_speedup.set_xscale("log", base=2)
ax_speedup.set_xticks(VECTOR_LENGTHS)
ax_speedup.xaxis.set_major_formatter(formatter)
ax_speedup.set_ylabel("Speedup (Torch_t / MLX_t)")
ax_speedup.grid(True, which="both", linestyle=":", alpha=0.4)
def main():
for dtype in D_TYPES:
fig, axs = plt.subplots(
len(MASK_DENSITIES),
2,
figsize=(10, 12),
layout="constrained",
sharex=True,
)
for i, density in enumerate(MASK_DENSITIES):
plot_density(axs[i][0], axs[i][1], density, dtype)
axs[i][0].set_xlabel("vector length")
axs[i][1].set_xlabel("vector length")
fig.suptitle(
f"{DEVICE_NAME.replace('Apple ', '')} ({TORCH_DEVICE.type}) | dtype={dtype}"
)
output_path = os.path.join(
RESULTS_DIR,
f"{DEVICE_NAME.replace(' ', '_')}_masked_scatter_{dtype}.pdf",
)
fig.savefig(output_path)
plt.close(fig)
if __name__ == "__main__":
main()

View File

@@ -1,3 +0,0 @@
# This file does nothing but to suppress the cmake warning: "By not providing
# Findnvpl.cmake in CMAKE_MODULE_PATH...", which is caused by the
# find_package(nvpl) from cmake's builtin FindLAPACK.cmake module.

View File

@@ -29,20 +29,17 @@ MLX has a CUDA backend which you can install with:
.. code-block:: shell .. code-block:: shell
pip install mlx[cuda12] pip install mlx[cuda]
To install the CUDA package from PyPi your system must meet the following To install the CUDA package from PyPi your system must meet the following
requirements: requirements:
- Nvidia architecture >= SM 7.5 - Nvidia architecture >= SM 7.0 (Volta)
- Nvidia driver >= 550.54.14 - Nvidia driver >= 550.54.14
- CUDA toolkit >= 12.0 - CUDA toolkit >= 12.0
- Linux distribution with glibc >= 2.35 - Linux distribution with glibc >= 2.35
- Python >= 3.10 - Python >= 3.10
For CUDA 13 use ``pip install mlx[cuda13]``. The CUDA 13 package requires
an Nvidia driver >= 580 or an appropriate CUDA compatibility package.
CPU-only (Linux) CPU-only (Linux)
^^^^^^^^^^^^^^^^ ^^^^^^^^^^^^^^^^

View File

@@ -70,8 +70,7 @@ Differences from NumPy
* Indexing does not perform bounds checking. Indexing out of bounds is * Indexing does not perform bounds checking. Indexing out of bounds is
undefined behavior. undefined behavior.
* Boolean mask based indexing is supported for assignment only (see * Boolean mask based indexing is not yet supported.
:ref:`boolean-mask-assignment`).
The reason for the lack of bounds checking is that exceptions cannot propagate The reason for the lack of bounds checking is that exceptions cannot propagate
from the GPU. Performing bounds checking for array indices before launching the from the GPU. Performing bounds checking for array indices before launching the
@@ -144,51 +143,3 @@ expected. For example:
In the above ``dfdx`` will have the correct gradient, namely zeros at ``idx`` In the above ``dfdx`` will have the correct gradient, namely zeros at ``idx``
and ones elsewhere. and ones elsewhere.
.. _boolean-mask-assignment:
Boolean Mask Assignment
-----------------------
MLX supports boolean indices using NumPy syntax. A mask must already be
a :class:`bool_` MLX :class:`array` or a NumPy ``ndarray`` with ``dtype=bool``.
Other index types are routed through the standard scatter code.
.. code-block:: shell
>>> a = mx.array([1.0, 2.0, 3.0])
>>> mask = mx.array([True, False, True])
>>> updates = mx.array([5.0, 6.0])
>>> a[mask] = updates
>>> a
array([5.0, 2.0, 6.0], dtype=float32)
Scalar assignments broadcast to every ``True`` entry in ``mask``. For non-scalar
assignments, ``updates`` must provide at least as many elements as there are
``True`` entries in ``mask``.
.. code-block:: shell
>>> a = mx.zeros((2, 3))
>>> mask = mx.array([[True, False, True],
[False, False, True]])
>>> a[mask] = 1.0
>>> a
array([[1.0, 0.0, 1.0],
[0.0, 0.0, 1.0]], dtype=float32)
Boolean masks follow NumPy semantics:
- The mask shape must match the shape of the axes it indexes exactly. The only
exception is a scalar boolean mask, which broadcasts to the full array.
- Any axes not covered by the mask are taken in full.
.. code-block:: shell
>>> a = mx.arange(1000).reshape(10, 10, 10)
>>> a[mx.random.normal((10, 10)) > 0.0] = 0 # valid: mask covers axes 0 and 1
The mask of shape ``(10, 10)`` applies to the first two axes, so ``a[mask]``
selects the 1-D slices ``a[i, j, :]`` where ``mask[i, j]`` is ``True``.
Shapes such as ``(1, 10, 10)`` or ``(10, 10, 1)`` do not match the indexed
axes and therefore raise errors.
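
Since the right-hand side of this hunk drops boolean-mask indexing, the broadcastable-update cases shown in the removed examples can be expressed with mx.where instead. A minimal sketch (values are illustrative only):

import mlx.core as mx

a = mx.array([1.0, 2.0, 3.0])
mask = mx.array([True, False, True])

# In place of the removed `a[mask] = 5.0`, broadcast the update through where.
a = mx.where(mask, 5.0, a)   # -> [5.0, 2.0, 5.0]

This only covers updates that broadcast against the mask; packing a 1-D source into the True positions (what the removed MaskedScatter primitive did) needs an integer-index scatter, sketched after the CPU masked-scatter hunk further down.
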

View File

@@ -3,6 +3,6 @@ requires = [
"setuptools>=42", "setuptools>=42",
"cmake>=3.25", "cmake>=3.25",
"mlx>=0.18.0", "mlx>=0.18.0",
"nanobind==2.10.2", "nanobind==2.4.0",
] ]
build-backend = "setuptools.build_meta" build-backend = "setuptools.build_meta"

View File

@@ -1,4 +1,4 @@
setuptools>=42 setuptools>=42
cmake>=3.25 cmake>=3.25
mlx>=0.21.0 mlx>=0.21.0
nanobind==2.10.2 nanobind==2.4.0

View File

@@ -1,6 +1,7 @@
target_sources( target_sources(
mlx mlx
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/array.cpp PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/allocator.cpp
${CMAKE_CURRENT_SOURCE_DIR}/array.cpp
${CMAKE_CURRENT_SOURCE_DIR}/compile.cpp ${CMAKE_CURRENT_SOURCE_DIR}/compile.cpp
${CMAKE_CURRENT_SOURCE_DIR}/device.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device.cpp
${CMAKE_CURRENT_SOURCE_DIR}/dtype.cpp ${CMAKE_CURRENT_SOURCE_DIR}/dtype.cpp

24  mlx/allocator.cpp  Normal file
View File

@@ -0,0 +1,24 @@
// Copyright © 2023 Apple Inc.
#include <cstdlib>
#include <sstream>
#include "mlx/allocator.h"
namespace mlx::core::allocator {
Buffer malloc(size_t size) {
auto buffer = allocator().malloc(size);
if (size && !buffer.ptr()) {
std::ostringstream msg;
msg << "[malloc] Unable to allocate " << size << " bytes.";
throw std::runtime_error(msg.str());
}
return buffer;
}
void free(Buffer buffer) {
allocator().free(buffer);
}
} // namespace mlx::core::allocator

View File

@@ -28,16 +28,16 @@ class Buffer {
}; };
}; };
Buffer malloc(size_t size);
void free(Buffer buffer);
class Allocator { class Allocator {
/** Abstract base class for a memory allocator. */ /** Abstract base class for a memory allocator. */
public: public:
virtual Buffer malloc(size_t size) = 0; virtual Buffer malloc(size_t size) = 0;
virtual void free(Buffer buffer) = 0; virtual void free(Buffer buffer) = 0;
virtual size_t size(Buffer buffer) const = 0; virtual size_t size(Buffer buffer) const = 0;
virtual Buffer make_buffer(void* ptr, size_t size) {
return Buffer{nullptr};
};
virtual void release(Buffer buffer) {}
Allocator() = default; Allocator() = default;
Allocator(const Allocator& other) = delete; Allocator(const Allocator& other) = delete;
@@ -49,25 +49,4 @@ class Allocator {
Allocator& allocator(); Allocator& allocator();
inline Buffer malloc(size_t size) {
return allocator().malloc(size);
}
inline void free(Buffer buffer) {
allocator().free(buffer);
}
// Make a Buffer from a raw pointer of the given size without a copy. If a
// no-copy conversion is not possible then the returned buffer.ptr() will be
// nullptr. Any buffer created with this function must be released with
// release(buffer)
inline Buffer make_buffer(void* ptr, size_t size) {
return allocator().make_buffer(ptr, size);
};
// Release a buffer from the allocator made with make_buffer
inline void release(Buffer buffer) {
allocator().release(buffer);
}
} // namespace mlx::core::allocator } // namespace mlx::core::allocator

View File

@@ -82,28 +82,6 @@ array::array(std::initializer_list<int> data, Dtype dtype)
init(data.begin()); init(data.begin());
} }
array::array(
void* data,
Shape shape,
Dtype dtype,
const std::function<void(void*)>& deleter)
: array_desc_(std::make_shared<ArrayDesc>(std::move(shape), dtype)) {
auto buffer = allocator::make_buffer(data, nbytes());
if (buffer.ptr() == nullptr) {
set_data(allocator::malloc(nbytes()));
auto ptr = static_cast<char*>(data);
std::copy(ptr, ptr + nbytes(), this->data<char>());
deleter(data);
} else {
auto wrapped_deleter = [deleter](allocator::Buffer buffer) {
auto ptr = buffer.ptr();
allocator::release(buffer);
return deleter(ptr);
};
set_data(buffer, std::move(wrapped_deleter));
}
}
/* Build an array from a shared buffer */ /* Build an array from a shared buffer */
array::array(allocator::Buffer data, Shape shape, Dtype dtype, Deleter deleter) array::array(allocator::Buffer data, Shape shape, Dtype dtype, Deleter deleter)
: array_desc_(std::make_shared<ArrayDesc>(std::move(shape), dtype)) { : array_desc_(std::make_shared<ArrayDesc>(std::move(shape), dtype)) {

View File

@@ -57,16 +57,6 @@ class array {
Shape shape, Shape shape,
Dtype dtype = TypeToDtype<T>()); Dtype dtype = TypeToDtype<T>());
/* Build an array from a raw pointer. The constructor will attempt to use the
* input data without a copy. The deleter will be called when the array no
* longer needs the underlying memory - after the array is destroyed in the
* no-copy case and after the copy otherwise. */
explicit array(
void* data,
Shape shape,
Dtype dtype,
const std::function<void(void*)>& deleter);
/* Build an array from a buffer */ /* Build an array from a buffer */
explicit array( explicit array(
allocator::Buffer data, allocator::Buffer data,

View File

@@ -130,7 +130,7 @@ void compiled_allocate_outputs(
// - Donatable // - Donatable
// - Not a constant // - Not a constant
if (in.itemsize() == outputs[o].itemsize() && !is_scalar(in) && if (in.itemsize() == outputs[o].itemsize() && !is_scalar(in) &&
in.is_donatable() && !is_constant(i)) { in.is_donatable() && is_constant(i)) {
outputs[o++].copy_shared_buffer(in); outputs[o++].copy_shared_buffer(in);
} }
// Get representative input flags to properly set non-donated outputs // Get representative input flags to properly set non-donated outputs
@@ -158,7 +158,7 @@ void compiled_allocate_outputs(
// - Not a constant // - Not a constant
if (in.flags().row_contiguous && in.size() == outputs[o].size() && if (in.flags().row_contiguous && in.size() == outputs[o].size() &&
in.itemsize() == outputs[o].itemsize() && in.is_donatable() && in.itemsize() == outputs[o].itemsize() && in.is_donatable() &&
!is_constant(i)) { is_constant(i)) {
outputs[o].copy_shared_buffer( outputs[o].copy_shared_buffer(
in, outputs[o].strides(), in.flags(), in.data_size()); in, outputs[o].strides(), in.flags(), in.data_size());
o++; o++;
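
The conditions in the comments above (matching itemsize, not a scalar, donatable, and the constant-ness check) decide whether a compiled output can reuse an input's buffer. From Python this path is reached through mx.compile; whether a particular input is actually donated is determined by these runtime checks and is not visible at the API level. A minimal sketch:

import mlx.core as mx

def f(x, y):
    return x * y + y

cf = mx.compile(f)                 # compiled graph; outputs may reuse donated inputs
x = mx.random.normal((256, 256))
y = mx.random.normal((256, 256))
out = cf(x, y)
mx.eval(out)
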

View File

@@ -12,167 +12,6 @@ namespace mlx::core {
namespace { namespace {
template <typename T>
complex64_t to_complex(T r, T i) {
return {static_cast<float>(r), static_cast<float>(i)};
}
template <typename T, class Enable = void>
struct EigWork {};
template <typename T>
struct EigWork<
T,
typename std::enable_if<std::is_floating_point<T>::value>::type> {
using O = complex64_t;
char jobl;
char jobr;
int N;
int lwork;
int info;
std::vector<array::Data> buffers;
EigWork(char jobl_, char jobr_, int N_, bool compute_eigenvectors)
: jobl(jobl_), jobr(jobr_), N(N_), lwork(-1) {
T work;
int n_vecs_l = compute_eigenvectors ? N_ : 1;
int n_vecs_r = 1;
geev<T>(
&jobl,
&jobr,
&N,
nullptr,
&N,
nullptr,
nullptr,
nullptr,
&n_vecs_l,
nullptr,
&n_vecs_r,
&work,
&lwork,
&info);
lwork = static_cast<int>(work);
buffers.emplace_back(allocator::malloc(sizeof(T) * N * 2));
if (compute_eigenvectors) {
buffers.emplace_back(allocator::malloc(sizeof(T) * N * N * 2));
}
buffers.emplace_back(allocator::malloc(sizeof(T) * lwork));
}
void run(T* a, O* values, O* vectors) {
auto eig_tmp = static_cast<T*>(buffers[0].buffer.raw_ptr());
T* vec_tmp = nullptr;
if (vectors) {
vec_tmp = static_cast<T*>(buffers[1].buffer.raw_ptr());
}
auto work = static_cast<T*>(buffers.back().buffer.raw_ptr());
int n_vecs_l = vectors ? N : 1;
int n_vecs_r = 1;
geev<T>(
&jobl,
&jobr,
&N,
a,
&N,
eig_tmp,
eig_tmp + N,
vectors ? vec_tmp : nullptr,
&n_vecs_l,
nullptr,
&n_vecs_r,
work,
&lwork,
&info);
for (int i = 0; i < N; ++i) {
values[i] = to_complex(eig_tmp[i], eig_tmp[N + i]);
}
if (vectors) {
for (int i = 0; i < N; ++i) {
if (values[i].imag() != 0) {
for (int j = 0; j < N; ++j) {
vectors[i * N + j] =
to_complex(vec_tmp[i * N + j], -vec_tmp[(i + 1) * N + j]);
vectors[(i + 1) * N + j] =
to_complex(vec_tmp[i * N + j], vec_tmp[(i + 1) * N + j]);
}
i += 1;
} else {
for (int j = 0; j < N; ++j) {
vectors[i * N + j] = to_complex(vec_tmp[i * N + j], T(0.0));
}
}
}
}
}
};
template <>
struct EigWork<std::complex<float>> {
using T = std::complex<float>;
using R = float;
using O = T;
char jobl;
char jobr;
int N;
int lwork;
int lrwork;
int info;
std::vector<array::Data> buffers;
EigWork(char jobl_, char jobr_, int N_, bool compute_eigenvectors)
: jobl(jobl_), jobr(jobr_), N(N_), lwork(-1), lrwork(2 * N_) {
T work;
R rwork;
int n_vecs_l = compute_eigenvectors ? N_ : 1;
int n_vecs_r = 1;
geev<T>(
&jobl,
&jobr,
&N,
nullptr,
&N,
nullptr,
nullptr,
&n_vecs_l,
nullptr,
&n_vecs_r,
&work,
&lwork,
&rwork,
&info);
lwork = static_cast<int>(work.real());
buffers.emplace_back(allocator::malloc(sizeof(T) * lwork));
buffers.emplace_back(allocator::malloc(sizeof(R) * lrwork));
}
void run(T* a, T* values, T* vectors) {
int n_vecs_l = vectors ? N : 1;
int n_vecs_r = 1;
geev<T>(
&jobl,
&jobr,
&N,
a,
&N,
values,
vectors,
&n_vecs_l,
nullptr,
&n_vecs_r,
static_cast<T*>(buffers[0].buffer.raw_ptr()),
&lwork,
static_cast<R*>(buffers[1].buffer.raw_ptr()),
&info);
}
};
template <typename T> template <typename T>
void eig_impl( void eig_impl(
array& a, array& a,
@@ -180,39 +19,101 @@ void eig_impl(
array& values, array& values,
bool compute_eigenvectors, bool compute_eigenvectors,
Stream stream) { Stream stream) {
using OT = std::complex<T>;
auto a_ptr = a.data<T>(); auto a_ptr = a.data<T>();
auto val_ptr = values.data<complex64_t>(); auto eig_ptr = values.data<OT>();
auto& encoder = cpu::get_command_encoder(stream); auto& encoder = cpu::get_command_encoder(stream);
encoder.set_input_array(a); encoder.set_input_array(a);
encoder.set_output_array(values); encoder.set_output_array(values);
complex64_t* vec_ptr = nullptr; OT* vec_ptr = nullptr;
if (compute_eigenvectors) { if (compute_eigenvectors) {
encoder.set_output_array(vectors); encoder.set_output_array(vectors);
vec_ptr = vectors.data<complex64_t>(); vec_ptr = vectors.data<OT>();
} }
encoder.dispatch([a_ptr, encoder.dispatch([a_ptr,
val_ptr,
vec_ptr, vec_ptr,
eig_ptr,
compute_eigenvectors, compute_eigenvectors,
N = vectors.shape(-1), N = vectors.shape(-1),
size = vectors.size()]() mutable { size = vectors.size()]() mutable {
// Work query
char jobr = 'N'; char jobr = 'N';
char jobl = compute_eigenvectors ? 'V' : 'N'; char jobl = compute_eigenvectors ? 'V' : 'N';
int n_vecs_r = 1;
int n_vecs_l = compute_eigenvectors ? N : 1;
int lwork = -1;
int info;
{
T work;
geev<T>(
&jobl,
&jobr,
&N,
nullptr,
&N,
nullptr,
nullptr,
nullptr,
&n_vecs_l,
nullptr,
&n_vecs_r,
&work,
&lwork,
&info);
lwork = static_cast<int>(work);
}
EigWork<T> work(jobl, jobr, N, compute_eigenvectors); auto eig_tmp_data = array::Data{allocator::malloc(sizeof(T) * N * 2)};
auto vec_tmp_data =
array::Data{allocator::malloc(vec_ptr ? sizeof(T) * N * N * 2 : 0)};
auto eig_tmp = static_cast<T*>(eig_tmp_data.buffer.raw_ptr());
auto vec_tmp = static_cast<T*>(vec_tmp_data.buffer.raw_ptr());
auto work_buf = array::Data{allocator::malloc(sizeof(T) * lwork)};
for (size_t i = 0; i < size / (N * N); ++i) { for (size_t i = 0; i < size / (N * N); ++i) {
work.run(a_ptr, val_ptr, vec_ptr); geev<T>(
a_ptr += N * N; &jobl,
val_ptr += N; &jobr,
&N,
a_ptr,
&N,
eig_tmp,
eig_tmp + N,
vec_tmp,
&n_vecs_l,
nullptr,
&n_vecs_r,
static_cast<T*>(work_buf.buffer.raw_ptr()),
&lwork,
&info);
for (int i = 0; i < N; ++i) {
eig_ptr[i] = {eig_tmp[i], eig_tmp[N + i]};
}
if (vec_ptr) { if (vec_ptr) {
for (int i = 0; i < N; ++i) {
if (eig_ptr[i].imag() != 0) {
// This vector and the next are a pair
for (int j = 0; j < N; ++j) {
vec_ptr[i * N + j] = {
vec_tmp[i * N + j], -vec_tmp[(i + 1) * N + j]};
vec_ptr[(i + 1) * N + j] = {
vec_tmp[i * N + j], vec_tmp[(i + 1) * N + j]};
}
i += 1;
} else {
for (int j = 0; j < N; ++j) {
vec_ptr[i * N + j] = {vec_tmp[i * N + j], 0};
}
}
}
vec_ptr += N * N; vec_ptr += N * N;
} }
if (work.info != 0) { a_ptr += N * N;
eig_ptr += N;
if (info != 0) {
std::stringstream msg; std::stringstream msg;
msg << "[Eig::eval_cpu] Eigenvalue decomposition failed with error code " msg << "[Eig::eval_cpu] Eigenvalue decomposition failed with error code "
<< work.info; << info;
throw std::runtime_error(msg.str()); throw std::runtime_error(msg.str());
} }
} }
@@ -264,17 +165,8 @@ void Eig::eval_cpu(
case float32: case float32:
eig_impl<float>(a_copy, vectors, values, compute_eigenvectors_, stream()); eig_impl<float>(a_copy, vectors, values, compute_eigenvectors_, stream());
break; break;
case float64:
eig_impl<double>(
a_copy, vectors, values, compute_eigenvectors_, stream());
break;
case complex64:
eig_impl<std::complex<float>>(
a_copy, vectors, values, compute_eigenvectors_, stream());
break;
default: default:
throw std::runtime_error( throw std::runtime_error("[Eig::eval_cpu] only supports float32.");
"[Eig::eval_cpu] only supports float32, float64, or complex64.");
} }
} }
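
After this hunk the CPU path only handles float32 input, with complex64 eigenvalues and (optionally) eigenvectors written out. A minimal sketch of the Python-level call, assuming the primitive is exposed as mx.linalg.eig and, like the other LAPACK-backed routines, runs on a CPU stream:

import mlx.core as mx

a = mx.array([[0.0, 1.0], [-2.0, -3.0]], dtype=mx.float32)
w, v = mx.linalg.eig(a, stream=mx.cpu)   # assumed binding name and (values, vectors) order
print(w)         # complex64 eigenvalues, here -1 and -2
print(v.dtype)   # complex64, even for real float32 input
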

View File

@@ -747,108 +747,4 @@ void ScatterAxis::eval_cpu(const std::vector<array>& inputs, array& out) {
}); });
} }
template <typename T>
void masked_scatter_impl(const array& mask, const array& src, array& out) {
ContiguousIterator mask_it(mask);
ContiguousIterator src_it(src);
ContiguousIterator out_it(out);
const bool* mask_ptr = mask.data<bool>();
const T* src_ptr = src.data<T>();
T* dst_ptr = out.data<T>();
const size_t batch_count = mask.shape(0);
const size_t mask_batch_size = mask.size() / batch_count;
const size_t src_batch_size = src.size() / batch_count;
for (uint b = 0; b < batch_count; ++b) {
size_t src_consumed = 0;
src_it.seek(b * src_batch_size);
for (size_t i = 0; i < mask_batch_size; ++i) {
if (mask_ptr[mask_it.loc]) {
if (src_consumed >= src_batch_size) {
throw std::runtime_error(
"[MaskedScatter::eval_cpu] Source does not have enough elements for mask.");
}
dst_ptr[out_it.loc] = src_ptr[src_it.loc];
src_it.step();
++src_consumed;
}
mask_it.step();
out_it.step();
}
}
}
void MaskedScatter::eval_cpu(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 3);
auto& dst = inputs[0];
auto& mask = inputs[1];
auto& src = inputs[2];
// Copy src into out (copy allocates memory for out)
auto ctype =
dst.flags().row_contiguous ? CopyType::Vector : CopyType::General;
copy_cpu(dst, out, ctype, stream());
if (mask.size() == 0) {
return;
}
auto& encoder = cpu::get_command_encoder(stream());
encoder.set_input_array(mask);
encoder.set_input_array(src);
encoder.set_output_array(out);
encoder.dispatch([mask = array::unsafe_weak_copy(mask),
src = array::unsafe_weak_copy(src),
out = array::unsafe_weak_copy(out)]() mutable {
switch (out.dtype()) {
case bool_:
masked_scatter_impl<bool>(mask, src, out);
break;
case uint8:
masked_scatter_impl<uint8_t>(mask, src, out);
break;
case uint16:
masked_scatter_impl<uint16_t>(mask, src, out);
break;
case uint32:
masked_scatter_impl<uint32_t>(mask, src, out);
break;
case uint64:
masked_scatter_impl<uint64_t>(mask, src, out);
break;
case int8:
masked_scatter_impl<int8_t>(mask, src, out);
break;
case int16:
masked_scatter_impl<int16_t>(mask, src, out);
break;
case int32:
masked_scatter_impl<int32_t>(mask, src, out);
break;
case int64:
masked_scatter_impl<int64_t>(mask, src, out);
break;
case float16:
masked_scatter_impl<float16_t>(mask, src, out);
break;
case float32:
masked_scatter_impl<float>(mask, src, out);
break;
case float64:
masked_scatter_impl<double>(mask, src, out);
break;
case bfloat16:
masked_scatter_impl<bfloat16_t>(mask, src, out);
break;
case complex64:
masked_scatter_impl<complex64_t>(mask, src, out);
break;
}
});
}
} // namespace mlx::core } // namespace mlx::core
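
With the MaskedScatter CPU kernel removed above, packing a 1-D source into the True positions of a mask can still be expressed with integer indices computed on the host. A sketch that leans on NumPy for the nonzero step (values are illustrative):

import numpy as np
import mlx.core as mx

dst = mx.zeros((6,))
mask = np.array([True, False, True, False, False, True])
src = mx.array([1.0, 2.0, 3.0])

idx = mx.array(np.flatnonzero(mask))  # positions of the True entries
dst[idx] = src                        # integer-index assignment, i.e. a scatter
# dst is now [1, 0, 2, 0, 0, 3]
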

View File

@@ -45,7 +45,9 @@
INSTANTIATE_LAPACK_REAL(geqrf) INSTANTIATE_LAPACK_REAL(geqrf)
INSTANTIATE_LAPACK_REAL(orgqr) INSTANTIATE_LAPACK_REAL(orgqr)
INSTANTIATE_LAPACK_REAL(syevd) INSTANTIATE_LAPACK_REAL(syevd)
INSTANTIATE_LAPACK_REAL(geev)
INSTANTIATE_LAPACK_REAL(potrf) INSTANTIATE_LAPACK_REAL(potrf)
INSTANTIATE_LAPACK_REAL(gesdd)
INSTANTIATE_LAPACK_REAL(getrf) INSTANTIATE_LAPACK_REAL(getrf)
INSTANTIATE_LAPACK_REAL(getri) INSTANTIATE_LAPACK_REAL(getri)
INSTANTIATE_LAPACK_REAL(trtri) INSTANTIATE_LAPACK_REAL(trtri)
@@ -61,20 +63,3 @@ INSTANTIATE_LAPACK_REAL(trtri)
} }
INSTANTIATE_LAPACK_COMPLEX(heevd) INSTANTIATE_LAPACK_COMPLEX(heevd)
#define INSTANTIATE_LAPACK_ALL(FUNC) \
template <typename T, typename... Args> \
void FUNC(Args... args) { \
if constexpr (std::is_same_v<T, float>) { \
MLX_LAPACK_FUNC(s##FUNC)(std::forward<Args>(args)...); \
} else if constexpr (std::is_same_v<T, double>) { \
MLX_LAPACK_FUNC(d##FUNC)(std::forward<Args>(args)...); \
} else if constexpr (std::is_same_v<T, std::complex<float>>) { \
MLX_LAPACK_FUNC(c##FUNC)(std::forward<Args>(args)...); \
} else if constexpr (std::is_same_v<T, std::complex<double>>) { \
MLX_LAPACK_FUNC(z##FUNC)(std::forward<Args>(args)...); \
} \
}
INSTANTIATE_LAPACK_ALL(geev)
INSTANTIATE_LAPACK_ALL(gesdd)

View File

@@ -291,17 +291,6 @@ void RandomBits::eval_cpu(const std::vector<array>& inputs, array& out) {
num_keys, num_keys,
kshape = keys.shape(), kshape = keys.shape(),
kstrides = keys.strides()]() mutable { kstrides = keys.strides()]() mutable {
auto copy_remaining = [&](char* cptr, size_t loc, uint32_t v) {
if (4 * loc + 4 <= bytes_per_key) {
reinterpret_cast<uint32_t*>(cptr)[loc] = v;
} else {
std::copy(
reinterpret_cast<char*>(&v),
reinterpret_cast<char*>(&v) + bytes_per_key - 4 * loc,
cptr + 4 * loc);
}
};
size_t out_skip = (bytes_per_key + 4 - 1) / 4; size_t out_skip = (bytes_per_key + 4 - 1) / 4;
auto half_size = out_skip / 2; auto half_size = out_skip / 2;
bool even = out_skip % 2 == 0; bool even = out_skip % 2 == 0;
@@ -321,12 +310,18 @@ void RandomBits::eval_cpu(const std::vector<array>& inputs, array& out) {
if (count.first < half_size) { if (count.first < half_size) {
auto rb = random::threefry2x32_hash(key, count); auto rb = random::threefry2x32_hash(key, count);
ptr[count.first++] = rb.first; ptr[count.first++] = rb.first;
copy_remaining(cptr, count.second, rb.second); if (bytes_per_key % 4 > 0) {
std::copy(
reinterpret_cast<char*>(&rb.second),
reinterpret_cast<char*>(&rb.second) + bytes_per_key % 4,
cptr + 4 * count.second);
} else {
ptr[count.second] = rb.second;
}
} }
if (!even) { if (!even) {
count.second = 0; count.second = 0;
copy_remaining( ptr[half_size] = random::threefry2x32_hash(key, count).first;
cptr, half_size, random::threefry2x32_hash(key, count).first);
} }
} }
}); });
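
The tail handling restored above covers outputs whose bytes-per-key count is not a multiple of 4. From Python, sub-word widths exercise that path; a sketch assuming random.bits takes a byte width as in the public API:

import mlx.core as mx

key = mx.random.key(0)
u8 = mx.random.bits((5,), width=1, key=key)    # uint8 output
u16 = mx.random.bits((5,), width=2, key=key)   # uint16 output
print(u8.dtype, u16.dtype)
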

View File

@@ -3,9 +3,5 @@
#include "mlx/backend/cpu/simd/base_simd.h" #include "mlx/backend/cpu/simd/base_simd.h"
#ifdef MLX_USE_ACCELERATE #ifdef MLX_USE_ACCELERATE
#if defined(__x86_64__)
// the accelerate_simd implementation require neon -- use base implementation
#else
#include "mlx/backend/cpu/simd/accelerate_simd.h" #include "mlx/backend/cpu/simd/accelerate_simd.h"
#endif #endif
#endif

View File

@@ -8,183 +8,6 @@
namespace mlx::core { namespace mlx::core {
template <typename T, class Enable = void>
struct SVDWork {};
template <typename T>
struct SVDWork<
T,
typename std::enable_if<std::is_floating_point<T>::value>::type> {
using R = T;
int N;
int M;
int K;
int lda;
int ldu;
int ldvt;
char jobz;
std::vector<array::Data> buffers;
int lwork;
SVDWork(int N, int M, int K, char jobz)
: N(N), M(M), K(K), lda(N), ldu(N), ldvt(M), jobz(jobz) {
T workspace_dimension = 0;
// Will contain the indices of eigenvectors that failed to converge (not
// used here but required by lapack).
buffers.emplace_back(allocator::malloc(sizeof(int) * 8 * K));
int lwork_query = -1;
int info;
// Compute workspace size.
gesdd<T>(
/* jobz = */ &jobz,
// M and N are swapped since lapack expects column-major.
/* m = */ &N,
/* n = */ &M,
/* a = */ nullptr,
/* lda = */ &lda,
/* s = */ nullptr,
/* u = */ nullptr,
/* ldu = */ &ldu,
/* vt = */ nullptr,
/* ldvt = */ &ldvt,
/* work = */ &workspace_dimension,
/* lwork = */ &lwork_query,
/* iwork = */ static_cast<int*>(buffers[0].buffer.raw_ptr()),
/* info = */ &info);
if (info != 0) {
std::stringstream ss;
ss << "[SVD::eval_cpu] workspace calculation failed with code " << info;
throw std::runtime_error(ss.str());
}
lwork = workspace_dimension;
buffers.emplace_back(allocator::malloc(sizeof(T) * lwork));
}
void run(T* a, R* s, T* u, T* vt) {
int info;
gesdd<T>(
/* jobz = */ &jobz,
// M and N are swapped since lapack expects column-major.
/* m = */ &N,
/* n = */ &M,
/* a = */ a,
/* lda = */ &lda,
/* s = */ s,
// According to the identity above, lapack will write Vᵀᵀ as U.
/* u = */ u,
/* ldu = */ &ldu,
// According to the identity above, lapack will write Uᵀ as Vᵀ.
/* vt = */ vt,
/* ldvt = */ &ldvt,
/* work = */ static_cast<T*>(buffers[1].buffer.raw_ptr()),
/* lwork = */ &lwork,
/* iwork = */ static_cast<int*>(buffers[0].buffer.raw_ptr()),
/* info = */ &info);
if (info != 0) {
std::stringstream ss;
ss << "svd_impl: sgesvdx_ failed with code " << info;
throw std::runtime_error(ss.str());
}
}
};
template <>
struct SVDWork<std::complex<float>> {
using T = std::complex<float>;
using R = float;
int N;
int M;
int K;
int lda;
int ldu;
int ldvt;
char jobz;
std::vector<array::Data> buffers;
int lwork;
SVDWork(int N, int M, int K, char jobz)
: N(N), M(M), K(K), lda(N), ldu(N), ldvt(M), jobz(jobz) {
T workspace_dimension = 0;
// Will contain the indices of eigenvectors that failed to converge (not
// used here but required by lapack).
buffers.emplace_back(allocator::malloc(sizeof(int) * 8 * K));
const int lrwork =
jobz == 'A' ? std::max(1, 5 * K * K + 5 * K) : std::max(1, 7 * K);
buffers.emplace_back(allocator::malloc(sizeof(float) * lrwork));
int lwork_query = -1;
int work_query = -1;
int info;
// Compute workspace size.
gesdd<T>(
/* jobz = */ &jobz,
// M and N are swapped since lapack expects column-major.
/* m = */ &N,
/* n = */ &M,
/* a = */ nullptr,
/* lda = */ &lda,
/* s = */ nullptr,
/* u = */ nullptr,
/* ldu = */ &ldu,
/* vt = */ nullptr,
/* ldvt = */ &ldvt,
/* work = */ &workspace_dimension,
/* lwork = */ &lwork_query,
/* rwork = */ static_cast<float*>(buffers[1].buffer.raw_ptr()),
/* iwork = */ static_cast<int*>(buffers[0].buffer.raw_ptr()),
/* info = */ &info);
if (info != 0) {
std::stringstream ss;
ss << "[SVD::eval_cpu] workspace calculation failed with code " << info;
throw std::runtime_error(ss.str());
}
lwork = workspace_dimension.real();
buffers.emplace_back(allocator::malloc(sizeof(T) * lwork));
}
void run(T* a, R* s, T* u, T* vt) {
int info;
gesdd<T>(
/* jobz = */ &jobz,
// M and N are swapped since lapack expects column-major.
/* m = */ &N,
/* n = */ &M,
/* a = */ a,
/* lda = */ &lda,
/* s = */ s,
// According to the identity above, lapack will write Vᵀᵀ as U.
/* u = */ u,
/* ldu = */ &ldu,
// According to the identity above, lapack will write Uᵀ as Vᵀ.
/* vt = */ vt,
/* ldvt = */ &ldvt,
/* work = */ static_cast<T*>(buffers[2].buffer.raw_ptr()),
/* lwork = */ &lwork,
/* rwork = */ static_cast<float*>(buffers[1].buffer.raw_ptr()),
/* iwork = */ static_cast<int*>(buffers[0].buffer.raw_ptr()),
/* info = */ &info);
if (info != 0) {
std::stringstream ss;
ss << "svd_impl: sgesvdx_ failed with code " << info;
throw std::runtime_error(ss.str());
}
}
};
template <typename T> template <typename T>
void svd_impl( void svd_impl(
const array& a, const array& a,
@@ -204,8 +27,6 @@ void svd_impl(
const int N = a.shape(-1); const int N = a.shape(-1);
const int K = std::min(M, N); const int K = std::min(M, N);
using R = typename SVDWork<T>::R;
size_t num_matrices = a.size() / (M * N); size_t num_matrices = a.size() / (M * N);
// lapack clobbers the input, so we have to make a copy. // lapack clobbers the input, so we have to make a copy.
@@ -221,7 +42,7 @@ void svd_impl(
encoder.set_input_array(a); encoder.set_input_array(a);
auto in_ptr = in.data<T>(); auto in_ptr = in.data<T>();
T* u_ptr; T* u_ptr;
R* s_ptr; T* s_ptr;
T* vt_ptr; T* vt_ptr;
if (compute_uv) { if (compute_uv) {
@@ -237,7 +58,7 @@ void svd_impl(
encoder.set_output_array(s); encoder.set_output_array(s);
encoder.set_output_array(vt); encoder.set_output_array(vt);
s_ptr = s.data<R>(); s_ptr = s.data<T>();
u_ptr = u.data<T>(); u_ptr = u.data<T>();
vt_ptr = vt.data<T>(); vt_ptr = vt.data<T>();
} else { } else {
@@ -247,26 +68,96 @@ void svd_impl(
encoder.set_output_array(s); encoder.set_output_array(s);
s_ptr = s.data<R>(); s_ptr = s.data<T>();
u_ptr = nullptr; u_ptr = nullptr;
vt_ptr = nullptr; vt_ptr = nullptr;
} }
encoder.dispatch([in_ptr, u_ptr, s_ptr, vt_ptr, M, N, K, num_matrices]() { encoder.dispatch([in_ptr, u_ptr, s_ptr, vt_ptr, M, N, K, num_matrices]() {
auto jobz = (u_ptr) ? 'A' : 'N'; // A of shape M x N. The leading dimension is N since lapack receives Aᵀ.
SVDWork<T> svd_work(N, M, K, jobz); const int lda = N;
// U of shape M x M. (N x N in lapack).
const int ldu = N;
// Vᵀ of shape N x N. (M x M in lapack).
const int ldvt = M;
auto jobz = (u_ptr) ? "A" : "N";
T workspace_dimension = 0;
// Will contain the indices of eigenvectors that failed to converge (not
// used here but required by lapack).
auto iwork = array::Data{allocator::malloc(sizeof(int) * 8 * K)};
static const int lwork_query = -1;
int info;
// Compute workspace size.
gesdd<T>(
/* jobz = */ jobz,
// M and N are swapped since lapack expects column-major.
/* m = */ &N,
/* n = */ &M,
/* a = */ nullptr,
/* lda = */ &lda,
/* s = */ nullptr,
/* u = */ nullptr,
/* ldu = */ &ldu,
/* vt = */ nullptr,
/* ldvt = */ &ldvt,
/* work = */ &workspace_dimension,
/* lwork = */ &lwork_query,
/* iwork = */ static_cast<int*>(iwork.buffer.raw_ptr()),
/* info = */ &info);
if (info != 0) {
std::stringstream ss;
ss << "[SVD::eval_cpu] workspace calculation failed with code " << info;
throw std::runtime_error(ss.str());
}
const int lwork = workspace_dimension;
auto scratch = array::Data{allocator::malloc(sizeof(T) * lwork)};
// Loop over matrices. // Loop over matrices.
for (int i = 0; i < num_matrices; i++) { for (int i = 0; i < num_matrices; i++) {
svd_work.run( gesdd<T>(
in_ptr + M * N * i, /* jobz = */ jobz,
s_ptr + K * i, // M and N are swapped since lapack expects column-major.
vt_ptr ? vt_ptr + N * N * i : nullptr, /* m = */ &N,
u_ptr ? u_ptr + M * M * i : nullptr); /* n = */ &M,
/* a = */ in_ptr + M * N * i,
/* lda = */ &lda,
/* s = */ s_ptr + K * i,
// According to the identity above, lapack will write Vᵀᵀ as U.
/* u = */ vt_ptr ? vt_ptr + N * N * i : nullptr,
/* ldu = */ &ldu,
// According to the identity above, lapack will write Uᵀ as Vᵀ.
/* vt = */ u_ptr ? u_ptr + M * M * i : nullptr,
/* ldvt = */ &ldvt,
/* work = */ static_cast<T*>(scratch.buffer.raw_ptr()),
/* lwork = */ &lwork,
/* iwork = */ static_cast<int*>(iwork.buffer.raw_ptr()),
/* info = */ &info);
if (info != 0) {
std::stringstream ss;
ss << "svd_impl: sgesvdx_ failed with code " << info;
throw std::runtime_error(ss.str());
}
} }
}); });
encoder.add_temporary(in); encoder.add_temporary(in);
} }
template <typename T>
void compute_svd(
const array& a,
bool compute_uv,
std::vector<array>& outputs,
Stream stream) {}
void SVD::eval_cpu( void SVD::eval_cpu(
const std::vector<array>& inputs, const std::vector<array>& inputs,
std::vector<array>& outputs) { std::vector<array>& outputs) {
@@ -277,12 +168,9 @@ void SVD::eval_cpu(
case float64: case float64:
svd_impl<double>(inputs[0], outputs, compute_uv_, stream()); svd_impl<double>(inputs[0], outputs, compute_uv_, stream());
break; break;
case complex64:
svd_impl<std::complex<float>>(inputs[0], outputs, compute_uv_, stream());
break;
default: default:
throw std::runtime_error( throw std::runtime_error(
"[SVD::eval_cpu] only supports float32, float64, or complex64."); "[SVD::eval_cpu] only supports float32 or float64.");
} }
} }
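For reference, the "M and N are swapped since lapack expects column-major" and "lapack will write Vᵀᵀ as U" comments in this file both follow from the transpose identity (a general note, not part of the diff):

\[ A = U \Sigma V^{\top} \;\Longleftrightarrow\; A^{\top} = V \Sigma U^{\top} \]

Handing a row-major buffer of A to a column-major LAPACK routine is the same as handing it Aᵀ, so gesdd effectively factors Aᵀ: the buffer it fills as "U" holds V, the buffer it fills as "Vᵀ" holds Uᵀ, and the singular values are unchanged. That is why the call sites swap M and N and swap the u/vt output pointers.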

View File

@@ -123,21 +123,10 @@ if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8.0)
     mlx PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:--compress-mode=size>")
 endif()
 
-# Use native CUDA arch by default.
+# Compute capability >= 7.0 is required for synchronization between CPU/GPU with
+# managed memory.
 if(NOT DEFINED MLX_CUDA_ARCHITECTURES)
-  execute_process(
-    COMMAND __nvcc_device_query
-    OUTPUT_VARIABLE MLX_CUDA_ARCHITECTURES
-    OUTPUT_STRIP_TRAILING_WHITESPACE)
-  set(UPGRADABLE_ARCHITECTURES "90;100;121")
-  if(MLX_CUDA_ARCHITECTURES STREQUAL "")
-    message(
-      FATAL_ERROR
-        "Can not get native CUDA arch, must set MLX_CUDA_ARCHITECTURES")
-  elseif(MLX_CUDA_ARCHITECTURES IN_LIST UPGRADABLE_ARCHITECTURES)
-    # Use arch-specific compute capability whenever possible.
-    set(MLX_CUDA_ARCHITECTURES "${MLX_CUDA_ARCHITECTURES}a")
-  endif()
+  set(MLX_CUDA_ARCHITECTURES "native")
 endif()
 message(STATUS "CUDA architectures: ${MLX_CUDA_ARCHITECTURES}")
 set_target_properties(mlx PROPERTIES CUDA_ARCHITECTURES
@@ -149,7 +138,6 @@ FetchContent_Declare(
   URL "https://github.com/NVIDIA/cccl/releases/download/v2.8.1/cccl-v2.8.1.zip")
 FetchContent_MakeAvailable(cccl)
 target_include_directories(mlx BEFORE PRIVATE "${cccl_SOURCE_DIR}/include")
-set_target_properties(mlx PROPERTIES CCCL_DIR "${cccl_SOURCE_DIR}/include")
 
 # Use fixed version of NVTX.
 FetchContent_Declare(
@@ -175,7 +163,7 @@ target_link_libraries(mlx PRIVATE CUDA::nvrtc CUDA::cuda_driver)
 FetchContent_Declare(
   cudnn
   GIT_REPOSITORY https://github.com/NVIDIA/cudnn-frontend.git
-  GIT_TAG v1.16.0
+  GIT_TAG v1.14.0
   GIT_SHALLOW TRUE
   EXCLUDE_FROM_ALL)
 set(CUDNN_FRONTEND_SKIP_JSON_LIB ON)

View File

@@ -20,19 +20,6 @@ constexpr int page_size = 16384;
 // Any allocations smaller than this will try to use the small pool
 constexpr int small_block_size = 8;
 
-#if CUDART_VERSION >= 13000
-inline cudaMemLocation cuda_mem_loc(int i) {
-  cudaMemLocation loc;
-  loc.type = cudaMemLocationTypeDevice;
-  loc.id = i;
-  return loc;
-}
-#else
-inline int cuda_mem_loc(int i) {
-  return i;
-}
-#endif // CUDART_VERSION >= 13000
-
 // The small pool size in bytes. This should be a multiple of the host page
 // size and small_block_size.
 constexpr int small_pool_size = 4 * page_size;
@@ -48,7 +35,13 @@ SmallSizePool::SmallSizePool() {
   int device_count = 0;
   CHECK_CUDA_ERROR(cudaGetDeviceCount(&device_count));
   for (int i = 0; i < device_count; ++i) {
-    auto loc = cuda_mem_loc(i);
+#if CUDART_VERSION >= 13000
+    cudaMemLocation loc;
+    loc.type = cudaMemLocationTypeDevice;
+    loc.id = i;
+#else
+    int loc = i;
+#endif // CUDART_VERSION >= 13000
     CHECK_CUDA_ERROR(
         cudaMemAdvise(data_, small_pool_size, cudaMemAdviseSetAccessedBy, loc));
   }
@@ -97,10 +90,9 @@ CudaAllocator::CudaAllocator()
           page_size,
           [](CudaBuffer* buf) { return buf->size; },
           [this](CudaBuffer* buf) { cuda_free(buf); }) {
-  size_t free;
-  CHECK_CUDA_ERROR(cudaMemGetInfo(&free, &total_memory_));
-  memory_limit_ = total_memory_ * 0.95;
-  free_limit_ = total_memory_ - memory_limit_;
+  size_t free, total;
+  CHECK_CUDA_ERROR(cudaMemGetInfo(&free, &total));
+  memory_limit_ = total * 0.95;
   max_pool_size_ = memory_limit_;
 
   int device_count = 0;
@@ -112,10 +104,6 @@ CudaAllocator::CudaAllocator()
     cudaStream_t s;
     CHECK_CUDA_ERROR(cudaStreamCreateWithFlags(&s, cudaStreamNonBlocking));
     free_streams_.push_back(s);
-    cudaMemPool_t mem_pool;
-    CHECK_CUDA_ERROR(cudaDeviceGetDefaultMemPool(&mem_pool, i));
-    mem_pools_.push_back(mem_pool);
-
   }
   CHECK_CUDA_ERROR(cudaSetDevice(curr));
 }
@@ -131,8 +119,7 @@ void copy_to_managed(CudaBuffer& buf) {
   buf.data = new_data;
 }
 
-Buffer
-CudaAllocator::malloc_async(size_t size, int device, cudaStream_t stream) {
+Buffer CudaAllocator::malloc_impl(size_t size, cudaStream_t stream) {
   if (size == 0) {
     return Buffer{new CudaBuffer{nullptr, 0, -1}};
   }
@@ -147,8 +134,9 @@ CudaAllocator::malloc_async(size_t size, int device, cudaStream_t stream) {
     size = page_size * ((size + page_size - 1) / page_size);
   }
 
-  if (size <= small_block_size || stream == nullptr) {
-    device = -1;
+  int device = -1;
+  if (size > small_block_size && stream != nullptr) {
+    CHECK_CUDA_ERROR(cudaStreamGetDevice(stream, &device));
   }
 
   CudaBuffer* buf = buffer_cache_.reuse_from_cache(size);
@@ -166,35 +154,19 @@ CudaAllocator::malloc_async(size_t size, int device, cudaStream_t stream) {
   }
   lock.unlock();
   if (!buf) {
-    void* data = nullptr;
+    buf = new CudaBuffer{nullptr, size, device};
+    cudaError_t err;
     if (device == -1) {
-      CHECK_CUDA_ERROR(cudaMallocManaged(&data, size));
+      err = cudaMallocManaged(&buf->data, size);
    } else {
-      CHECK_CUDA_ERROR(cudaMallocAsync(&data, size, stream));
+      err = cudaMallocAsync(&buf->data, size, stream);
    }
-    if (!data) {
-      std::ostringstream msg;
-      msg << "[malloc] Unable to allocate " << size << " bytes.";
-      throw std::runtime_error(msg.str());
+    if (err != cudaSuccess && err != cudaErrorMemoryAllocation) {
+      throw std::runtime_error(fmt::format(
+          "cudaMallocManaged failed: {}.", cudaGetErrorString(err)));
    }
-    buf = new CudaBuffer{data, size, device};
   }
   lock.lock();
-
-  // If any cuda memory pool has too much reserved memory, clear some
-  // memory from the cache. This prevents graph / kernel execution failing
-  // from OOM
-  if (get_cache_memory() > 0) {
-    for (auto p : mem_pools_) {
-      size_t used = 0;
-      CHECK_CUDA_ERROR(cudaMemPoolGetAttribute(
-          p, cudaMemPoolAttrReservedMemCurrent, &used));
-      if (used > (total_memory_ - free_limit_)) {
-        buffer_cache_.release_cached_buffers(free_limit_);
-        break;
-      }
-    }
-  }
 
   active_memory_ += buf->size;
   peak_memory_ = std::max(active_memory_, peak_memory_);
@@ -204,14 +176,18 @@ CudaAllocator::malloc_async(size_t size, int device, cudaStream_t stream) {
     buffer_cache_.release_cached_buffers(get_cache_memory() - max_pool_size_);
   }
 
   // Copy to managed here if the buffer is not on the right device
-  if (buf->device >= 0 && buf->device != device) {
+  if (buf->device != device) {
     copy_to_managed(*buf);
   }
   return Buffer{buf};
 }
 
+Buffer CudaAllocator::malloc_async(size_t size, cudaStream_t stream) {
+  return malloc_impl(size, stream);
+}
+
 Buffer CudaAllocator::malloc(size_t size) {
-  return malloc_async(size, -1, nullptr);
+  return malloc_impl(size, nullptr);
 }
 
 void CudaAllocator::free(Buffer buffer) {
@@ -247,9 +223,9 @@ void CudaAllocator::cuda_free(CudaBuffer* buf) {
     scalar_pool_.free(buf);
   } else {
     if (buf->device >= 0) {
-      CHECK_CUDA_ERROR(cudaFreeAsync(buf->data, free_streams_[buf->device]));
+      cudaFreeAsync(buf->data, free_streams_[buf->device]);
    } else {
-      CHECK_CUDA_ERROR(cudaFree(buf->data));
+      cudaFree(buf->data);
    }
     delete buf;
   }
@@ -301,9 +277,8 @@ CudaAllocator& allocator() {
   return *allocator_;
 }
 
-Buffer malloc_async(size_t size, CommandEncoder& encoder) {
-  auto buffer = allocator().malloc_async(
-      size, encoder.device().cuda_device(), encoder.stream());
+Buffer malloc_async(size_t size, cudaStream_t stream) {
+  auto buffer = allocator().malloc_async(size, stream);
   if (size && !buffer.ptr()) {
     std::ostringstream msg;
     msg << "[malloc_async] Unable to allocate " << size << " bytes.";

View File

@@ -13,8 +13,6 @@
 namespace mlx::core::cu {
 
-class CommandEncoder;
-
 using allocator::Buffer;
 
 // Stores cuda-managed unified memory.
@@ -50,7 +48,7 @@ class SmallSizePool {
 class CudaAllocator : public allocator::Allocator {
  public:
   Buffer malloc(size_t size) override;
-  Buffer malloc_async(size_t size, int device, cudaStream_t stream);
+  Buffer malloc_async(size_t size, cudaStream_t stream);
   void free(Buffer buffer) override;
   size_t size(Buffer buffer) const override;
@@ -64,6 +62,7 @@ class CudaAllocator : public allocator::Allocator {
   void clear_cache();
 
  private:
+  Buffer malloc_impl(size_t size, cudaStream_t stream);
   void cuda_free(CudaBuffer* buf);
 
   CudaAllocator();
@@ -71,19 +70,16 @@ class CudaAllocator : public allocator::Allocator {
 
   std::mutex mutex_;
   size_t memory_limit_;
-  size_t free_limit_;
-  size_t total_memory_;
   size_t max_pool_size_;
   BufferCache<CudaBuffer> buffer_cache_;
   size_t active_memory_{0};
   size_t peak_memory_{0};
   std::vector<cudaStream_t> free_streams_;
-  std::vector<cudaMemPool_t> mem_pools_;
   SmallSizePool scalar_pool_;
 };
 
 CudaAllocator& allocator();
 
-Buffer malloc_async(size_t size, CommandEncoder& encoder);
+Buffer malloc_async(size_t size, cudaStream_t stream);
 
 } // namespace mlx::core::cu

View File

@@ -42,7 +42,7 @@ void Arange::eval_gpu(const std::vector<array>& inputs, array& out) {
     return;
   }
   auto& encoder = cu::get_command_encoder(stream());
-  out.set_data(cu::malloc_async(out.nbytes(), encoder));
+  out.set_data(cu::malloc_async(out.nbytes(), encoder.stream()));
   encoder.set_output_array(out);
 
   dispatch_int_float_types(out.dtype(), "Arange", [&](auto type_tag) {

View File

@@ -143,7 +143,7 @@ void ArgReduce::eval_gpu(const std::vector<array>& inputs, array& out) {
   auto& s = stream();
   auto& encoder = cu::get_command_encoder(s);
-  out.set_data(cu::malloc_async(out.nbytes(), encoder));
+  out.set_data(cu::malloc_async(out.nbytes(), encoder.stream()));
 
   // Prepare the shapes, strides and axis arguments.
   Shape shape = remove_index(in.shape(), axis_);

View File

@@ -367,8 +367,9 @@ void binary_op_gpu(
   auto bopt = get_binary_op_type(a, b);
   auto& encoder = cu::get_command_encoder(s);
-  set_binary_op_output_data(
-      a, b, out, bopt, [&](auto n) { return cu::malloc_async(n, encoder); });
+  set_binary_op_output_data(a, b, out, bopt, [&](auto n) {
+    return cu::malloc_async(n, encoder.stream());
+  });
   binary_op_gpu_inplace<Op>(inputs, out, op, s);
 }

View File

@@ -246,10 +246,12 @@ void binary_two_op_gpu_inplace(
   auto& out_b = outputs[1];
   auto bopt = get_binary_op_type(a, b);
   auto& encoder = cu::get_command_encoder(s);
-  set_binary_op_output_data(
-      a, b, out_a, bopt, [&](auto n) { return cu::malloc_async(n, encoder); });
-  set_binary_op_output_data(
-      a, b, out_b, bopt, [&](auto n) { return cu::malloc_async(n, encoder); });
+  set_binary_op_output_data(a, b, out_a, bopt, [&](auto n) {
+    return cu::malloc_async(n, encoder.stream());
+  });
+  set_binary_op_output_data(a, b, out_b, bopt, [&](auto n) {
+    return cu::malloc_async(n, encoder.stream());
+  });
 
   if (out_a.size() == 0) {
     return;

View File

@@ -298,7 +298,7 @@ void Compiled::eval_gpu(
   // Put outputs.
   compiled_allocate_outputs(
       inputs, outputs, is_constant_, contiguous, [&](auto n) {
-        return cu::malloc_async(n, encoder);
+        return cu::malloc_async(n, encoder.stream());
       });
 
   for (auto& x : outputs) {
     args.append(x);

View File

@@ -15,16 +15,19 @@ namespace mlx::core {
namespace { namespace {
enum ConvBackendType { // Alias for better readability.
CONV_FALLBACK, #define CONV_FORWARD CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR
CONV_FORWARD, #define CONV_BACKWARD_INPUT \
CONV_BACKWARD_INPUT, CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR
CONV_BACKWARD_WEIGHT, #define CONV_BACKWARD_WEIGHT \
}; CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_FILTER_DESCRIPTOR
// Custom placeholder representing fallback kernel.
#define CONV_FALLBACK static_cast<cudnnBackendDescriptorType_t>(-1)
struct ConvCacheKey { struct ConvCacheKey {
int device_id; int device_id;
fe::DataType_t cudnn_dtype; cudnnDataType_t cudnn_dtype;
std::array<int, MAX_NDIM> input_shape; std::array<int, MAX_NDIM> input_shape;
std::array<int, MAX_NDIM> weight_shape; std::array<int, MAX_NDIM> weight_shape;
std::array<int, MAX_NDIM> stride; std::array<int, MAX_NDIM> stride;
@@ -41,13 +44,15 @@ struct ConvCacheKey {
auto& conv_cache() { auto& conv_cache() {
static LRUBytesKeyCache< static LRUBytesKeyCache<
ConvCacheKey, ConvCacheKey,
std::pair<ConvBackendType, std::optional<DnnGraph>>> std::pair<
cudnnBackendDescriptorType_t,
std::optional<cudnn_frontend::ExecutionPlan>>>
cache("MLX_CUDA_CONV_CACHE_SIZE", /* default_capacity */ 128); cache("MLX_CUDA_CONV_CACHE_SIZE", /* default_capacity */ 128);
return cache; return cache;
} }
auto get_conv_settings( auto get_conv_op_settings(
ConvBackendType backend_type, cudnnBackendDescriptorType_t backend_type,
array& x, array& x,
array& w, array& w,
array& y, array& y,
@@ -63,8 +68,8 @@ auto get_conv_settings(
for (int i = 0; i < padding_lo.size(); ++i) { for (int i = 0; i < padding_lo.size(); ++i) {
int wt_size = 1 + kernel_dilation[i] * (w.shape(1 + i) - 1); int wt_size = 1 + kernel_dilation[i] * (w.shape(1 + i) - 1);
padding_lo[i] = wt_size - padding_lo[i] - 1; padding_lo[i] = wt_size - padding_lo[i] - 1;
int in_size = 1 + kernel_strides[i] * (y.shape(1 + i) - 1); int in_size = 1 + kernel_strides[i] * (x.shape(1 + i) - 1);
int out_size = 1 + input_dilation[i] * (x.shape(1 + i) - 1); int out_size = 1 + input_dilation[i] * (y.shape(1 + i) - 1);
padding_hi[i] = out_size - in_size + padding_hi[i]; padding_hi[i] = out_size - in_size + padding_hi[i];
} }
return std::make_tuple( return std::make_tuple(
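To make the padding recomputation in the hunk above concrete, here is a worked example with made-up sizes (the two sides of the diff disagree on whether in_size/out_size read their extents from x or from y, so the spatial sizes below are named generically):

    #include <cstdio>

    int main() {
      // Illustrative numbers only.
      int kernel = 3, kernel_dilation = 1, kernel_stride = 2, input_dilation = 1;
      int padding_lo = 1, padding_hi = 1;
      int in_spatial = 5, out_spatial = 10;

      int wt_size = 1 + kernel_dilation * (kernel - 1);      // 3
      padding_lo = wt_size - padding_lo - 1;                 // 3 - 1 - 1 = 1
      int in_size = 1 + kernel_stride * (in_spatial - 1);    // 1 + 2 * 4 = 9
      int out_size = 1 + input_dilation * (out_spatial - 1); // 10
      padding_hi = out_size - in_size + padding_hi;          // 10 - 9 + 1 = 2
      std::printf("padding_lo=%d padding_hi=%d\n", padding_lo, padding_hi);
      return 0;
    }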
@@ -90,57 +95,49 @@ auto get_conv_settings(
} }
} }
std::optional<DnnGraph> build_conv_graph( std::optional<cudnn_frontend::OperationGraph> build_conv_op_graph(
cu::CommandEncoder& encoder, cu::CommandEncoder& encoder,
ConvBackendType backend_type, cudnnBackendDescriptorType_t backend_type,
Dtype dtype, Dtype dtype,
array& x, array& x,
array& w, array& w,
array& y, array& y,
const std::vector<int64_t>& stride, const SmallVector<int64_t>& stride,
const std::vector<int64_t>& padding_lo, const SmallVector<int64_t>& padding_lo,
const std::vector<int64_t>& padding_hi, const SmallVector<int64_t>& padding_hi,
const std::vector<int64_t>& dilation) { const SmallVector<int64_t>& dilation) {
auto compute_dtype = try {
(dtype == float16 || dtype == bfloat16) ? float32 : dtype; auto compute_dtype = (dtype == float16 || dtype == bfloat16)
DnnGraph graph(encoder.device().cudnn_handle(), dtype, compute_dtype); ? CUDNN_DATA_FLOAT
auto x_ = graph.tensor_nchw("X", 'x', x); : dtype_to_cudnn_type(dtype);
auto w_ = graph.tensor_nchw("W", 'w', w); auto conv_desc = cudnn_frontend::ConvDescBuilder()
.setDataType(compute_dtype)
.setMathMode(CUDNN_CROSS_CORRELATION)
.setNDims(stride.size())
.setStrides(stride.size(), stride.data())
.setPrePadding(padding_lo.size(), padding_lo.data())
.setPostPadding(padding_hi.size(), padding_hi.data())
.setDilation(dilation.size(), dilation.data())
.build();
auto set_options = [&](auto& options) { auto op = cudnn_frontend::OperationBuilder(backend_type)
options.set_compute_data_type(dtype_to_cudnn_type(compute_dtype)) .setxDesc(build_cudnn_tensor_nchw('x', x))
.set_convolution_mode(fe::ConvolutionMode_t::CROSS_CORRELATION) .setwDesc(build_cudnn_tensor_nchw('w', w))
.set_stride(stride) .setyDesc(build_cudnn_tensor_nchw('y', y))
.set_pre_padding(padding_lo) .setcDesc(conv_desc)
.set_post_padding(padding_hi) .build();
.set_dilation(dilation);
};
std::shared_ptr<fe::graph::Tensor_attributes> y_; std::array<cudnn_frontend::Operation const*, 1> ops = {&op};
if (backend_type == CONV_FORWARD) { return cudnn_frontend::OperationGraphBuilder()
auto options = fe::graph::Conv_fprop_attributes(); .setHandle(encoder.device().cudnn_handle())
set_options(options); .setOperationGraph(ops.size(), ops.data())
y_ = graph.conv_fprop(x_, w_, options); .build();
} else if (backend_type == CONV_BACKWARD_INPUT) { } catch (cudnn_frontend::cudnnException& error) {
auto options = fe::graph::Conv_dgrad_attributes(); if (error.getCudnnStatus() != CUDNN_STATUS_BAD_PARAM) {
set_options(options); throw;
y_ = graph.conv_dgrad(x_, w_, options); }
} else if (backend_type == CONV_BACKWARD_WEIGHT) {
auto options = fe::graph::Conv_wgrad_attributes();
set_options(options);
y_ = graph.conv_wgrad(w_, x_, options);
}
graph.tensor_nchw(y_, 'y', y)->set_output(true);
if (graph.prepare().is_bad()) {
return std::nullopt; return std::nullopt;
} }
graph.deselect_numeric_notes({fe::NumericalNote_t::DOWN_CONVERT_INPUTS});
if (dtype == float32 && !env::enable_tf32()) {
graph.deselect_numeric_notes({fe::NumericalNote_t::TENSOR_CORE});
}
CHECK_CUDNN_FE_ERROR(graph.build());
return graph;
} }
// Transpose from (C_out, H, W, C_in / groups) to (C_in, H, W, C_out / groups). // Transpose from (C_out, H, W, C_in / groups) to (C_in, H, W, C_out / groups).
@@ -184,7 +181,7 @@ array group_transpose(
// eval_gpu, with cost of possible redundant copies. // eval_gpu, with cost of possible redundant copies.
std::tuple<array, array, array> prepare_args( std::tuple<array, array, array> prepare_args(
cu::CommandEncoder& encoder, cu::CommandEncoder& encoder,
ConvBackendType backend_type, cudnnBackendDescriptorType_t backend_type,
array in, array in,
array wt, array wt,
array out, array out,
@@ -224,11 +221,27 @@ std::tuple<array, array, array> prepare_args(
return {std::move(in), std::move(wt), std::move(out)}; return {std::move(in), std::move(wt), std::move(out)};
} }
// Get the x/w/y args from the in/wt/out args depending on backend type.
inline std::tuple<array&, array&, array&> dispatch_args(
cudnnBackendDescriptorType_t backend_type,
array& in,
array& wt,
array& out) {
switch (backend_type) {
case CONV_BACKWARD_INPUT:
return {out, wt, in};
case CONV_BACKWARD_WEIGHT:
return {in, out, wt};
default:
return {in, wt, out};
}
}
// Register inputs and outputs before actually running conv op. Can only be // Register inputs and outputs before actually running conv op. Can only be
// called once per eval_gpu. // called once per eval_gpu.
void register_args( void register_args(
cu::CommandEncoder& encoder, cu::CommandEncoder& encoder,
ConvBackendType backend_type, cudnnBackendDescriptorType_t backend_type,
array& in, array& in,
array& wt, array& wt,
array& intermediate_out, array& intermediate_out,
@@ -264,7 +277,7 @@ void Convolution::eval_gpu(const std::vector<array>& inputs, array& out_) {
array in = inputs[0]; array in = inputs[0];
array wt = inputs[1]; array wt = inputs[1];
array out = out_; array out = out_;
out.set_data(cu::malloc_async(out.nbytes(), encoder)); out.set_data(cu::malloc_async(out.nbytes(), encoder.stream()));
Dtype dtype = out.dtype(); Dtype dtype = out.dtype();
// Search cache. // Search cache.
@@ -284,19 +297,16 @@ void Convolution::eval_gpu(const std::vector<array>& inputs, array& out_) {
get_alignment(wt), get_alignment(wt),
get_alignment(out)}; get_alignment(out)};
if (auto it = conv_cache().find(cache_key); it != conv_cache().end()) { if (auto it = conv_cache().find(cache_key); it != conv_cache().end()) {
auto& [backend_type, graph] = it->second; auto& [backend_type, plan] = it->second;
if (graph) { if (plan) {
// Run cached graph. // Run cached plan.
std::tie(in, wt, out) = std::tie(in, wt, out) =
prepare_args(encoder, backend_type, in, wt, out, groups_, s); prepare_args(encoder, backend_type, in, wt, out, groups_, s);
register_args(encoder, backend_type, in, wt, out, out_); register_args(encoder, backend_type, in, wt, out, out_);
CHECK_CUDNN_FE_ERROR(graph->encode_capturing( auto [x, w, y] = dispatch_args(backend_type, in, wt, out);
encoder, if (!encode_cudnn_plan(encoder, *plan, {'x', 'w', 'y'}, x, w, y)) {
{ throw std::runtime_error("[conv] Cached plan failed to execute.");
{'x', gpu_ptr<void>(in)}, }
{'w', gpu_ptr<void>(wt)},
{'y', gpu_ptr<void>(out)},
}));
} else { } else {
// Run fallback kernel. // Run fallback kernel.
gemm_conv( gemm_conv(
@@ -317,7 +327,7 @@ void Convolution::eval_gpu(const std::vector<array>& inputs, array& out_) {
// There is no reliable way to deduce the proper cuDNN backend for the // There is no reliable way to deduce the proper cuDNN backend for the
// convolution, so we make a best guess and then try. // convolution, so we make a best guess and then try.
SmallVector<ConvBackendType, 2> try_backends; SmallVector<cudnnBackendDescriptorType_t, 2> try_backends;
if (flip_) { if (flip_) {
// When weight is flipped, we assume it is backward input convolution. // When weight is flipped, we assume it is backward input convolution.
try_backends.push_back(CONV_BACKWARD_INPUT); try_backends.push_back(CONV_BACKWARD_INPUT);
@@ -335,12 +345,13 @@ void Convolution::eval_gpu(const std::vector<array>& inputs, array& out_) {
} }
// Try to build op graph. // Try to build op graph.
ConvBackendType backend_type; cudnnBackendDescriptorType_t backend_type;
std::optional<DnnGraph> graph; std::optional<cudnn_frontend::OperationGraph> op_graph;
for (auto try_backend : try_backends) { for (auto try_backend : try_backends) {
auto [x, w, y] = auto [in_copy, wt_copy, out_copy] =
prepare_args(encoder, try_backend, in, wt, out, groups_, s); prepare_args(encoder, try_backend, in, wt, out, groups_, s);
auto [stride, padding_lo, padding_hi, dilation] = get_conv_settings( auto [x, w, y] = dispatch_args(try_backend, in_copy, wt_copy, out_copy);
auto [stride, padding_lo, padding_hi, dilation] = get_conv_op_settings(
try_backend, try_backend,
x, x,
w, w,
@@ -350,7 +361,7 @@ void Convolution::eval_gpu(const std::vector<array>& inputs, array& out_) {
padding_hi_, padding_hi_,
kernel_dilation_, kernel_dilation_,
input_dilation_); input_dilation_);
graph = build_conv_graph( op_graph = build_conv_op_graph(
encoder, encoder,
try_backend, try_backend,
dtype, dtype,
@@ -361,27 +372,30 @@ void Convolution::eval_gpu(const std::vector<array>& inputs, array& out_) {
padding_lo, padding_lo,
padding_hi, padding_hi,
dilation); dilation);
if (graph) { if (op_graph) {
backend_type = try_backend; backend_type = try_backend;
in = std::move(x); in = std::move(in_copy);
wt = std::move(w); wt = std::move(wt_copy);
out = std::move(y); out = std::move(out_copy);
break; break;
} }
} }
if (graph) { if (op_graph) {
register_args(encoder, backend_type, in, wt, out, out_); // Find a plan for the graph and execute it.
CHECK_CUDNN_FE_ERROR(graph->encode_capturing( auto plan = find_cudnn_plan_from_op_graph(
encoder, encoder.device().cudnn_handle(), backend_type, dtype, *op_graph);
{ if (plan) {
{'x', gpu_ptr<void>(in)}, // Setup inputs and outputs.
{'w', gpu_ptr<void>(wt)}, register_args(encoder, backend_type, in, wt, out, out_);
{'y', gpu_ptr<void>(out)},
})); auto [x, w, y] = dispatch_args(backend_type, in, wt, out);
conv_cache().emplace( if (encode_cudnn_plan(encoder, *plan, {'x', 'w', 'y'}, x, w, y)) {
cache_key, std::make_pair(backend_type, std::move(*graph))); conv_cache().emplace(
return; cache_key, std::make_pair(backend_type, std::move(*plan)));
return;
}
}
} }
// Use fallback kernel for settings not supported by cuDNN. // Use fallback kernel for settings not supported by cuDNN.

View File

@@ -86,7 +86,7 @@ array unfold_inputs_nd(
     int mat_N,
     ConvParams<NDIM>& params) {
   array unfolded({mat_M, mat_K}, in.dtype(), nullptr, {});
-  unfolded.set_data(cu::malloc_async(unfolded.nbytes(), encoder));
+  unfolded.set_data(cu::malloc_async(unfolded.nbytes(), encoder.stream()));
   encoder.add_temporary(unfolded);
 
   int filter_size = params.C;

View File

@@ -89,7 +89,7 @@ array grouped_unfold_transpose_inputs_nd(
     int mat_N,
     ConvParams<NDIM>& params) {
   array unfolded({mat_M, mat_K * params.groups}, in.dtype(), nullptr, {});
-  unfolded.set_data(cu::malloc_async(unfolded.nbytes(), encoder));
+  unfolded.set_data(cu::malloc_async(unfolded.nbytes(), encoder.stream()));
   encoder.add_temporary(unfolded);
 
   int filter_size = params.C;

View File

@@ -7,8 +7,9 @@ namespace mlx::core {
 void copy_gpu(const array& in, array& out, CopyType ctype, const Stream& s) {
   auto& encoder = cu::get_command_encoder(s);
-  bool donated = set_copy_output_data(
-      in, out, ctype, [&](auto n) { return cu::malloc_async(n, encoder); });
+  bool donated = set_copy_output_data(in, out, ctype, [&](auto n) {
+    return cu::malloc_async(n, encoder.stream());
+  });
   if (donated && in.dtype() == out.dtype()) {
     // If the output has the same type as the input then there is nothing to
     // copy, just use the buffer.
@@ -103,7 +104,7 @@ void fill_gpu(const array& in, array& out, const Stream& s) {
     return;
   }
   auto& encoder = cu::get_command_encoder(s);
-  out.set_data(cu::malloc_async(out.nbytes(), encoder));
+  out.set_data(cu::malloc_async(out.nbytes(), encoder.stream()));
   encoder.set_input_array(in);
   encoder.set_output_array(out);
   copy_contiguous(encoder, CopyType::Scalar, in, out, 0, 0);
@@ -113,7 +114,7 @@ void reshape_gpu(const array& in, array& out, Stream s) {
   auto [copy_necessary, out_strides] = prepare_reshape(in, out);
   if (copy_necessary) {
     auto& encoder = cu::get_command_encoder(s);
-    out.set_data(cu::malloc_async(out.nbytes(), encoder));
+    out.set_data(cu::malloc_async(out.nbytes(), encoder.stream()));
     copy_gpu_inplace(
         in,
         out,

View File

@@ -95,14 +95,11 @@ void copy_general_input(
   const InType* in_ptr = gpu_ptr<InType>(in) + offset_in;
   OutType* out_ptr = gpu_ptr<OutType>(out) + offset_out;
   int ndim = shape.size();
-  int work_per_thread = 8;
+  int work_per_thread = 1;
   auto dim0 = ndim > 0 ? shape.back() : 1;
   auto rest = out.size() / dim0;
-  if (dim0 >= 4 && dim0 < 8) {
+  if (dim0 >= 4) {
     work_per_thread = 4;
-  } else if (dim0 < 4) {
-    work_per_thread = 1;
   }
   dim0 = (dim0 + work_per_thread - 1) / work_per_thread;
   auto block_dims = get_block_dims(dim0, rest, 1);
@@ -113,10 +110,7 @@ void copy_general_input(
     dispatch_1_2_3(ndim, [&](auto dims_constant) {
       auto kernel =
           cu::copy_g_nd<InType, OutType, IdxT, dims_constant(), 1>;
-      if (work_per_thread == 8) {
-        kernel =
-            cu::copy_g_nd<InType, OutType, IdxT, dims_constant(), 8>;
-      } else if (work_per_thread == 4) {
+      if (work_per_thread == 4) {
         kernel =
             cu::copy_g_nd<InType, OutType, IdxT, dims_constant(), 4>;
       }
@@ -133,9 +127,7 @@ void copy_general_input(
     });
   } else { // ndim >= 4
     auto kernel = cu::copy_g<InType, OutType, IdxT, 1>;
-    if (work_per_thread == 8) {
-      kernel = cu::copy_g<InType, OutType, IdxT, 8>;
-    } else if (work_per_thread == 4) {
+    if (work_per_thread == 4) {
       kernel = cu::copy_g<InType, OutType, IdxT, 4>;
     }
     encoder.add_kernel_node(
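A minimal sketch of the launch-shape selection above, using the updated thresholds (a standalone illustration, not the MLX helper):

    #include <cstdint>

    struct LaunchShape {
      int work_per_thread;
      int64_t dim0_blocks; // innermost dim after dividing work among threads
    };

    LaunchShape pick_work_per_thread(int64_t dim0) {
      int wpt = 1;
      if (dim0 >= 4) {
        wpt = 4; // the previous code additionally used 8 for larger rows
      }
      return {wpt, (dim0 + wpt - 1) / wpt}; // ceiling division
    }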

View File

@@ -5,7 +5,6 @@
 #include <cublasLt.h>
 #include <cuda.h>
 #include <cuda_runtime.h>
-#include <cudnn.h>
 
 namespace mlx::core {
 
@@ -13,12 +12,10 @@ namespace mlx::core {
 void check_cublas_error(const char* name, cublasStatus_t err);
 void check_cuda_error(const char* name, cudaError_t err);
 void check_cuda_error(const char* name, CUresult err);
-void check_cudnn_error(const char* name, cudnnStatus_t err);
 
 // The macro version that prints the command that failed.
 #define CHECK_CUBLAS_ERROR(cmd) check_cublas_error(#cmd, (cmd))
 #define CHECK_CUDA_ERROR(cmd) check_cuda_error(#cmd, (cmd))
-#define CHECK_CUDNN_ERROR(cmd) check_cudnn_error(#cmd, (cmd))
 
 // Base class for RAII managed CUDA resources.
 template <typename Handle, cudaError_t (*Destroy)(Handle)>
@@ -32,10 +29,6 @@ class CudaHandle {
   }
 
   ~CudaHandle() {
-    // Skip if there was an error to avoid throwing in the destructors
-    if (cudaPeekAtLastError() != cudaSuccess) {
-      return;
-    }
     reset();
   }
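A minimal RAII sketch of the destructor guard removed above, to show why it checks the global error state before releasing the handle (illustrative; not the actual CudaHandle definition):

    #include <cuda_runtime.h>

    template <typename Handle, cudaError_t (*Destroy)(Handle)>
    class ScopedHandle {
     public:
      explicit ScopedHandle(Handle h) : handle_(h) {}
      ~ScopedHandle() {
        // Once a CUDA call has failed, the destroy call is likely to fail
        // too; skipping it avoids error handling (and throwing) during
        // stack unwinding.
        if (cudaPeekAtLastError() != cudaSuccess) {
          return;
        }
        Destroy(handle_);
      }

     private:
      Handle handle_;
    };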

View File

@@ -7,26 +7,32 @@ namespace mlx::core {
namespace { namespace {
#define RETURN_IF_ERROR(cmd) \ // Create a cudnn tensor descriptor.
if (auto ret = cmd; ret.is_bad()) { \ template <typename Vec>
return ret; \ inline cudnn_frontend::Tensor build_cudnn_tensor(
} int64_t id,
const array& x,
const Vec& shape,
const Vec& strides) {
return cudnn_frontend::TensorBuilder()
.setDim(shape.size(), shape.data())
.setStrides(strides.size(), strides.data())
.setId(id)
.setAlignment(get_alignment(x))
.setDataType(dtype_to_cudnn_type(x.dtype()))
.build();
}
// In MLX a singleton dim (shape[dim] == 1) can have any stride, but in cuDNN // In MLX a singleton dim (shape[dim] == 1) can have any stride, but in cuDNN
// whether a tensor is contiguous is determined with: // whether a tensor is contiguous is determined with:
// shape[dim] == shape[dim + 1] * strides[dim + 1] // shape[dim] == shape[dim + 1] * strides[dim + 1]
// So a contiguous array with singleton dims in MLX may be mistakenly treated // So a contiguous array with singleton dims in MLX may be mistakenly treated
// as strided in cuDNN, and we work around it by normalizing the strides. // as strided in cuDNN, and we work around it by normalizing the strides.
std::vector<int64_t> normalized_strides(const array& x) { Strides normalized_strides(const array& x) {
std::vector<int64_t> strides(x.strides().begin(), x.strides().end());
if (std::all_of(
strides.begin(), strides.end(), [](int64_t s) { return s == 0; })) {
strides.back() = 1;
return strides;
}
if (!x.flags().row_contiguous || x.ndim() < 2) { if (!x.flags().row_contiguous || x.ndim() < 2) {
return strides; return x.strides();
} }
Strides strides = x.strides();
for (int i = x.ndim() - 2; i >= 0; --i) { for (int i = x.ndim() - 2; i >= 0; --i) {
if (x.shape(i) == 1) { if (x.shape(i) == 1) {
strides[i] = x.shape(i + 1) * strides[i + 1]; strides[i] = x.shape(i + 1) * strides[i + 1];
@@ -36,9 +42,7 @@ std::vector<int64_t> normalized_strides(const array& x) {
} }
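A standalone sketch of the singleton-stride normalization described in the comment above, assuming a row-contiguous tensor (not the MLX implementation itself):

    #include <cstdint>
    #include <vector>

    std::vector<int64_t> normalize_singleton_strides(
        const std::vector<int64_t>& shape, std::vector<int64_t> strides) {
      for (int i = static_cast<int>(shape.size()) - 2; i >= 0; --i) {
        if (shape[i] == 1) {
          // Give size-1 dims the stride a packed layout would have, so cuDNN
          // recognizes the tensor as contiguous.
          strides[i] = shape[i + 1] * strides[i + 1];
        }
      }
      return strides;
    }

For example, shape {1, 4, 5} with strides {0, 5, 1} becomes {20, 5, 1}.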
// Return the shape and strides after transposing from NHWC to NCHW. // Return the shape and strides after transposing from NHWC to NCHW.
inline auto nhwc_to_nchw(const array& x) { auto nhwc_to_nchw(SmallVector<int64_t> shape, SmallVector<int64_t> strides) {
auto shape = convert_vector<int64_t>(x.shape());
auto strides = normalized_strides(x);
assert(shape.size() >= 3); assert(shape.size() >= 3);
shape.insert(shape.begin() + 1, shape.back()); shape.insert(shape.begin() + 1, shape.back());
shape.erase(shape.end() - 1); shape.erase(shape.end() - 1);
@@ -47,95 +51,228 @@ inline auto nhwc_to_nchw(const array& x) {
return std::make_tuple(std::move(shape), std::move(strides)); return std::make_tuple(std::move(shape), std::move(strides));
} }
} // namespace inline auto nhwc_to_nchw(const array& x) {
return nhwc_to_nchw(
convert_vector<int64_t>(x.shape()), normalized_strides(x));
}
fe::error_t DnnGraph::prepare() { // Return available engines for a |op_graph|.
RETURN_IF_ERROR(validate()); cudnn_frontend::EngineConfigList get_cudnn_engine_configs(
try { cudnnBackendDescriptorType_t backend_type,
RETURN_IF_ERROR(build_operation_graph(handle_)); Dtype dtype,
} catch (cudnn_frontend::cudnnException& error) { cudnn_frontend::OperationGraph& op_graph,
// cuDNN bug: they did not catch all exceptions in the API. bool use_fallback = true) {
return {fe::error_code_t::CUDNN_BACKEND_API_FAILED, error.what()}; SmallVector<cudnn_frontend::GeneratorSource, 2> sources;
sources.push_back([](auto& op_graph) {
auto heuristics = cudnn_frontend::EngineHeuristicsBuilder()
.setOperationGraph(op_graph)
.setHeurMode(CUDNN_HEUR_MODE_A)
.build();
return heuristics.getEngineConfig(heuristics.getEngineConfigCount());
});
if (use_fallback) {
sources.push_back([&backend_type](auto& op_graph) {
auto fallback = cudnn_frontend::EngineFallbackListBuilder()
.setOperationGraph(op_graph)
.setOperation(backend_type)
.build();
return fallback.getFallbackList();
});
} }
RETURN_IF_ERROR(create_execution_plans({fe::HeurMode_t::A}));
return {}; auto configs =
cudnn_frontend::EngineConfigGenerator(sources.size(), sources.data())
.generate_engine_config(op_graph);
cudnn_frontend::EngineConfigList filtered_configs;
cudnn_frontend::filter(configs, filtered_configs, [dtype](auto c) {
if (cudnn_frontend::hasNumericalNote<
CUDNN_NUMERICAL_NOTE_DOWN_CONVERT_INPUTS>(c)) {
return true;
}
if (cudnn_frontend::hasNumericalNote<CUDNN_NUMERICAL_NOTE_TENSOR_CORE>(c) &&
dtype == float32 && !env::enable_tf32()) {
return true;
}
return false;
});
return filtered_configs;
} }
fe::error_t DnnGraph::build() { // Take |engine_configs| and |op_graph| and find a working execution plans
RETURN_IF_ERROR(check_support(handle_)); // from them.
RETURN_IF_ERROR(build_plans(handle_)); std::optional<cudnn_frontend::ExecutionPlan>
return {}; find_cudnn_plan_from_engine_configs(
} cudnnHandle_t handle,
const cudnn_frontend::EngineConfigList& engine_configs,
fe::error_t DnnGraph::encode_graph( const cudnn_frontend::OperationGraph& op_graph) {
cu::CommandEncoder& encoder, auto op_graph_tag = op_graph.getTag();
std::unordered_map<int64_t, void*> variant_pack) { for (const auto& config : engine_configs) {
cudnnSetStream(handle_, encoder.stream()); try {
CudaGraph cuda_graph(encoder.device()); return cudnn_frontend::ExecutionPlanBuilder()
RETURN_IF_ERROR(populate_cuda_graph( .setHandle(handle)
handle_, variant_pack, prepare_workspace(encoder), cuda_graph)); .setEngineConfig(config, op_graph_tag)
encoder.add_graph_node(cuda_graph); .build();
return {}; } catch (cudnn_frontend::cudnnException& error) {
} if (error.getCudnnStatus() != CUDNN_STATUS_NOT_SUPPORTED) {
throw;
fe::error_t DnnGraph::encode_capturing( }
cu::CommandEncoder& encoder, }
std::unordered_map<int64_t, void*> variant_pack) {
auto* workspace_ptr = prepare_workspace(encoder);
auto capture = encoder.capture_context();
cudnnSetStream(handle_, encoder.stream());
auto ret = execute(handle_, variant_pack, workspace_ptr);
if (ret.is_bad()) {
capture.discard = true;
} }
return ret; return std::nullopt;
} }
void* DnnGraph::prepare_workspace(cu::CommandEncoder& encoder) { // Prepare workspace and args to execute plan.
int64_t workspace_size = 0; template <typename F>
CHECK_CUDNN_FE_ERROR(get_workspace_size(workspace_size)); bool prepare_cudnn_plan(
cu::CommandEncoder& encoder,
cudnn_frontend::ExecutionPlan& plan,
int num_args,
const int64_t* uids,
void** data_ptrs,
F&& execute) {
int workspace_size = plan.getWorkspaceSize();
void* workspace_ptr = nullptr;
if (workspace_size > 0) { if (workspace_size > 0) {
array workspace( array workspace(
cu::malloc_async(workspace_size, encoder), cu::malloc_async(workspace_size, encoder.stream()),
{static_cast<int>(workspace_size)}, {workspace_size},
uint8); uint8);
encoder.add_temporary(workspace); encoder.add_temporary(workspace);
return gpu_ptr<void>(workspace); workspace_ptr = gpu_ptr<void>(workspace);
} }
return nullptr;
auto args = cudnn_frontend::VariantPackBuilder()
.setWorkspacePointer(workspace_ptr)
.setDataPointers(num_args, data_ptrs)
.setUids(num_args, uids)
.build();
auto handle = encoder.device().cudnn_handle();
cudnnSetStream(handle, encoder.stream());
if (!execute(handle, plan.get_raw_desc(), args.get_raw_desc())) {
return false;
}
return true;
} }
void DnnGraph::set_tensor_attrs( } // namespace
std::shared_ptr<fe::graph::Tensor_attributes>& tensor,
int64_t uid, cudnn_frontend::Tensor build_cudnn_tensor(int64_t id, const array& x) {
const array& x, auto shape = convert_vector<int64_t>(x.shape());
const std::vector<int64_t>& shape, return build_cudnn_tensor(id, x, shape, normalized_strides(x));
const std::vector<int64_t>& strides) {
tensor->set_uid(uid)
.set_alignment(get_alignment(x))
.set_data_type(dtype_to_cudnn_type(x.dtype()))
.set_dim(shape)
.set_stride(strides);
} }
void DnnGraph::set_tensor_attrs( cudnn_frontend::Tensor build_cudnn_tensor_nchw(int64_t id, const array& x) {
std::shared_ptr<fe::graph::Tensor_attributes>& tensor,
int64_t uid,
const array& x) {
set_tensor_attrs(
tensor,
uid,
x,
convert_vector<int64_t>(x.shape()),
normalized_strides(x));
}
void DnnGraph::set_tensor_attrs_nchw(
std::shared_ptr<fe::graph::Tensor_attributes>& tensor,
int64_t uid,
const array& x) {
auto [shape, strides] = nhwc_to_nchw(x); auto [shape, strides] = nhwc_to_nchw(x);
set_tensor_attrs(tensor, uid, x, shape, strides); return build_cudnn_tensor(id, x, shape, strides);
} }
cudnn_frontend::Tensor build_cudnn_tensor_4d_nchw(int64_t id, const array& x) {
if (x.ndim() == 0) {
SmallVector<int64_t, 4> scalar_dims = {1, 1, 1, 1};
return build_cudnn_tensor(id, x, scalar_dims, scalar_dims);
}
if (x.ndim() == 1) {
int64_t s = x.shape(0);
SmallVector<int64_t, 4> shape = {1, x.shape(0), 1, 1};
SmallVector<int64_t, 4> strides = {s, 1, s, s};
return build_cudnn_tensor(id, x, shape, strides);
}
if (x.ndim() == 2) {
int64_t s =
x.flags().row_contiguous ? x.shape(1) * x.strides(1) : x.strides(0);
SmallVector<int64_t, 4> shape = {x.shape(0), x.shape(1), 1, 1};
SmallVector<int64_t, 4> strides = {s, x.strides(1), s, s};
return build_cudnn_tensor(id, x, shape, strides);
}
if (x.ndim() == 3 || x.ndim() == 4) {
return build_cudnn_tensor_nchw(id, x);
}
throw std::runtime_error(
fmt::format("Unsupported array with {} dims.", x.ndim()));
}
cudnn_frontend::Tensor build_cudnn_scalar_4d(int64_t id, Dtype dtype) {
SmallVector<int64_t, 4> scalar_dims = {1, 1, 1, 1};
return cudnn_frontend::TensorBuilder()
.setDim(scalar_dims.size(), scalar_dims.data())
.setStrides(scalar_dims.size(), scalar_dims.data())
.setId(id)
.setAlignment(16)
.setDataType(dtype_to_cudnn_type(dtype))
.setByValue(true)
.build();
}
std::optional<cudnn_frontend::ExecutionPlan> find_cudnn_plan_from_op_graph(
cudnnHandle_t handle,
cudnnBackendDescriptorType_t backend_type,
Dtype dtype,
cudnn_frontend::OperationGraph& op_graph) {
auto engine_configs = get_cudnn_engine_configs(backend_type, dtype, op_graph);
if (engine_configs.empty()) {
return std::nullopt;
}
return find_cudnn_plan_from_engine_configs(handle, engine_configs, op_graph);
}
bool encode_cudnn_plan_with_capturing(
cu::CommandEncoder& encoder,
cudnn_frontend::ExecutionPlan& plan,
int num_args,
const int64_t* uids,
void** data_ptrs) {
return prepare_cudnn_plan(
encoder,
plan,
num_args,
uids,
data_ptrs,
[&](auto handle, auto plan, auto args) {
auto capture = encoder.capture_context();
if (cudnnBackendExecute(handle, plan, args) != CUDNN_STATUS_SUCCESS) {
// Discard the captured graph when failed.
capture.discard = true;
return false;
}
return true;
});
}
#if CUDNN_VERSION >= 90500
bool encode_cudnn_plan_with_graph_api(
cu::CommandEncoder& encoder,
cudnn_frontend::ExecutionPlan& plan,
CudaGraph& graph,
int num_args,
const int64_t* uids,
void** data_ptrs) {
return prepare_cudnn_plan(
encoder,
plan,
num_args,
uids,
data_ptrs,
[&](auto handle, auto plan, auto args) {
if (!graph) {
graph = CudaGraph(encoder.device());
if (cudnnBackendPopulateCudaGraph(handle, plan, args, graph) !=
CUDNN_STATUS_SUCCESS) {
return false;
}
} else {
if (cudnnBackendUpdateCudaGraph(handle, plan, args, graph) !=
CUDNN_STATUS_SUCCESS) {
return false;
}
}
encoder.add_graph_node(graph);
return true;
});
}
#endif
} // namespace mlx::core } // namespace mlx::core

View File

@@ -2,30 +2,25 @@
#pragma once #pragma once
#include "mlx/array.h"
#include "mlx/backend/cuda/allocator.h"
#include "mlx/backend/cuda/device/config.h" #include "mlx/backend/cuda/device/config.h"
#include "mlx/backend/cuda/utils.h" #include "mlx/backend/cuda/utils.h"
#include "mlx/dtype_utils.h" #include "mlx/dtype_utils.h"
#include <cudnn_frontend.h> #include <cudnn_frontend.h>
#include <cudnn_frontend_find_plan.h>
#include <fmt/format.h> #include <fmt/format.h>
#include <algorithm>
#include <array>
namespace mlx::core { namespace mlx::core {
namespace cu { namespace cu {
class CommandEncoder; class CommandEncoder;
} }
namespace fe = cudnn_frontend;
#define CHECK_CUDNN_FE_ERROR(cmd) \
do { \
auto error = cmd; \
if (!error.is_good()) { \
throw std::runtime_error( \
fmt::format("{} failed: {}.", #cmd, error.get_message())); \
} \
} while (0)
// Return pointer alignment of |x|'s data. // Return pointer alignment of |x|'s data.
inline uint8_t get_alignment(const array& x) { inline uint8_t get_alignment(const array& x) {
uint8_t alignment = 1; uint8_t alignment = 1;
@@ -40,31 +35,8 @@ inline uint8_t get_alignment(const array& x) {
// Convert the type of elements in |vec| to |T|. // Convert the type of elements in |vec| to |T|.
template <typename T, typename Vec> template <typename T, typename Vec>
inline std::vector<T> convert_vector(const Vec& vec) { inline SmallVector<T> convert_vector(const Vec& vec) {
return std::vector<T>(vec.begin(), vec.end()); return SmallVector<T>(vec.begin(), vec.end());
}
// Map dtype to cudnn data type.
inline fe::DataType_t dtype_to_cudnn_type(Dtype dtype) {
switch (dtype) {
case int8:
return fe::DataType_t::INT8;
case int32:
return fe::DataType_t::INT32;
case uint8:
return fe::DataType_t::UINT8;
case float16:
return fe::DataType_t::HALF;
case bfloat16:
return fe::DataType_t::BFLOAT16;
case float32:
return fe::DataType_t::FLOAT;
case float64:
return fe::DataType_t::DOUBLE;
default:
throw std::runtime_error(fmt::format(
"Unsupported dtype in cuDNN: {}.", dtype_to_string(dtype)));
}
} }
// Return an array that can be used as map key for |vec| with size <= MAX_NDIM. // Return an array that can be used as map key for |vec| with size <= MAX_NDIM.
@@ -83,89 +55,111 @@ inline std::array<T, NDIM> vector_key(const Vec<T>& vec) {
return result; return result;
} }
// Extends cuDNN graph with helpers. // Helpers used by get_data_ptrs to get pointers.
class DnnGraph : public fe::graph::Graph { inline void* get_data_ptr(const array& arr) {
public: return const_cast<void*>(gpu_ptr<void>(arr));
DnnGraph(cudnnHandle_t handle, Dtype io_dtype, Dtype compute_dtype = float32) }
: handle_(handle) {
set_io_data_type(dtype_to_cudnn_type(io_dtype)); template <typename T, typename = std::enable_if_t<std::is_scalar_v<T>>>
set_intermediate_data_type(dtype_to_cudnn_type(compute_dtype)); inline void* get_data_ptr(T& scalar) {
set_compute_data_type(dtype_to_cudnn_type(compute_dtype)); return &scalar;
}
// Return an array filled with data pointers of args.
template <typename... Args>
inline std::array<void*, sizeof...(Args)> get_data_ptrs(Args&... args) {
return {get_data_ptr(args)...};
}
// Map dtype to cudnn data type.
inline cudnnDataType_t dtype_to_cudnn_type(Dtype dtype) {
switch (dtype) {
case int8:
return CUDNN_DATA_INT8;
case int32:
return CUDNN_DATA_INT32;
case uint8:
return CUDNN_DATA_UINT8;
case float16:
return CUDNN_DATA_HALF;
case bfloat16:
return CUDNN_DATA_BFLOAT16;
case float32:
return CUDNN_DATA_FLOAT;
case float64:
return CUDNN_DATA_DOUBLE;
default:
throw std::runtime_error(fmt::format(
"Unsupported dtype in Convolution: {}.", dtype_to_string(dtype)));
} }
}
// Create a cuDNN tensor description from MLX array |x|.
auto& tensor(
    std::shared_ptr<fe::graph::Tensor_attributes>& attrs,
    int64_t uid,
    const array& x) {
  set_tensor_attrs(attrs, uid, x);
  return attrs;
}
auto tensor(const char* name, int64_t uid, const array& x) {
  auto attrs = Graph::tensor(fe::graph::Tensor_attributes().set_name(name));
  tensor(attrs, uid, x);
  return attrs;
}
// Create a cuDNN tensor description from MLX array |x|, and transpose it from
// NHWC layout to NCHW.
auto& tensor_nchw(
    std::shared_ptr<fe::graph::Tensor_attributes>& attrs,
    int64_t uid,
    const array& x) {
  set_tensor_attrs_nchw(attrs, uid, x);
  return attrs;
}
auto tensor_nchw(const char* name, int64_t uid, const array& x) {
  auto attrs = Graph::tensor(fe::graph::Tensor_attributes().set_name(name));
  tensor_nchw(attrs, uid, x);
  return attrs;
}
// Create a cuDNN tensor for scalar.
auto scalar(const char* name, int64_t uid, Dtype dtype) {
  return Graph::tensor(fe::graph::Tensor_attributes()
                           .set_name(name)
                           .set_uid(uid)
                           .set_dim({1, 1, 1, 1})
                           .set_stride({1, 1, 1, 1})
                           .set_is_pass_by_value(true)
                           .set_data_type(dtype_to_cudnn_type(dtype)));
}
// Call this before setting notes.
fe::error_t prepare();
// Call this after setting notes.
fe::error_t build();
// Add cuDNN graph to CUDA graph, using native CUDA graph API.
fe::error_t encode_graph(
    cu::CommandEncoder& encoder,
    std::unordered_map<int64_t, void*> variant_pack);
// Add cuDNN graph to CUDA graph, using stream capture.
fe::error_t encode_capturing(
    cu::CommandEncoder& encoder,
    std::unordered_map<int64_t, void*> variant_pack);
 private:
  void* prepare_workspace(cu::CommandEncoder& encoder);
  void set_tensor_attrs(
      std::shared_ptr<fe::graph::Tensor_attributes>& tensor,
      int64_t uid,
      const array& x,
      const std::vector<int64_t>& shape,
      const std::vector<int64_t>& strides);
  void set_tensor_attrs(
      std::shared_ptr<fe::graph::Tensor_attributes>& tensor,
      int64_t uid,
      const array& x);
  void set_tensor_attrs_nchw(
      std::shared_ptr<fe::graph::Tensor_attributes>& tensor,
      int64_t uid,
      const array& x);
  cudnnHandle_t handle_;
};
// Create a tensor descriptor from |x|.
cudnn_frontend::Tensor build_cudnn_tensor(int64_t id, const array& x);
// Create a tensor descriptor from |x|, and transpose from NHWC to NCHW.
cudnn_frontend::Tensor build_cudnn_tensor_nchw(int64_t id, const array& x);
// Create a tensor descriptor from |x|, make sure it is 4D, and transpose it
// from NHWC to NCHW.
cudnn_frontend::Tensor build_cudnn_tensor_4d_nchw(int64_t id, const array& x);
// Create a 4D scalar tensor descriptor, which is passed by value.
cudnn_frontend::Tensor build_cudnn_scalar_4d(int64_t id, Dtype dtype);
// Find a working plan for |op_graph|.
std::optional<cudnn_frontend::ExecutionPlan> find_cudnn_plan_from_op_graph(
    cudnnHandle_t handle,
    cudnnBackendDescriptorType_t backend_type,
    Dtype dtype,
    cudnn_frontend::OperationGraph& op_graph);
// Encode the plan to command buffer by capturing.
bool encode_cudnn_plan_with_capturing(
    cu::CommandEncoder& encoder,
    cudnn_frontend::ExecutionPlan& plan,
    int num_args,
    const int64_t* uids,
    void** data_ptrs);
#if CUDNN_VERSION >= 90500
// Encode the plan to command buffer by using native graph api of cudnn. If the
// |graph| is empty it will be populated, otherwise it will be updated.
bool encode_cudnn_plan_with_graph_api(
    cu::CommandEncoder& encoder,
    cudnn_frontend::ExecutionPlan& plan,
    CudaGraph& graph,
    int num_args,
    const int64_t* uids,
    void** data_ptrs);
#endif
// Helpers to make calls like encode_cudnn_plan(..., {'x', 'y', 'z'}, x, y, z).
template <typename... Args>
bool encode_cudnn_plan(
cu::CommandEncoder& encoder,
cudnn_frontend::ExecutionPlan& plan,
std::initializer_list<int64_t> uids,
Args&... args) {
assert(uids.size() == sizeof...(args));
auto data_ptrs = get_data_ptrs(args...);
return encode_cudnn_plan_with_capturing(
encoder, plan, uids.size(), uids.begin(), data_ptrs.data());
}
#if CUDNN_VERSION >= 90500
template <typename... Args>
bool encode_cudnn_plan(
cu::CommandEncoder& encoder,
cudnn_frontend::ExecutionPlan& plan,
CudaGraph& graph,
std::initializer_list<int64_t> uids,
Args&... args) {
assert(uids.size() == sizeof...(args));
auto data_ptrs = get_data_ptrs(args...);
return encode_cudnn_plan_with_graph_api(
encoder, plan, graph, uids.size(), uids.begin(), data_ptrs.data());
}
#endif
} // namespace mlx::core } // namespace mlx::core
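For reference, a minimal standalone sketch of the uid-to-pointer packing that the encode_cudnn_plan helpers above rely on; the names below (FakeArray, pack_ptrs, launch) are illustrative stand-ins, not MLX or cuDNN API.

#include <array>
#include <cassert>
#include <cstdint>
#include <cstdio>
#include <initializer_list>
#include <vector>

struct FakeArray {
  std::vector<float> data;
  void* ptr() { return data.data(); }
};

// Expand a parameter pack into an array of raw data pointers.
template <typename... Args>
std::array<void*, sizeof...(Args)> pack_ptrs(Args&... args) {
  return {args.ptr()...};
}

// Pair each uid with the data pointer of the matching argument, in order.
template <typename... Args>
void launch(std::initializer_list<int64_t> uids, Args&... args) {
  assert(uids.size() == sizeof...(args));
  auto ptrs = pack_ptrs(args...);
  const int64_t* it = uids.begin();
  for (size_t i = 0; i < ptrs.size(); ++i) {
    // A real implementation would hand uids and ptrs to cuDNN's variant pack.
    std::printf("uid %lld -> %p\n", (long long)it[i], ptrs[i]);
  }
}

int main() {
  FakeArray x{{1, 2, 3}}, w{{4, 5}}, y{{0, 0, 0}};
  launch({'x', 'w', 'y'}, x, w, y);
}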

View File

@@ -289,7 +289,7 @@ void CustomKernel::eval_gpu(
copies.emplace_back(init_value_.value(), out.dtype()); copies.emplace_back(init_value_.value(), out.dtype());
fill_gpu(copies.back(), out, s); fill_gpu(copies.back(), out, s);
} else { } else {
out.set_data(cu::malloc_async(out.nbytes(), encoder)); out.set_data(cu::malloc_async(out.nbytes(), encoder.stream()));
} }
} }

View File

@@ -14,20 +14,20 @@ namespace mlx::core::cu {
namespace { namespace {
bool use_cuda_graphs() {
  static bool use_graphs = env::get_var("MLX_USE_CUDA_GRAPHS", true);
  return use_graphs;
}

const char* save_cuda_graphs_dot_file() {
  static const char* filename = []() -> const char* {
    const char* env = std::getenv("MLX_SAVE_CUDA_GRAPHS_DOT_FILE");
    if (env && std::strlen(env) == 0) {
      return nullptr;
    }
    return env;
  }();
  return filename;
}

#define CHECK_CUDNN_ERROR(cmd) check_cudnn_error(#cmd, (cmd))

void check_cudnn_error(const char* name, cudnnStatus_t err) {
  if (err != CUDNN_STATUS_SUCCESS) {
    throw std::runtime_error(
        fmt::format("{} failed: {}.", name, cudnnGetErrorString(err)));
  }
}

bool use_cuda_graphs() {
  static bool use_graphs = []() {
    return env::get_var("MLX_USE_CUDA_GRAPHS", true);
  }();
  return use_graphs;
}
} // namespace } // namespace
@@ -46,7 +46,6 @@ Device::Device(int device) : device_(device) {
"Device {} does not support synchronization in managed memory.", "Device {} does not support synchronization in managed memory.",
device_)); device_));
} }
// The cublasLt handle is used by matmul. // The cublasLt handle is used by matmul.
make_current(); make_current();
CHECK_CUBLAS_ERROR(cublasLtCreate(&lt_)); CHECK_CUBLAS_ERROR(cublasLtCreate(&lt_));
@@ -87,7 +86,7 @@ CommandEncoder::CaptureContext::CaptureContext(CommandEncoder& enc) : enc(enc) {
return; return;
} }
CHECK_CUDA_ERROR( CHECK_CUDA_ERROR(
cudaStreamBeginCapture(enc.stream(), cudaStreamCaptureModeThreadLocal)); cudaStreamBeginCapture(enc.stream(), cudaStreamCaptureModeGlobal));
} }
CommandEncoder::CaptureContext::~CaptureContext() { CommandEncoder::CaptureContext::~CaptureContext() {
@@ -115,17 +114,18 @@ CommandEncoder::ConcurrentContext::~ConcurrentContext() {
} }
// Use an empty graph node for synchronization // Use an empty graph node for synchronization
CommandEncoder::GraphNode empty{NULL, "E", std::to_string(enc.node_count_++)}; CommandEncoder::GraphNode empty{NULL, 'E', std::to_string(enc.node_count_++)};
enc.empty_node_count_++;
CHECK_CUDA_ERROR(cudaGraphAddEmptyNode(&empty.node, enc.graph_, NULL, 0)); CHECK_CUDA_ERROR(cudaGraphAddEmptyNode(&empty.node, enc.graph_, NULL, 0));
// Insert the concurrent -> empty node dependencies // Insert the concurrent -> empty node dependencies
for (auto& from : enc.concurrent_nodes_) { for (auto& from : enc.concurrent_nodes_) {
enc.from_nodes_.push_back(from.node); enc.from_nodes_.push_back(from.node);
enc.to_nodes_.push_back(empty.node); enc.to_nodes_.push_back(empty.node);
enc.graph_deps_key_ += from.id; enc.graph_key_ += from.id;
enc.graph_deps_key_ += "-"; enc.graph_key_ += from.node_type;
enc.graph_deps_key_ += empty.id; enc.graph_key_ += empty.id;
enc.graph_deps_key_ += "-"; enc.graph_key_ += empty.node_type;
} }
// Insert the input -> concurrent node dependencies without updating output // Insert the input -> concurrent node dependencies without updating output
@@ -140,6 +140,9 @@ CommandEncoder::ConcurrentContext::~ConcurrentContext() {
} }
void CommandEncoder::insert_graph_dependencies(GraphNode node) { void CommandEncoder::insert_graph_dependencies(GraphNode node) {
if (node.node_type == 'G') {
graph_node_count_++;
}
node.id = std::to_string(node_count_++); node.id = std::to_string(node_count_++);
if (in_concurrent_) { if (in_concurrent_) {
concurrent_nodes_.push_back(std::move(node)); concurrent_nodes_.push_back(std::move(node));
@@ -151,10 +154,6 @@ void CommandEncoder::insert_graph_dependencies(GraphNode node) {
} }
void CommandEncoder::insert_graph_dependencies(std::vector<GraphNode> nodes) { void CommandEncoder::insert_graph_dependencies(std::vector<GraphNode> nodes) {
for (auto& node : nodes) {
graph_nodes_key_ += node.node_type;
graph_nodes_key_ += "-";
}
std::vector<GraphNode> deps; std::vector<GraphNode> deps;
{ {
// Dependencies must be added in the same order to produce a consistent // Dependencies must be added in the same order to produce a consistent
@@ -182,49 +181,20 @@ void CommandEncoder::insert_graph_dependencies(std::vector<GraphNode> nodes) {
for (auto& to : nodes) { for (auto& to : nodes) {
from_nodes_.push_back(from.node); from_nodes_.push_back(from.node);
to_nodes_.push_back(to.node); to_nodes_.push_back(to.node);
graph_deps_key_ += from.id; graph_key_ += from.id;
graph_deps_key_ += "-"; graph_key_ += from.node_type;
graph_deps_key_ += to.id; graph_key_ += to.id;
graph_deps_key_ += "-"; graph_key_ += to.node_type;
} }
} }
} }
// Can be tuned with MLX_MAX_OPS_PER_BUFFER, MLX_MAX_MB_PER_BUFFER
std::pair<int, int> get_graph_limits(Device& d) {
auto cc =
d.compute_capability_major() * 100 + d.compute_capability_minor() * 10;
int ops = 20;
int mb = 100;
switch (cc) {
case 800: // A100
ops = 20;
mb = 400;
break;
case 900: // H100
ops = 30;
mb = 400;
break;
case 1000: // B200
ops = 50;
mb = 500;
break;
case 1210: // DGX Spark
ops = 20;
mb = 25;
break;
}
return {env::max_ops_per_buffer(ops), env::max_mb_per_buffer(mb)};
}
CommandEncoder::CommandEncoder(Device& d) CommandEncoder::CommandEncoder(Device& d)
: device_(d), : device_(d),
stream_(d), stream_(d),
graph_(d), graph_(d),
worker_(d), worker_(d),
graph_cache_("MLX_CUDA_GRAPH_CACHE_SIZE", /* default_capacity */ 400) { graph_cache_("MLX_CUDA_GRAPH_CACHE_SIZE", /* default_capacity */ 400) {}
std::tie(max_ops_per_graph_, max_mb_per_graph_) = get_graph_limits(d);
}
void CommandEncoder::add_completed_handler(std::function<void()> task) { void CommandEncoder::add_completed_handler(std::function<void()> task) {
worker_.add_task(std::move(task)); worker_.add_task(std::move(task));
@@ -234,7 +204,6 @@ void CommandEncoder::set_input_array(const array& arr) {
if (!use_cuda_graphs()) { if (!use_cuda_graphs()) {
return; return;
} }
bytes_in_graph_ += arr.data_size();
auto id = reinterpret_cast<std::uintptr_t>(arr.buffer().ptr()); auto id = reinterpret_cast<std::uintptr_t>(arr.buffer().ptr());
active_deps_.push_back(id); active_deps_.push_back(id);
} }
@@ -309,76 +278,13 @@ void CommandEncoder::add_kernel_node(
void CommandEncoder::add_kernel_node(const cudaKernelNodeParams& params) { void CommandEncoder::add_kernel_node(const cudaKernelNodeParams& params) {
cudaGraphNode_t node; cudaGraphNode_t node;
CHECK_CUDA_ERROR(cudaGraphAddKernelNode(&node, graph_, NULL, 0, &params)); CHECK_CUDA_ERROR(cudaGraphAddKernelNode(&node, graph_, NULL, 0, &params));
insert_graph_dependencies(GraphNode{node, "K"}); insert_graph_dependencies(GraphNode{node, 'K'});
} }
void CommandEncoder::add_kernel_node(const CUDA_KERNEL_NODE_PARAMS& params) { void CommandEncoder::add_kernel_node(const CUDA_KERNEL_NODE_PARAMS& params) {
CUgraphNode node; CUgraphNode node;
CHECK_CUDA_ERROR(cuGraphAddKernelNode(&node, graph_, NULL, 0, &params)); CHECK_CUDA_ERROR(cuGraphAddKernelNode(&node, graph_, NULL, 0, &params));
insert_graph_dependencies(GraphNode{node, "K"}); insert_graph_dependencies(GraphNode{node, 'K'});
}
std::pair<std::string, bool> subgraph_to_key(cudaGraph_t graph) {
// Constructs a key representing the nodes of a sub-graph.
// Also checks whether the sub-graph is updatable: CUDA graphs are not updated
// correctly when a kernel node is updated with a node that has a different
// cluster shape.
std::string key = "(";
size_t num_nodes = 0;
CHECK_CUDA_ERROR(cudaGraphGetNodes(graph, nullptr, &num_nodes));
if (num_nodes == 0) {
return {key + ")", true};
}
bool is_updatable = true;
std::vector<cudaGraphNode_t> nodes(num_nodes);
CHECK_CUDA_ERROR(cudaGraphGetNodes(graph, nodes.data(), &num_nodes));
for (const auto& node : nodes) {
if (!is_updatable) {
break;
}
cudaGraphNodeType type;
CHECK_CUDA_ERROR(cudaGraphNodeGetType(node, &type));
switch (type) {
case cudaGraphNodeTypeGraph: {
// Try to be updatable for a structure like graph -> graph -> kernel
cudaGraph_t child;
CHECK_CUDA_ERROR(cudaGraphChildGraphNodeGetGraph(node, &child));
auto [subkey, sub_is_updatable] = subgraph_to_key(child);
is_updatable &= sub_is_updatable;
key += subkey;
break;
}
case cudaGraphNodeTypeHost:
key += "H";
break;
case cudaGraphNodeTypeMemset:
key += "M";
break;
case cudaGraphNodeTypeKernel: {
cudaLaunchAttributeValue cluster_dim;
CHECK_CUDA_ERROR(cudaGraphKernelNodeGetAttribute(
node, cudaLaunchAttributeClusterDimension, &cluster_dim));
// Only allow dim.x to be greater than 1
if (cluster_dim.clusterDim.y > 1 || cluster_dim.clusterDim.z > 1) {
is_updatable = false;
} else {
key += "K";
key += std::to_string(cluster_dim.clusterDim.x);
}
break;
}
case cudaGraphNodeTypeWaitEvent:
key += "W";
break;
case cudaGraphNodeTypeEventRecord:
key += "R";
break;
default:
is_updatable = false;
}
}
key += ")";
return {key, is_updatable};
} }
void CommandEncoder::add_graph_node(cudaGraph_t child) { void CommandEncoder::add_graph_node(cudaGraph_t child) {
@@ -391,15 +297,12 @@ void CommandEncoder::add_graph_node(cudaGraph_t child) {
return; return;
} }
cudaGraphNode_t node; cudaGraphNode_t node;
auto [sub_graph_key, is_updatable] = subgraph_to_key(child);
is_graph_updatable_ &= is_updatable;
CHECK_CUDA_ERROR(cudaGraphAddChildGraphNode(&node, graph_, NULL, 0, child)); CHECK_CUDA_ERROR(cudaGraphAddChildGraphNode(&node, graph_, NULL, 0, child));
insert_graph_dependencies(GraphNode{node, sub_graph_key}); insert_graph_dependencies(GraphNode{node, 'G'});
} }
bool CommandEncoder::needs_commit() { int CommandEncoder::get_num_ops() {
return (node_count_ > max_ops_per_graph_) || return node_count_;
((bytes_in_graph_ >> 20) > max_mb_per_graph_);
} }
void CommandEncoder::commit() { void CommandEncoder::commit() {
@@ -419,59 +322,49 @@ void CommandEncoder::commit() {
from_nodes_.size())); from_nodes_.size()));
} }
device_.make_current(); graph_key_ += ".";
graph_key_ += std::to_string(node_count_);
graph_key_ += ".";
graph_key_ += std::to_string(graph_node_count_);
graph_key_ += ".";
graph_key_ += std::to_string(empty_node_count_);
if (!is_graph_updatable_) { CudaGraphExec& graph_exec = graph_cache_[graph_key_];
CudaGraphExec graph_exec;
graph_exec.instantiate(graph_);
CHECK_CUDA_ERROR(cudaGraphLaunch(graph_exec, stream_));
} else {
auto graph_key = graph_nodes_key_ + ":" + graph_deps_key_;
auto& graph_exec = graph_cache_[graph_key];
if (graph_exec != nullptr) { if (graph_exec != nullptr) {
cudaGraphExecUpdateResult update_result; cudaGraphExecUpdateResult update_result;
#if CUDART_VERSION >= 12000 #if CUDART_VERSION >= 12000
cudaGraphExecUpdateResultInfo info; cudaGraphExecUpdateResultInfo info;
cudaGraphExecUpdate(graph_exec, graph_, &info); cudaGraphExecUpdate(graph_exec, graph_, &info);
update_result = info.result; update_result = info.result;
#else #else
cudaGraphNode_t error_node; cudaGraphNode_t error_node;
cudaGraphExecUpdate(graph_exec, graph_, &error_node, &update_result); cudaGraphExecUpdate(graph_exec, graph_, &error_node, &update_result);
#endif // CUDART_VERSION >= 12000 #endif // CUDART_VERSION >= 12000
if (update_result != cudaGraphExecUpdateSuccess) { if (update_result != cudaGraphExecUpdateSuccess) {
cudaGetLastError(); // reset error cudaGetLastError(); // reset error
graph_exec.reset(); graph_exec.reset();
}
} }
if (graph_exec == nullptr) {
graph_exec.instantiate(graph_);
}
CHECK_CUDA_ERROR(cudaGraphLaunch(graph_exec, stream_));
} }
if (graph_exec == nullptr) {
// Save cuda graph to dot file graph_exec.instantiate(graph_);
if (const char* filename = save_cuda_graphs_dot_file(); filename) {
static int count = 0;
auto path = fmt::format("{}_{}.dot", filename, ++count);
CHECK_CUDA_ERROR(cudaGraphDebugDotPrint(graph_, path.c_str(), 0));
} }
device_.make_current();
CHECK_CUDA_ERROR(cudaGraphLaunch(graph_exec, stream_));
// Reset state // Reset state
graph_node_count_ = 0;
empty_node_count_ = 0;
from_nodes_.clear(); from_nodes_.clear();
to_nodes_.clear(); to_nodes_.clear();
graph_deps_key_.clear(); graph_key_.clear();
graph_nodes_key_.clear();
node_map_.clear(); node_map_.clear();
graph_ = CudaGraph(device_); graph_ = CudaGraph(device_);
is_graph_updatable_ = true;
} }
// Put completion handlers in a batch. // Put completion handlers in a batch.
worker_.commit(stream_); worker_.commit(stream_);
node_count_ = 0; node_count_ = 0;
bytes_in_graph_ = 0;
} }
void CommandEncoder::synchronize() { void CommandEncoder::synchronize() {
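A hedged sketch (not the exact MLX code) of the update-or-reinstantiate pattern that the commit() path above follows when reusing a cached cudaGraphExec_t; only standard CUDA runtime calls are used and error checking is elided.

#include <cuda_runtime.h>

// Try to update a cached executable graph in place; fall back to a fresh
// instantiation only when the update is rejected (e.g. the topology changed).
void launch_cached(cudaGraph_t graph, cudaGraphExec_t& exec, cudaStream_t stream) {
#if CUDART_VERSION >= 12000
  if (exec != nullptr) {
    cudaGraphExecUpdateResultInfo info;
    cudaGraphExecUpdate(exec, graph, &info);
    if (info.result != cudaGraphExecUpdateSuccess) {
      cudaGetLastError(); // clear the sticky error left by the failed update
      cudaGraphExecDestroy(exec);
      exec = nullptr;
    }
  }
#endif
  if (exec == nullptr) {
    cudaGraphInstantiateWithFlags(&exec, graph, 0);
  }
  cudaGraphLaunch(exec, stream);
}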

View File

@@ -84,7 +84,7 @@ class CommandEncoder {
} }
void add_completed_handler(std::function<void()> task); void add_completed_handler(std::function<void()> task);
bool needs_commit(); int get_num_ops();
void commit(); void commit();
Device& device() { Device& device() {
@@ -106,9 +106,8 @@ class CommandEncoder {
cudaGraphNode_t node; cudaGraphNode_t node;
// K = kernel // K = kernel
// E = empty // E = empty
// () = subgraph (with metadata) // G = subgraph
// Symbols ':', '-' are reserved as separators char node_type;
std::string node_type;
std::string id; std::string id;
}; };
@@ -120,21 +119,18 @@ class CommandEncoder {
CudaGraph graph_; CudaGraph graph_;
Worker worker_; Worker worker_;
char node_count_{0}; char node_count_{0};
char graph_node_count_{0};
char empty_node_count_{0};
bool in_concurrent_{false}; bool in_concurrent_{false};
std::vector<cudaGraphNode_t> from_nodes_; std::vector<cudaGraphNode_t> from_nodes_;
std::vector<cudaGraphNode_t> to_nodes_; std::vector<cudaGraphNode_t> to_nodes_;
std::string graph_nodes_key_; std::string graph_key_;
std::string graph_deps_key_;
std::vector<GraphNode> concurrent_nodes_; std::vector<GraphNode> concurrent_nodes_;
std::vector<std::shared_ptr<array::Data>> temporaries_; std::vector<std::shared_ptr<array::Data>> temporaries_;
LRUCache<std::string, CudaGraphExec> graph_cache_; LRUCache<std::string, CudaGraphExec> graph_cache_;
std::vector<std::uintptr_t> active_deps_; std::vector<std::uintptr_t> active_deps_;
std::vector<std::uintptr_t> active_outputs_; std::vector<std::uintptr_t> active_outputs_;
std::unordered_map<std::uintptr_t, GraphNode> node_map_; std::unordered_map<std::uintptr_t, GraphNode> node_map_;
size_t bytes_in_graph_{0};
bool is_graph_updatable_{true};
int max_ops_per_graph_;
int max_mb_per_graph_;
}; };
class Device { class Device {
@@ -170,7 +166,6 @@ class Device {
int device_; int device_;
int compute_capability_major_; int compute_capability_major_;
int compute_capability_minor_; int compute_capability_minor_;
std::string device_name_;
cublasLtHandle_t lt_; cublasLtHandle_t lt_;
cudnnHandle_t cudnn_; cudnnHandle_t cudnn_;
std::unordered_map<int, CommandEncoder> encoders_; std::unordered_map<int, CommandEncoder> encoders_;

View File

@@ -26,7 +26,7 @@ void AllReduce::eval_gpu(
out.copy_shared_buffer(in); out.copy_shared_buffer(in);
return {in, out}; return {in, out};
} else { } else {
out.set_data(cu::malloc_async(out.nbytes(), encoder)); out.set_data(cu::malloc_async(out.nbytes(), encoder.stream()));
return {in, out}; return {in, out};
} }
}; };
@@ -74,7 +74,7 @@ void AllGather::eval_gpu(
}; };
auto input = ensure_contiguous(inputs[0]); auto input = ensure_contiguous(inputs[0]);
outputs[0].set_data(cu::malloc_async(outputs[0].nbytes(), encoder)); outputs[0].set_data(cu::malloc_async(outputs[0].nbytes(), encoder.stream()));
encoder.set_input_array(input); encoder.set_input_array(input);
encoder.set_output_array(outputs[0]); encoder.set_output_array(outputs[0]);
@@ -103,7 +103,7 @@ void ReduceScatter::eval_gpu(
}; };
auto input = ensure_contiguous(inputs[0]); auto input = ensure_contiguous(inputs[0]);
outputs[0].set_data(cu::malloc_async(outputs[0].nbytes(), encoder)); outputs[0].set_data(cu::malloc_async(outputs[0].nbytes(), encoder.stream()));
encoder.set_input_array(input); encoder.set_input_array(input);
encoder.set_output_array(outputs[0]); encoder.set_output_array(outputs[0]);

View File

@@ -11,6 +11,9 @@
namespace mlx::core::gpu { namespace mlx::core::gpu {
// Can be tuned with MLX_MAX_OPS_PER_BUFFER
constexpr int default_max_nodes_per_graph = 20;
bool is_available() { bool is_available() {
return true; return true;
} }
@@ -50,7 +53,8 @@ void eval(array& arr) {
encoder.add_temporary(s); encoder.add_temporary(s);
} }
if (encoder.needs_commit()) { if (encoder.get_num_ops() >=
env::max_ops_per_buffer(default_max_nodes_per_graph)) {
scheduler::notify_new_task(stream); scheduler::notify_new_task(stream);
encoder.add_completed_handler( encoder.add_completed_handler(
[stream]() { scheduler::notify_task_completion(stream); }); [stream]() { scheduler::notify_task_completion(stream); });
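As a rough illustration of the threshold check above, a sketch of reading the MLX_MAX_OPS_PER_BUFFER override with a compiled-in default; the helper name is hypothetical and MLX's env:: utilities may differ.

#include <cstdlib>

// Hypothetical helper: prefer the MLX_MAX_OPS_PER_BUFFER environment variable,
// otherwise fall back to the per-backend default (20 nodes per graph here).
static int max_ops_per_buffer(int default_value) {
  const char* env = std::getenv("MLX_MAX_OPS_PER_BUFFER");
  return env ? std::atoi(env) : default_value;
}

// Usage in an eval loop (illustrative):
//   if (encoder.get_num_ops() >= max_ops_per_buffer(20)) { /* commit */ }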

View File

@@ -305,7 +305,6 @@ void Event::wait() {
} else { } else {
event->atomic->wait(value()); event->atomic->wait(value());
} }
CHECK_CUDA_ERROR(cudaPeekAtLastError());
} }
void Event::wait(Stream s) { void Event::wait(Stream s) {

View File

@@ -370,7 +370,7 @@ void CublasGemm::execute(
// Ensure workspace is 256-byte aligned // Ensure workspace is 256-byte aligned
int nbytes = cuda::ceil_div(heuristic_.workspaceSize, 256) * 256; int nbytes = cuda::ceil_div(heuristic_.workspaceSize, 256) * 256;
array workspace( array workspace(
cu::malloc_async(nbytes, encoder), cu::malloc_async(nbytes, encoder.stream()),
{static_cast<int>(heuristic_.workspaceSize)}, {static_cast<int>(heuristic_.workspaceSize)},
int8); int8);
encoder.add_temporary(workspace); encoder.add_temporary(workspace);

View File

@@ -163,7 +163,7 @@ void CublasGemm::run_batched(
// Launch kernel to set device offsets // Launch kernel to set device offsets
auto pointers = array( auto pointers = array(
cu::malloc_async(batch_count * sizeof(void*) * 3, encoder), cu::malloc_async(batch_count * sizeof(void*) * 3, encoder.stream()),
{batch_count * 3}, {batch_count * 3},
uint64); uint64);
@@ -251,7 +251,7 @@ void CublasGemm::run_batched(
// Launch kernel to set device offsets // Launch kernel to set device offsets
auto pointers = array( auto pointers = array(
cu::malloc_async(batch_count * sizeof(uint64_t) * 4, encoder), cu::malloc_async(batch_count * sizeof(uint64_t) * 4, encoder.stream()),
{batch_count * 4}, {batch_count * 4},
uint64); uint64);

View File

@@ -61,7 +61,7 @@ void Gather::eval_gpu(const std::vector<array>& inputs, array& out) {
auto& s = stream(); auto& s = stream();
auto& encoder = cu::get_command_encoder(s); auto& encoder = cu::get_command_encoder(s);
out.set_data(cu::malloc_async(out.nbytes(), encoder)); out.set_data(cu::malloc_async(out.nbytes(), encoder.stream()));
if (out.size() == 0) { if (out.size() == 0) {
return; return;
} }
@@ -241,7 +241,7 @@ void GatherAxis::eval_gpu(const std::vector<array>& inputs, array& out) {
auto& s = stream(); auto& s = stream();
auto& encoder = cu::get_command_encoder(s); auto& encoder = cu::get_command_encoder(s);
out.set_data(cu::malloc_async(out.nbytes(), encoder)); out.set_data(cu::malloc_async(out.nbytes(), encoder.stream()));
if (out.size() == 0) { if (out.size() == 0) {
return; return;
} }

View File

@@ -279,14 +279,11 @@ void compile(
// Compile program. // Compile program.
std::vector<const char*> args; std::vector<const char*> args;
bool use_sass = compiler_supports_device_sass(device); bool use_sass = compiler_supports_device_sass(device);
auto cc = device.compute_capability_major();
std::string arch_tag = (cc == 90 || cc == 100 || cc == 121) ? "a" : "";
std::string compute = fmt::format( std::string compute = fmt::format(
"--gpu-architecture={}_{}{}{}", "--gpu-architecture={}_{}{}",
use_sass ? "sm" : "compute", use_sass ? "sm" : "compute",
cc, device.compute_capability_major(),
device.compute_capability_minor(), device.compute_capability_minor());
arch_tag);
args.push_back(compute.c_str()); args.push_back(compute.c_str());
std::string cccl_include = cccl_dir(); std::string cccl_include = cccl_dir();
if (!cccl_include.empty()) { if (!cccl_include.empty()) {

View File

@@ -244,7 +244,7 @@ void LayerNorm::eval_gpu(
out.copy_shared_buffer(x); out.copy_shared_buffer(x);
} else { } else {
out.set_data( out.set_data(
cu::malloc_async(x.data_size() * x.itemsize(), encoder), cu::malloc_async(x.data_size() * x.itemsize(), encoder.stream()),
x.data_size(), x.data_size(),
x.strides(), x.strides(),
x.flags()); x.flags());
@@ -335,7 +335,7 @@ void LayerNormVJP::eval_gpu(
gx.copy_shared_buffer(g); gx.copy_shared_buffer(g);
g_in_gx = true; g_in_gx = true;
} else { } else {
gx.set_data(cu::malloc_async(gx.nbytes(), encoder)); gx.set_data(cu::malloc_async(gx.nbytes(), encoder.stream()));
} }
if (g_copied && !g_in_gx) { if (g_copied && !g_in_gx) {
encoder.add_temporary(g); encoder.add_temporary(g);
@@ -355,7 +355,7 @@ void LayerNormVJP::eval_gpu(
g_in_gw = true; g_in_gw = true;
gw_temp.copy_shared_buffer(g); gw_temp.copy_shared_buffer(g);
} else { } else {
gw_temp.set_data(cu::malloc_async(gw_temp.nbytes(), encoder)); gw_temp.set_data(cu::malloc_async(gw_temp.nbytes(), encoder.stream()));
encoder.add_temporary(gw_temp); encoder.add_temporary(gw_temp);
} }
} }

View File

@@ -32,7 +32,7 @@ void Load::eval_gpu(const std::vector<array>& inputs, array& out) {
auto& encoder = cu::get_command_encoder(stream()); auto& encoder = cu::get_command_encoder(stream());
auto size = out.size(); auto size = out.size();
auto nbytes = size * out.itemsize(); auto nbytes = size * out.itemsize();
out.set_data(cu::malloc_async(nbytes, encoder)); out.set_data(cu::malloc_async(nbytes, encoder.stream()));
auto out_ptr = malloc(nbytes); auto out_ptr = malloc(nbytes);
reader_->read(static_cast<char*>(out_ptr), nbytes, offset_); reader_->read(static_cast<char*>(out_ptr), nbytes, offset_);
if (swap_endianness_) { if (swap_endianness_) {

View File

@@ -115,7 +115,7 @@ void LogSumExp::eval_gpu(const std::vector<array>& inputs, array& out) {
auto in = ensure_contiguous(inputs[0]); auto in = ensure_contiguous(inputs[0]);
if (in.flags().row_contiguous) { if (in.flags().row_contiguous) {
out.set_data(cu::malloc_async(out.nbytes(), encoder)); out.set_data(cu::malloc_async(out.nbytes(), encoder.stream()));
} else { } else {
auto n = in.shape(-1); auto n = in.shape(-1);
auto flags = in.flags(); auto flags = in.flags();
@@ -130,7 +130,7 @@ void LogSumExp::eval_gpu(const std::vector<array>& inputs, array& out) {
} }
flags.col_contiguous = col_contig; flags.col_contiguous = col_contig;
out.set_data( out.set_data(
cu::malloc_async(in.nbytes() / n, encoder), cu::malloc_async(in.nbytes() / n, encoder.stream()),
in.data_size() / n, in.data_size() / n,
std::move(strides), std::move(strides),
flags); flags);

View File

@@ -121,7 +121,7 @@ void Matmul::eval_gpu(const std::vector<array>& inputs, array& out) {
return; return;
} }
out.set_data(cu::malloc_async(out.nbytes(), encoder)); out.set_data(cu::malloc_async(out.nbytes(), encoder.stream()));
int M = a_pre.shape(-2); int M = a_pre.shape(-2);
int N = b_pre.shape(-1); int N = b_pre.shape(-1);
@@ -163,7 +163,7 @@ void AddMM::eval_gpu(const std::vector<array>& inputs, array& out) {
if (beta_ == 1 && a.dtype() != complex64 && c.strides(-1) == 1 && if (beta_ == 1 && a.dtype() != complex64 && c.strides(-1) == 1 &&
c.data_size() == out.shape(-1)) { c.data_size() == out.shape(-1)) {
out.set_data(cu::malloc_async(out.nbytes(), encoder)); out.set_data(cu::malloc_async(out.nbytes(), encoder.stream()));
gemm_and_bias( gemm_and_bias(
encoder, encoder,
M, M,
@@ -187,10 +187,10 @@ void AddMM::eval_gpu(const std::vector<array>& inputs, array& out) {
auto sty = c.strides()[c.ndim() - 1]; auto sty = c.strides()[c.ndim() - 1];
if (sty == 1 && stx == c.shape(-1)) { if (sty == 1 && stx == c.shape(-1)) {
ldc = stx; ldc = stx;
out.set_data(cu::malloc_async(out.nbytes(), encoder)); out.set_data(cu::malloc_async(out.nbytes(), encoder.stream()));
} else if (sty == 1 && stx == 0) { } else if (sty == 1 && stx == 0) {
ldc = 0; ldc = 0;
out.set_data(cu::malloc_async(out.nbytes(), encoder)); out.set_data(cu::malloc_async(out.nbytes(), encoder.stream()));
} else { } else {
// Copy C into out and set C to out // Copy C into out and set C to out
ldc = c.shape(-1); ldc = c.shape(-1);

View File

@@ -37,7 +37,6 @@ NO_GPU(Inverse)
NO_GPU(Cholesky) NO_GPU(Cholesky)
NO_GPU_MULTI(Eig) NO_GPU_MULTI(Eig)
NO_GPU_MULTI(Eigh) NO_GPU_MULTI(Eigh)
NO_GPU(MaskedScatter)
namespace distributed { namespace distributed {
NO_GPU_MULTI(Send) NO_GPU_MULTI(Send)

View File

@@ -2,11 +2,7 @@
#include "mlx/backend/cuda/device.h" #include "mlx/backend/cuda/device.h"
#include "mlx/backend/cuda/kernel_utils.cuh" #include "mlx/backend/cuda/kernel_utils.cuh"
#include "mlx/backend/cuda/quantized/mxfp8_quantize.cuh"
#include "mlx/backend/cuda/quantized/nvfp4_quantize.cuh"
#include "mlx/backend/cuda/quantized/quantized.h" #include "mlx/backend/cuda/quantized/quantized.h"
#include "mlx/backend/cuda/quantized/quantized_utils.cuh"
#include "mlx/backend/cuda/vector_types.cuh"
#include "mlx/dtype_utils.h" #include "mlx/dtype_utils.h"
#include <cooperative_groups.h> #include <cooperative_groups.h>
@@ -17,6 +13,17 @@
namespace mlx::core { namespace mlx::core {
namespace cu { namespace cu {
template <int bits>
struct Quantize {
__device__ uint8_t operator()(float x) {
if constexpr (bits == 8) {
return __nv_fp8_e4m3(x).__x;
} else {
return __nv_fp4_e2m1(x).__x;
}
}
};
template <int bits> template <int bits>
struct Dequantize { struct Dequantize {
__device__ float operator()(uint8_t x) { __device__ float operator()(uint8_t x) {
@@ -30,40 +37,29 @@ struct Dequantize {
namespace cg = cooperative_groups; namespace cg = cooperative_groups;
template <typename T, int group_size, int bits, bool use_mx_scale, bool USE_SR> template <typename T, int group_size, int bits, bool use_mx_scale>
__global__ void fp_quantize(T* w, uint8_t* out, uint8_t* scales, size_t size) { __global__ void
using Tx2 = Vector2_t<T>; fp_quantize(const T* w, uint8_t* out, uint8_t* scales, size_t size) {
using Tx4 = Vector4_t<T>;
uint32_t rbits = 0; // reserved bits for future use
auto block_size = cg::this_thread_block().dim_threads(); auto block_size = cg::this_thread_block().dim_threads();
auto block_idx = cg::this_thread_block().group_index(); auto block_idx = cg::this_thread_block().group_index();
auto idx_in_block = cg::this_thread_block().thread_index(); auto idx_in_block = cg::this_thread_block().thread_index();
auto tidx = block_idx.x * block_size.x + idx_in_block.x; auto tidx = block_idx.x * block_size.x + idx_in_block.x;
auto tidy = block_idx.y * block_size.y + idx_in_block.y; auto tidy = block_idx.y * block_size.y + idx_in_block.y;
auto grid_dim_x = cg::this_grid().dim_blocks().x * block_size.x;
size_t thread_idx = tidx + grid_dim_x * size_t(tidy); auto grid_dim_x =
size_t base_idx = thread_idx * group_size; cg::this_grid().dim_blocks().x * cg::this_grid().block_index().x;
size_t index = tidx + grid_dim_x * size_t(tidy);
if (base_idx >= size) { if (index >= size) {
return; return;
} }
auto w_tile = load_vector<group_size, T>(w, thread_idx); float w_thread = w[index];
float scale = 0.0f;
Tx2 amax_2x = Tx2{0.0f, 0.0f}; cg::greater<float> max_op;
auto warp = cg::tiled_partition<group_size>(cg::this_thread_block());
#pragma unroll
for (int i = 0; i < group_size; i += 2) {
auto pair = Tx2{w_tile[i], w_tile[i + 1]};
abs_max_x2<Tx2>(amax_2x, amax_2x, pair);
}
scale = static_cast<float>(
max(fabsf(static_cast<float>(amax_2x.x)),
fabsf(static_cast<float>(amax_2x.y))));
float scale = cg::reduce(warp, abs(w_thread), max_op);
scale /= bits == 4 ? 6.0f : 448.0f; scale /= bits == 4 ? 6.0f : 448.0f;
// Convert to mx scale or nv scale // Convert to mx scale or nv scale
using ScaleType = using ScaleType =
@@ -72,24 +68,21 @@ __global__ void fp_quantize(T* w, uint8_t* out, uint8_t* scales, size_t size) {
uint8_t q_scale = s.__x; uint8_t q_scale = s.__x;
scale = float(s); scale = float(s);
scales[thread_idx] = q_scale; // Write out the scales
constexpr int elem_per_byte = bits == 8 ? 1 : 2; size_t gindex = index / group_size;
AlignedVector<uint8_t, group_size / elem_per_byte> quantized; if (index % group_size == 0) {
scales[gindex] = q_scale;
#pragma unroll }
for (int i = 0; i < group_size / 4; i++) {
Tx4 w_Tx4 = *reinterpret_cast<Tx4*>(&w_tile[i * 4]); uint8_t output = Quantize<bits>{}(scale == 0 ? 0.0f : w_thread / scale);
if constexpr (bits == 8) { if (bits == 4) {
uint32_t quantized_val = uint8_t sval = warp.shfl_down(output, 1);
scale_cvt_Tx4_to_fp8x4<T, USE_SR>(w_Tx4, 1.0f / scale, rbits); output |= sval << bits;
*reinterpret_cast<uint32_t*>(&quantized[i * 4]) = quantized_val; }
} else { constexpr int pack_factor = bits == 8 ? 1 : 2;
uint16_t quantized_val = if (index % pack_factor == 0) {
scale_cvt_Tx4_to_fp4x4<T, USE_SR>(w_Tx4, 1.0f / scale, rbits); out[index / pack_factor] = output;
*reinterpret_cast<uint16_t*>(&quantized[i * 2]) = quantized_val;
}
} }
store_vector<group_size / elem_per_byte>(out, thread_idx, quantized);
} }
template <typename T, int group_size, int bits, bool use_mx_scale> template <typename T, int group_size, int bits, bool use_mx_scale>
@@ -149,16 +142,15 @@ void fp_quantize(
dispatch_float_types(w.dtype(), "fp_quantize", [&](auto type_tag) { dispatch_float_types(w.dtype(), "fp_quantize", [&](auto type_tag) {
using T = cuda_type_t<MLX_GET_TYPE(type_tag)>; using T = cuda_type_t<MLX_GET_TYPE(type_tag)>;
if constexpr (!std::is_same_v<T, double>) { if constexpr (!std::is_same_v<T, double>) {
auto kernel = cu::fp_quantize<T, 32, 4, true, false>; auto kernel = cu::fp_quantize<T, 32, 4, true>;
if (bits == 8) { if (bits == 8) {
kernel = cu::fp_quantize<T, 32, 8, true, false>; kernel = cu::fp_quantize<T, 32, 8, true>;
} else if (group_size == 16) { } else if (group_size == 16) {
kernel = cu::fp_quantize<T, 16, 4, false, false>; kernel = cu::fp_quantize<T, 16, 4, false>;
} }
bool large = w.size() > UINT_MAX; bool large = w.size() > UINT_MAX;
auto [num_blocks, block_dims] = auto [num_blocks, block_dims] =
get_launch_args(w.size(), w.shape(), w.strides(), large, group_size); get_launch_args(w.size(), w.shape(), w.strides(), large);
enc.add_kernel_node( enc.add_kernel_node(
kernel, kernel,
num_blocks, num_blocks,
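To make the group-scale rule in the kernel above concrete, here is a small host-side sketch of the same arithmetic: the scale is the group's absolute maximum divided by the largest magnitude the target format can hold (448 for FP8 E4M3, 6 for FP4 E2M1). This is a CPU illustration of the math, not the CUDA kernel itself.

#include <algorithm>
#include <cmath>
#include <vector>

// Compute the per-group quantization scale for a group of weights.
float group_scale(const std::vector<float>& group, int bits) {
  float amax = 0.0f;
  for (float v : group) {
    amax = std::max(amax, std::fabs(v));
  }
  return amax / (bits == 4 ? 6.0f : 448.0f);
}

// Each element is then encoded as round_to_format(w / scale), with a zero
// scale mapping everything to 0, matching the `scale == 0 ? 0.0f : w / scale`
// guard in the kernel.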

View File

@@ -1,32 +0,0 @@
#pragma once
#include <cuda.h>
#include <cuda_fp8.h>
#include <cuda_runtime.h>
#include "mlx/backend/cuda/vector_types.cuh"
namespace mlx::core::cu {
// TODO implement fast path
template <typename T>
__device__ __forceinline__ uint32_t
scale_cvt_Tx4_to_fp8x4_fallback(const Vector4_t<T> input, const float scale) {
uint32_t out_fp8x4 = 0;
float4 scaled;
scaled.x = static_cast<float>(input.x) * scale;
scaled.y = static_cast<float>(input.y) * scale;
scaled.z = static_cast<float>(input.z) * scale;
scaled.w = static_cast<float>(input.w) * scale;
out_fp8x4 = __nv_fp8x4_e4m3(scaled).__x;
return out_fp8x4;
}
// Placeholder for a future fast-path implementation.
template <typename T, bool USE_SR>
__device__ __forceinline__ uint32_t scale_cvt_Tx4_to_fp8x4(
const Vector4_t<T> input,
const float scale,
uint32_t rbits) {
return scale_cvt_Tx4_to_fp8x4_fallback(input, scale);
}
} // namespace mlx::core::cu

View File

@@ -1,334 +0,0 @@
#pragma once
#include <cuda.h>
#include <cuda_fp4.h>
#include <cuda_runtime.h>
#include "mlx/backend/cuda/vector_types.cuh"
namespace mlx::core::cu {
using bf16x4 = Vector4_t<__nv_bfloat16>;
using fp16x4 = Vector4_t<__half>;
using f32x4 = Vector4_t<float>;
template <typename T>
__device__ __forceinline__ uint16_t
scale_cvt_Tx4_to_fp4x4_fallback(const Vector4_t<T> input, const float scale) {
// Fallback for architectures that do not support the cvt instructions, or for
// CUDA versions without fp4 support (< 12.8); converts element by element.
uint16_t out_fp4x4 = 0;
fp32x4 scaled;
scaled.x = static_cast<float>(input.x) * scale;
scaled.y = static_cast<float>(input.y) * scale;
scaled.z = static_cast<float>(input.z) * scale;
scaled.w = static_cast<float>(input.w) * scale;
uint8_t q0 = __nv_fp4_e2m1(scaled.x).__x;
uint8_t q1 = __nv_fp4_e2m1(scaled.y).__x;
uint8_t q2 = __nv_fp4_e2m1(scaled.z).__x;
uint8_t q3 = __nv_fp4_e2m1(scaled.w).__x;
out_fp4x4 = (static_cast<uint16_t>(q3) << 12) |
(static_cast<uint16_t>(q2) << 8) | (static_cast<uint16_t>(q1) << 4) |
static_cast<uint16_t>(q0);
return out_fp4x4;
}
#if (CUDART_VERSION >= 12080) && (__CUDA_ARCH__ >= 1000) && \
defined(__CUDA_ARCH_SPECIFIC__)
__device__ __forceinline__ uint16_t
scale_cvt_bf16x4_to_fp4x4_rn(const bf16x4 input_bf16x4, const float2 scale) {
uint16_t out_fp4x4 = 0;
asm volatile(
"{\n"
".reg.b16 x0_bf16; \n\t" // first bf16
".reg.b16 x1_bf16; \n\t" // second bf16
".reg.b16 x2_bf16; \n\t" // third bf16
".reg.b16 x3_bf16; \n\t" // fourth bf16
".reg.b32 x0; \n\t" // to hold scaled first
".reg.b32 x1; \n\t" // to hold scaled second
".reg.b32 x2; \n\t" // to hold scaled third
".reg.b32 x3; \n\t" // to hold scaled fourth
".reg.b64 x01; \n\t" // to hold vector mul
".reg.b64 x23; \n\t"
".reg.b8 q0; \n\t" // output byte fp4x2 (first pair)
".reg.b8 q1; \n\t" // output byte fp4x2 (second pair)
"mov.b64 {x0_bf16, x1_bf16, x2_bf16, x3_bf16} , %1; \n\t" // unpack bf16
"cvt.f32.bf16 x0, x0_bf16; \n\t" // convert to f32
"cvt.f32.bf16 x1, x1_bf16; \n\t"
"cvt.f32.bf16 x2, x2_bf16; \n\t"
"cvt.f32.bf16 x3, x3_bf16; \n\t"
"mov.b64 x01, {x0, x1}; \n\t"
"mul.f32x2 x01, x01, %2; \n\t" // scale first pair
"mov.b64 x23, {x2, x3}; \n\t"
"mul.f32x2 x23, x23, %2; \n\t" // scale second pair
"mov.b64 {x0, x1}, x01; \n\t"
"mov.b64 {x2, x3}, x23; \n\t"
"cvt.rn.satfinite.e2m1x2.f32 q0, x1, x0; \n\t" // convert to fp4x2 first
// pair
"cvt.rn.satfinite.e2m1x2.f32 q1, x3, x2; \n\t" // convert to fp4x2 second
// pair
"mov.b16 %0, {q0, q1}; \n\t" // pack to output
"}"
: "=h"(out_fp4x4)
: "l"(reinterpret_cast<const uint64_t&>(input_bf16x4)),
"l"(reinterpret_cast<const uint64_t&>(
scale))); // here the cast is needed because an asm operand must have a
// scalar type
return out_fp4x4;
}
__device__ __forceinline__ uint16_t scale_cvt_bf16x4_to_fp4x4_rs(
const bf16x4 input_bf16x4,
const float2 scale,
uint32_t rbits) {
uint16_t out_fp4x4 = 0;
asm volatile(
"{\n"
".reg.b16 x0_bf16; \n\t"
".reg.b16 x1_bf16; \n\t"
".reg.b16 x2_bf16; \n\t"
".reg.b16 x3_bf16; \n\t"
".reg.b32 x0; \n\t"
".reg.b32 x1; \n\t"
".reg.b32 x2; \n\t"
".reg.b32 x3; \n\t"
".reg.b64 x01; \n\t"
".reg.b64 x23; \n\t"
".reg.b16 q0; \n\t"
"mov.b64 {x0_bf16, x1_bf16, x2_bf16, x3_bf16} , %1; \n\t"
"cvt.f32.bf16 x0, x0_bf16; \n\t"
"cvt.f32.bf16 x1, x1_bf16; \n\t"
"cvt.f32.bf16 x2, x2_bf16; \n\t"
"cvt.f32.bf16 x3, x3_bf16; \n\t"
"mov.b64 x01, {x0, x1}; \n\t"
"mul.f32x2 x01, x01, %2; \n\t"
"mov.b64 x23, {x2, x3}; \n\t"
"mul.f32x2 x23, x23, %2; \n\t"
"mov.b64 {x0, x1}, x01; \n\t"
"mov.b64 {x2, x3}, x23; \n\t"
"cvt.rs.satfinite.e2m1x4.f32 q0, {x3, x2, x1, x0}, %3; \n\t"
"}"
: "=h"(out_fp4x4)
: "l"(reinterpret_cast<const uint64_t&>(input_bf16x4)),
"l"(reinterpret_cast<const uint64_t&>(scale)),
"r"(rbits));
return out_fp4x4;
}
__device__ __forceinline__ uint16_t scale_cvt_fp32x4_to_fp4x4_rn(
const float2 input_fp32x2_0,
const float2 input_fp32x2_1,
const float2 scale) {
uint16_t out_fp4x4 = 0;
asm volatile(
"{\n"
".reg.b32 x0; \n\t"
".reg.b32 x1; \n\t"
".reg.b32 x2; \n\t"
".reg.b32 x3; \n\t"
".reg.b64 x01; \n\t"
".reg.b64 x23; \n\t"
".reg.b8 q0; \n\t"
".reg.b8 q1; \n\t"
"mov.b64 x01, {%1, %2}; \n\t"
"mul.f32x2 x01, x01, %5; \n\t"
"mov.b64 x23, {%3, %4}; \n\t"
"mul.f32x2 x23, x23, %5; \n\t"
"mov.b64 {x0, x1}, x01; \n\t"
"mov.b64 {x2, x3}, x23; \n\t"
"cvt.rn.satfinite.e2m1x2.f32 q0, x1, x0; \n\t"
"cvt.rn.satfinite.e2m1x2.f32 q1, x3, x2; \n\t"
"mov.b16 %0, {q0, q1}; \n\t"
"}"
: "=h"(out_fp4x4)
: "f"(input_fp32x2_0.x),
"f"(input_fp32x2_0.y),
"f"(input_fp32x2_1.x),
"f"(input_fp32x2_1.y),
"l"(reinterpret_cast<const uint64_t&>(scale)));
return out_fp4x4;
}
__device__ __forceinline__ uint16_t scale_cvt_fp32x4_to_fp4x4_rs(
const float2 input_fp32x2_0,
const float2 input_fp32x2_1,
const float2 scale,
uint32_t rbits) {
uint16_t out_fp4x4 = 0;
asm volatile(
"{\n"
".reg.b32 x0; \n\t"
".reg.b32 x1; \n\t"
".reg.b32 x2; \n\t"
".reg.b32 x3; \n\t"
".reg.b64 x01; \n\t"
".reg.b64 x23; \n\t"
".reg.b16 q0; \n\t"
"mov.b64 x01, {%1, %2}; \n\t"
"mul.f32x2 x01, x01, %5; \n\t"
"mov.b64 x23, {%3, %4}; \n\t"
"mul.f32x2 x23, x23, %5; \n\t"
"mov.b64 {x0, x1}, x01; \n\t"
"mov.b64 {x2, x3}, x23; \n\t"
"cvt.rs.satfinite.e2m1x4.f32 q0, {x3, x2, x1, x0}, %6; \n\t"
"}"
: "=h"(out_fp4x4)
: "f"(input_fp32x2_0.x),
"f"(input_fp32x2_0.y),
"f"(input_fp32x2_1.x),
"f"(input_fp32x2_1.y),
"l"(reinterpret_cast<const uint64_t&>(scale)),
"r"(rbits));
return out_fp4x4;
}
__device__ __forceinline__ uint16_t
scale_cvt_fp16x4_to_fp4x4_rn(const fp16x4 input_fp16x4, const float2 scale) {
uint16_t out_fp4x4 = 0;
asm volatile(
"{\n"
".reg.b16 x0_fp16; \n\t"
".reg.b16 x1_fp16; \n\t"
".reg.b16 x2_fp16; \n\t"
".reg.b16 x3_fp16; \n\t"
".reg.b32 x0; \n\t"
".reg.b32 x1; \n\t"
".reg.b32 x2; \n\t"
".reg.b32 x3; \n\t"
".reg.b64 x01; \n\t"
".reg.b64 x23; \n\t"
".reg.b8 q0; \n\t"
".reg.b8 q1; \n\t"
"mov.b64 {x0_fp16, x1_fp16, x2_fp16, x3_fp16} , %1; \n\t"
"cvt.f32.f16 x0, x0_fp16; \n\t"
"cvt.f32.f16 x1, x1_fp16; \n\t"
"cvt.f32.f16 x2, x2_fp16; \n\t"
"cvt.f32.f16 x3, x3_fp16; \n\t"
"mov.b64 x01, {x0, x1}; \n\t"
"mul.f32x2 x01, x01, %2; \n\t"
"mov.b64 x23, {x2, x3}; \n\t"
"mul.f32x2 x23, x23, %2; \n\t"
"mov.b64 {x0, x1}, x01; \n\t"
"mov.b64 {x2, x3}, x23; \n\t"
"cvt.rn.satfinite.e2m1x2.f32 q0, x1, x0; \n\t"
"cvt.rn.satfinite.e2m1x2.f32 q1, x3, x2; \n\t"
"mov.b16 %0, {q0, q1}; \n\t"
"}"
: "=h"(out_fp4x4)
: "l"(reinterpret_cast<const uint64_t&>(input_fp16x4)),
"l"(reinterpret_cast<const uint64_t&>(scale)));
return out_fp4x4;
}
__device__ __forceinline__ uint16_t scale_cvt_fp16x4_to_fp4x4_rs(
const fp16x4 input_fp16x4,
const float2 scale,
uint32_t rbits) {
uint16_t out_fp4x4 = 0;
asm volatile(
"{\n"
".reg.b16 x0_fp16; \n\t"
".reg.b16 x1_fp16; \n\t"
".reg.b16 x2_fp16; \n\t"
".reg.b16 x3_fp16; \n\t"
".reg.b32 x0; \n\t"
".reg.b32 x1; \n\t"
".reg.b32 x2; \n\t"
".reg.b32 x3; \n\t"
".reg.b64 x01; \n\t"
".reg.b64 x23; \n\t"
".reg.b16 q0; \n\t"
"mov.b64 {x0_fp16, x1_fp16, x2_fp16, x3_fp16} , %1; \n\t"
"cvt.f32.f16 x0, x0_fp16; \n\t"
"cvt.f32.f16 x1, x1_fp16; \n\t"
"cvt.f32.f16 x2, x2_fp16; \n\t"
"cvt.f32.f16 x3, x3_fp16; \n\t"
"mov.b64 x01, {x0, x1}; \n\t"
"mul.f32x2 x01, x01, %2; \n\t"
"mov.b64 x23, {x2, x3}; \n\t"
"mul.f32x2 x23, x23, %2; \n\t"
"mov.b64 {x0, x1}, x01; \n\t"
"mov.b64 {x2, x3}, x23; \n\t"
"cvt.rs.satfinite.e2m1x4.f32 q0, {x3, x2, x1, x0}, %3; \n\t"
"}"
: "=h"(out_fp4x4)
: "l"(reinterpret_cast<const uint64_t&>(input_fp16x4)),
"l"(reinterpret_cast<const uint64_t&>(scale)),
"r"(rbits));
return out_fp4x4;
}
template <bool USE_SR>
__device__ __forceinline__ uint16_t scale_cvt_bf16x4_to_fp4x4(
const bf16x4 input,
const float scale,
uint32_t rbits) {
float2 scale_fp32x2 = make_float2(scale, scale);
if constexpr (USE_SR) {
return scale_cvt_bf16x4_to_fp4x4_rs(input, scale_fp32x2, rbits);
} else {
return scale_cvt_bf16x4_to_fp4x4_rn(input, scale_fp32x2);
}
}
template <bool USE_SR>
__device__ __forceinline__ uint16_t scale_cvt_fp16x4_to_fp4x4(
const fp16x4 input,
const float scale,
uint32_t rbits) {
float2 scale_fp32x2 = make_float2(scale, scale);
if constexpr (USE_SR) {
return scale_cvt_fp16x4_to_fp4x4_rs(input, scale_fp32x2, rbits);
} else {
return scale_cvt_fp16x4_to_fp4x4_rn(input, scale_fp32x2);
}
}
template <bool USE_SR>
__device__ __forceinline__ uint16_t
scale_cvt_f32x4_to_fp4x4(const f32x4 input, const float scale, uint32_t rbits) {
float2 scale_fp32x2 = make_float2(scale, scale);
float2 input_fp32x2_0 = make_float2(input.x, input.y);
float2 input_fp32x2_1 = make_float2(input.z, input.w);
if constexpr (USE_SR) {
return scale_cvt_fp32x4_to_fp4x4_rs(
input_fp32x2_0, input_fp32x2_1, scale_fp32x2, rbits);
} else {
return scale_cvt_fp32x4_to_fp4x4_rn(
input_fp32x2_0, input_fp32x2_1, scale_fp32x2);
}
}
template <typename T, bool USE_SR>
__device__ __forceinline__ uint16_t scale_cvt_Tx4_to_fp4x4_fast(
const Vector4_t<T> input,
const float scale,
uint32_t rbits) {
if constexpr (std::is_same<T, __nv_bfloat16>::value) {
return scale_cvt_bf16x4_to_fp4x4<USE_SR>(input, scale, rbits);
} else if constexpr (std::is_same<T, __half>::value) {
return scale_cvt_fp16x4_to_fp4x4<USE_SR>(input, scale, rbits);
} else {
return scale_cvt_f32x4_to_fp4x4<USE_SR>(input, scale, rbits);
}
}
#endif // (CUDART_VERSION >= 12080) && (__CUDA_ARCH__ >= 1000) &&
// defined(__CUDA_ARCH_SPECIFIC__)
template <typename T, bool USE_SR>
__device__ __forceinline__ uint16_t scale_cvt_Tx4_to_fp4x4(
const Vector4_t<T> input,
const float scale,
uint32_t rbits) {
#if (CUDART_VERSION >= 12080) && (__CUDA_ARCH__ >= 1000) && \
(__CUDA_ARCH_FAMILY_SPECIFIC__ >= 1000)
return scale_cvt_Tx4_to_fp4x4_fast<T, USE_SR>(input, scale, rbits);
#else
static_assert(
!USE_SR,
"Stochastic rounding (USE_SR=true) requires CUDA >= 12.8 and compute capability >= 1000.");
return scale_cvt_Tx4_to_fp4x4_fallback(input, scale);
#endif
}
} // namespace mlx::core::cu

View File

@@ -59,7 +59,7 @@ void fast::Quantize::eval_gpu(
auto scales = ensure_row_contiguous(inputs[1], enc, s); auto scales = ensure_row_contiguous(inputs[1], enc, s);
auto& w = outputs[0]; auto& w = outputs[0];
w.set_data(cu::malloc_async(w.nbytes(), enc)); w.set_data(cu::malloc_async(w.nbytes(), enc.stream()));
if (mode_ == QuantizationMode::Affine) { if (mode_ == QuantizationMode::Affine) {
auto biases = ensure_row_contiguous(inputs[2], enc, s); auto biases = ensure_row_contiguous(inputs[2], enc, s);
@@ -72,11 +72,11 @@ void fast::Quantize::eval_gpu(
auto& wq = outputs[0]; auto& wq = outputs[0];
auto& scales = outputs[1]; auto& scales = outputs[1];
wq.set_data(cu::malloc_async(wq.nbytes(), enc)); wq.set_data(cu::malloc_async(wq.nbytes(), enc.stream()));
scales.set_data(cu::malloc_async(scales.nbytes(), enc)); scales.set_data(cu::malloc_async(scales.nbytes(), enc.stream()));
if (mode_ == QuantizationMode::Affine) { if (mode_ == QuantizationMode::Affine) {
auto& biases = outputs[2]; auto& biases = outputs[2];
biases.set_data(cu::malloc_async(biases.nbytes(), enc)); biases.set_data(cu::malloc_async(biases.nbytes(), enc.stream()));
affine_quantize(w, wq, scales, biases, group_size_, bits_, enc, s); affine_quantize(w, wq, scales, biases, group_size_, bits_, enc, s);
} else { } else {
fp_quantize(w, wq, scales, group_size_, bits_, enc, s); fp_quantize(w, wq, scales, group_size_, bits_, enc, s);

View File

@@ -15,22 +15,6 @@ inline constexpr __device__ short get_bytes_per_pack() {
return power_of_2_bits ? (wsize / 8) : (bits == 5 ? 5 : 3); return power_of_2_bits ? (wsize / 8) : (bits == 5 ? 5 : 3);
} }
template <typename T>
__device__ __forceinline__ void abs_max_x2(T& out, const T& x1, const T& x2) {
if constexpr (
(std::is_same<T, __nv_bfloat162>::value) ||
(std::is_same<T, __half2>::value)) {
T a = x1;
T b = x2;
out = __hmax2(__habs2(a), __habs2(b));
} else if constexpr (std::is_same<T, float2>::value) {
float2 a = x1;
float2 b = x2;
out.x = fmaxf(fabsf(a.x), fabsf(b.x));
out.y = fmaxf(fabsf(a.y), fabsf(b.y));
}
}
} // namespace cu } // namespace cu
template <typename F> template <typename F>

View File

@@ -139,36 +139,30 @@ void RandomBits::eval_gpu(const std::vector<array>& inputs, array& out) {
// keys has shape (N1, ..., NK, 2) // keys has shape (N1, ..., NK, 2)
// out has shape (N1, ..., NK, M1, M2, ...) // out has shape (N1, ..., NK, M1, M2, ...)
auto& keys = inputs[0]; auto& keys = inputs[0];
size_t num_keys = keys.size() / 2; uint32_t num_keys = keys.size() / 2;
size_t elems_per_key = out.size() / num_keys; uint32_t elems_per_key = out.size() / num_keys;
size_t bytes_per_key = out.itemsize() * elems_per_key; uint32_t bytes_per_key = out.itemsize() * elems_per_key;
auto& s = stream(); auto& s = stream();
auto& encoder = cu::get_command_encoder(s); auto& encoder = cu::get_command_encoder(s);
out.set_data(cu::malloc_async(out.nbytes(), encoder)); out.set_data(cu::malloc_async(out.nbytes(), encoder.stream()));
if (out.size() == 0) { if (out.size() == 0) {
return; return;
} }
size_t out_per_key = (bytes_per_key + 4 - 1) / 4; uint32_t out_per_key = (bytes_per_key + 4 - 1) / 4;
size_t half_size = out_per_key / 2; uint32_t half_size = out_per_key / 2;
bool odd = out_per_key % 2; bool odd = out_per_key % 2;
if ((half_size + odd) >= UINT32_MAX || num_keys >= UINT32_MAX) {
throw std::runtime_error("[RandomBits::eval_gpu] Large size unsupported");
}
encoder.set_input_array(keys); encoder.set_input_array(keys);
encoder.set_output_array(out); encoder.set_output_array(out);
int64_t total = num_keys * (half_size + odd); dim3 grid_dims{num_keys, half_size + odd};
uint32_t threads_y = 1; int64_t total = grid_dims.x * grid_dims.y;
while ((total / threads_y) >= UINT_MAX) { int32_t threads_y = 1;
while ((total / threads_y) >= (1U << 31)) {
threads_y *= 2; threads_y *= 2;
} }
uint32_t threads_x = cuda::ceil_div(total, threads_y); int32_t threads_x = cuda::ceil_div(total, threads_y);
dim3 grid_dims{
static_cast<uint32_t>(num_keys), static_cast<uint32_t>(half_size + odd)};
auto [grid, block] = get_grid_and_block(threads_x, threads_y, 1); auto [grid, block] = get_grid_and_block(threads_x, threads_y, 1);
auto& stream = encoder.stream(); auto& stream = encoder.stream();
if (keys.flags().row_contiguous) { if (keys.flags().row_contiguous) {
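A standalone sketch of the launch-size arithmetic used in the RandomBits hunk above: each key emits bytes_per_key of output in 4-byte words, and the flat thread count is folded into (threads_x, threads_y) so the x dimension stays within range. The sizes below are made up for illustration.

#include <cstdint>
#include <cstdio>

int main() {
  uint64_t num_keys = 1024, bytes_per_key = 4096;
  uint64_t out_per_key = (bytes_per_key + 3) / 4; // 4-byte words per key
  uint64_t half_size = out_per_key / 2;
  bool odd = out_per_key % 2;
  uint64_t total = num_keys * (half_size + odd);
  uint64_t threads_y = 1;
  while (total / threads_y >= (1ull << 31)) { // keep the x dimension in range
    threads_y *= 2;
  }
  uint64_t threads_x = (total + threads_y - 1) / threads_y;
  std::printf("grid split: x=%llu y=%llu\n",
              (unsigned long long)threads_x, (unsigned long long)threads_y);
}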

View File

@@ -66,7 +66,7 @@ void all_reduce(
Reduce::ReduceType reduce_type) { Reduce::ReduceType reduce_type) {
constexpr int N_READS = 8; constexpr int N_READS = 8;
out.set_data(cu::malloc_async(out.nbytes(), encoder)); out.set_data(cu::malloc_async(out.nbytes(), encoder.stream()));
auto get_args = [](size_t size, int N) { auto get_args = [](size_t size, int N) {
int threads = std::min(512UL, (size + N - 1) / N); int threads = std::min(512UL, (size + N - 1) / N);
@@ -107,7 +107,8 @@ void all_reduce(
encoder.set_input_array(in); encoder.set_input_array(in);
if (blocks > 1) { if (blocks > 1) {
array intermediate({blocks}, out.dtype(), nullptr, {}); array intermediate({blocks}, out.dtype(), nullptr, {});
intermediate.set_data(cu::malloc_async(intermediate.nbytes(), encoder)); intermediate.set_data(
cu::malloc_async(intermediate.nbytes(), encoder.stream()));
encoder.add_temporary(intermediate); encoder.add_temporary(intermediate);
encoder.set_output_array(intermediate); encoder.set_output_array(intermediate);
dispatch_all_types(dt, [&](auto type_tag) { dispatch_all_types(dt, [&](auto type_tag) {

View File

@@ -89,13 +89,9 @@ template <
int NDIM, int NDIM,
int BM, int BM,
int BN, int BN,
int N_READS = 4, int N_READS = 4>
int BLOCKS = 1> __global__ void
__global__ void col_reduce_looped( col_reduce_looped(T* in, U* out, const __grid_constant__ ColReduceArgs args) {
T* in,
U* out,
const __grid_constant__ ColReduceArgs args,
int64_t out_size) {
auto grid = cg::this_grid(); auto grid = cg::this_grid();
auto block = cg::this_thread_block(); auto block = cg::this_thread_block();
auto warp = cg::tiled_partition<WARP_SIZE>(block); auto warp = cg::tiled_partition<WARP_SIZE>(block);
@@ -106,8 +102,6 @@ __global__ void col_reduce_looped(
size_t tile_idx = grid.block_rank(); size_t tile_idx = grid.block_rank();
size_t tile_x = tile_idx % ((args.reduction_stride + BN - 1) / BN); size_t tile_x = tile_idx % ((args.reduction_stride + BN - 1) / BN);
size_t tile_y = tile_idx / ((args.reduction_stride + BN - 1) / BN); size_t tile_y = tile_idx / ((args.reduction_stride + BN - 1) / BN);
size_t tile_out = tile_y / out_size;
tile_y = tile_y % out_size;
// Compute the indices for the thread within the tile // Compute the indices for the thread within the tile
short thread_x = block.thread_rank() % threads_per_row; short thread_x = block.thread_rank() % threads_per_row;
@@ -124,23 +118,12 @@ __global__ void col_reduce_looped(
totals[i] = ReduceInit<Op, T>::value(); totals[i] = ReduceInit<Op, T>::value();
} }
size_t total = args.non_col_reductions * args.reduction_size;
size_t per_block, start, end;
if constexpr (BLOCKS > 1) {
per_block = (total + BLOCKS - 1) / BLOCKS;
start = tile_out * per_block + thread_y;
end = min((tile_out + 1) * per_block, total);
} else {
per_block = total;
start = thread_y;
end = total;
}
LoopedElemToLoc<NDIM, (NDIM > 2)> loop(args.reduce_ndim); LoopedElemToLoc<NDIM, (NDIM > 2)> loop(args.reduce_ndim);
loop.next(start, args.reduce_shape.data(), args.reduce_strides.data()); loop.next(thread_y, args.reduce_shape.data(), args.reduce_strides.data());
size_t total = args.non_col_reductions * args.reduction_size;
if (tile_x * BN + BN <= args.reduction_stride) { if (tile_x * BN + BN <= args.reduction_stride) {
if (args.reduction_stride % N_READS == 0) { if (args.reduction_stride % N_READS == 0) {
for (size_t r = start; r < end; r += BM) { for (size_t r = thread_y; r < total; r += BM) {
T vals[N_READS]; T vals[N_READS];
cub::LoadDirectBlockedVectorized(thread_x, in + loop.location(), vals); cub::LoadDirectBlockedVectorized(thread_x, in + loop.location(), vals);
for (int i = 0; i < N_READS; i++) { for (int i = 0; i < N_READS; i++) {
@@ -149,7 +132,7 @@ __global__ void col_reduce_looped(
loop.next(BM, args.reduce_shape.data(), args.reduce_strides.data()); loop.next(BM, args.reduce_shape.data(), args.reduce_strides.data());
} }
} else { } else {
for (size_t r = start; r < end; r += BM) { for (size_t r = thread_y; r < total; r += BM) {
T vals[N_READS]; T vals[N_READS];
cub::LoadDirectBlocked(thread_x, in + loop.location(), vals); cub::LoadDirectBlocked(thread_x, in + loop.location(), vals);
for (int i = 0; i < N_READS; i++) { for (int i = 0; i < N_READS; i++) {
@@ -159,7 +142,7 @@ __global__ void col_reduce_looped(
} }
} }
} else { } else {
for (size_t r = start; r < end; r += BM) { for (size_t r = thread_y; r < total; r += BM) {
T vals[N_READS]; T vals[N_READS];
cub::LoadDirectBlocked( cub::LoadDirectBlocked(
thread_x, thread_x,
@@ -190,9 +173,6 @@ __global__ void col_reduce_looped(
// Write result. // Write result.
if (warp.thread_rank() == 0) { if (warp.thread_rank() == 0) {
if (BLOCKS > 1) {
out += tile_out * out_size * args.reduction_stride;
}
cub::StoreDirectBlocked( cub::StoreDirectBlocked(
warp.meta_group_rank(), warp.meta_group_rank(),
out + tile_y * args.reduction_stride + tile_x * BN, out + tile_y * args.reduction_stride + tile_x * BN,
@@ -247,12 +227,11 @@ __global__ void col_reduce_small(
inline auto output_grid_for_col_reduce( inline auto output_grid_for_col_reduce(
const array& out, const array& out,
const cu::ColReduceArgs& args, const cu::ColReduceArgs& args,
int bn, int bn) {
int outer = 1) {
int gx, gy = 1; int gx, gy = 1;
size_t n_inner_blocks = cuda::ceil_div(args.reduction_stride, bn); size_t n_inner_blocks = cuda::ceil_div(args.reduction_stride, bn);
size_t n_outer_blocks = out.size() / args.reduction_stride; size_t n_outer_blocks = out.size() / args.reduction_stride;
size_t n_blocks = n_outer_blocks * n_inner_blocks * outer; size_t n_blocks = n_outer_blocks * n_inner_blocks;
while (n_blocks / gy > INT32_MAX) { while (n_blocks / gy > INT32_MAX) {
gy *= 2; gy *= 2;
} }
@@ -298,8 +277,7 @@ void col_reduce_looped(
0, 0,
indata, indata,
gpu_ptr<U>(out), gpu_ptr<U>(out),
static_cast<cu::ColReduceArgs>(args), static_cast<cu::ColReduceArgs>(args));
out.size() / args.reduction_stride);
}); });
}); });
}); });
@@ -342,117 +320,6 @@ void col_reduce_small(
}); });
} }
void col_reduce_two_pass(
cu::CommandEncoder& encoder,
const array& in,
array& out,
Reduce::ReduceType reduce_type,
const std::vector<int>& axes,
const ReductionPlan& plan,
const cu::ColReduceArgs& args) {
// Allocate data for the output using in's layout to access them as
// contiguously as possible.
allocate_same_layout(out, in, axes, encoder);
// Allocate an intermediate array to hold the 1st pass result
constexpr int outer = 32;
Shape intermediate_shape;
intermediate_shape.push_back(outer);
intermediate_shape.insert(
intermediate_shape.end(), out.shape().begin(), out.shape().end());
Strides intermediate_strides;
intermediate_strides.push_back(out.size());
intermediate_strides.insert(
intermediate_strides.end(), out.strides().begin(), out.strides().end());
array intermediate(intermediate_shape, out.dtype(), nullptr, {});
auto [data_size, rc, cc] =
check_contiguity(intermediate_shape, intermediate_strides);
auto fl = out.flags();
fl.row_contiguous = rc;
fl.col_contiguous = cc;
fl.contiguous = true;
intermediate.set_data(
cu::malloc_async(intermediate.nbytes(), encoder),
data_size,
intermediate_strides,
fl,
allocator::free);
encoder.add_temporary(intermediate);
encoder.set_input_array(in);
encoder.set_output_array(intermediate);
dispatch_all_types(in.dtype(), [&](auto type_tag) {
dispatch_reduce_ops(reduce_type, [&](auto reduce_type_tag) {
dispatch_reduce_ndim(args.reduce_ndim, [&](auto reduce_ndim) {
using OP = MLX_GET_TYPE(reduce_type_tag);
using T = cuda_type_t<MLX_GET_TYPE(type_tag)>;
using U = typename cu::ReduceResult<OP, T>::type;
// Cub doesn't like const pointers for vectorized loads. (sigh)
T* indata = const_cast<T*>(gpu_ptr<T>(in));
constexpr int N_READS = 4;
constexpr int BM = 32;
constexpr int BN = 32;
dim3 grid = output_grid_for_col_reduce(out, args, BN, outer);
int blocks = BM * BN / N_READS;
auto kernel = cu::
col_reduce_looped<T, U, OP, reduce_ndim(), BM, BN, N_READS, outer>;
encoder.add_kernel_node(
kernel,
grid,
blocks,
0,
indata,
gpu_ptr<U>(intermediate),
static_cast<cu::ColReduceArgs>(args),
out.size() / args.reduction_stride);
});
});
});
// Prepare the reduction arguments for the 2nd pass
cu::ColReduceArgs second_args = args;
second_args.reduction_size = outer;
second_args.reduction_stride = out.size();
second_args.ndim = 0;
second_args.reduce_shape[0] = outer;
second_args.reduce_strides[0] = out.size();
second_args.reduce_ndim = 1;
second_args.non_col_reductions = 1;
encoder.set_input_array(intermediate);
encoder.set_output_array(out);
dispatch_all_types(intermediate.dtype(), [&](auto type_tag) {
dispatch_reduce_ops(reduce_type, [&](auto reduce_type_tag) {
dispatch_reduce_ndim(second_args.reduce_ndim, [&](auto reduce_ndim) {
using OP = MLX_GET_TYPE(reduce_type_tag);
using T = cuda_type_t<MLX_GET_TYPE(type_tag)>;
using U = typename cu::ReduceResult<OP, T>::type;
constexpr int N_READS = 4;
constexpr int BM = 32;
constexpr int BN = 32;
dim3 grid = output_grid_for_col_reduce(out, second_args, BN);
int blocks = BM * BN / N_READS;
auto kernel =
cu::col_reduce_looped<T, U, OP, reduce_ndim(), BM, BN, N_READS>;
encoder.add_kernel_node(
kernel,
grid,
blocks,
0,
gpu_ptr<T>(intermediate),
gpu_ptr<U>(out),
second_args,
second_args.reduction_stride);
});
});
});
}
void col_reduce( void col_reduce(
cu::CommandEncoder& encoder, cu::CommandEncoder& encoder,
const array& in, const array& in,
@@ -467,18 +334,6 @@ void col_reduce(
// It is a general strided reduce. Each threadblock computes the output for // It is a general strided reduce. Each threadblock computes the output for
// a subrow of the fast moving axis. For instance 32 elements. // a subrow of the fast moving axis. For instance 32 elements.
// //
// - col_reduce_small
//
// It is a column reduce for small columns. Each thread loops over the whole
// column without communicating with any other thread.
//
// - col_reduce_two_pass
//
// It is a reduce for long columns. To increase parallelism, we split the
// reduction in two passes. First we do a column reduce where many
// threadblocks operate on different parts of the reduced axis. Then we
// perform a final column reduce.
//
// Notes: As in row reduce we opt to read as much in order as possible and // Notes: As in row reduce we opt to read as much in order as possible and
// leave transpositions as they are (contrary to our Metal backend). // leave transpositions as they are (contrary to our Metal backend).
// //
@@ -494,14 +349,6 @@ void col_reduce(
return; return;
} }
// Long column with smallish row
size_t total_sums = args.non_col_reductions * args.reduction_size;
size_t approx_threads = out.size();
if (total_sums / approx_threads > 32) {
col_reduce_two_pass(encoder, in, out, reduce_type, axes, plan, args);
return;
}
// Fallback col reduce // Fallback col reduce
col_reduce_looped(encoder, in, out, reduce_type, axes, plan, args); col_reduce_looped(encoder, in, out, reduce_type, axes, plan, args);
} }
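
The removed col_reduce_two_pass above follows the strategy described in the deleted comments: split a long reduced axis into `outer` chunks, let many blocks reduce their chunk into an intermediate buffer, then run a final column reduce over the intermediates. A minimal CPU sketch of that two-pass idea follows; it is illustrative only (plain C++, made-up sizes), not the MLX kernel.

#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
  const int rows = 1 << 12; // length of the reduced (column) axis
  const int cols = 8;       // size of the fast-moving output axis
  const int outer = 32;     // number of partial blocks along the column
  std::vector<float> in(static_cast<size_t>(rows) * cols, 1.0f);

  // Pass 1: each block reduces its chunk of the column into one partial row.
  std::vector<float> partial(static_cast<size_t>(outer) * cols, 0.0f);
  const int chunk = (rows + outer - 1) / outer;
  for (int b = 0; b < outer; ++b) {
    for (int r = b * chunk; r < std::min(rows, (b + 1) * chunk); ++r) {
      for (int c = 0; c < cols; ++c) {
        partial[static_cast<size_t>(b) * cols + c] +=
            in[static_cast<size_t>(r) * cols + c];
      }
    }
  }

  // Pass 2: a final column reduce over the `outer` partial rows.
  std::vector<float> out(cols, 0.0f);
  for (int b = 0; b < outer; ++b) {
    for (int c = 0; c < cols; ++c) {
      out[c] += partial[static_cast<size_t>(b) * cols + c];
    }
  }
  std::printf("out[0] = %.0f (expected %d)\n", out[0], rows);
  return 0;
}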

View File

@@ -28,7 +28,7 @@ void init_reduce(
Reduce::ReduceType reduce_type) { Reduce::ReduceType reduce_type) {
// Allocate if needed // Allocate if needed
if (out.data_shared_ptr() == nullptr) { if (out.data_shared_ptr() == nullptr) {
out.set_data(cu::malloc_async(out.nbytes(), encoder)); out.set_data(cu::malloc_async(out.nbytes(), encoder.stream()));
} }
encoder.set_output_array(out); encoder.set_output_array(out);

View File

@@ -96,7 +96,7 @@ inline void allocate_same_layout(
const std::vector<int>& axes, const std::vector<int>& axes,
cu::CommandEncoder& encoder) { cu::CommandEncoder& encoder) {
if (in.flags().row_contiguous) { if (in.flags().row_contiguous) {
out.set_data(cu::malloc_async(out.nbytes(), encoder)); out.set_data(cu::malloc_async(out.nbytes(), encoder.stream()));
return; return;
} }
@@ -135,7 +135,7 @@ inline void allocate_same_layout(
fl.col_contiguous = cc; fl.col_contiguous = cc;
fl.contiguous = true; fl.contiguous = true;
out.set_data( out.set_data(
cu::malloc_async(out.nbytes(), encoder), cu::malloc_async(out.nbytes(), encoder.stream()),
data_size, data_size,
final_strides, final_strides,
fl, fl,
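
The hunks above recompute the contiguity flags (row_contiguous, col_contiguous) for an output that reuses the input's layout. As a rough reference for what such a check does, here is a simplified CPU version that only derives the two flags from a shape/strides pair; the real helper also reports the contiguous data size, and the names below are stand-ins.

#include <cstdint>
#include <cstdio>
#include <vector>

struct Flags {
  bool row_contiguous;
  bool col_contiguous;
};

// Simplified contiguity check: an axis of size 1 never breaks contiguity.
Flags check_contiguity(
    const std::vector<int>& shape, const std::vector<int64_t>& strides) {
  bool rc = true, cc = true;
  int64_t expected = 1;
  for (int i = static_cast<int>(shape.size()) - 1; i >= 0; --i) { // row-major walk
    rc &= (shape[i] == 1) || (strides[i] == expected);
    expected *= shape[i];
  }
  expected = 1;
  for (size_t i = 0; i < shape.size(); ++i) { // column-major walk
    cc &= (shape[i] == 1) || (strides[i] == expected);
    expected *= shape[i];
  }
  return {rc, cc};
}

int main() {
  auto f = check_contiguity({2, 3}, {3, 1}); // standard row-major 2x3
  std::printf("row=%d col=%d\n", f.row_contiguous, f.col_contiguous); // row=1 col=0
  return 0;
}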

View File

@@ -22,28 +22,26 @@ inline __device__ float2 plus_f2(const float2& a, const float2& b) {
} }
// Similar to cub::BlockReduce, but result is broadcasted to every thread. // Similar to cub::BlockReduce, but result is broadcasted to every thread.
template <typename T, int BLOCK_DIM, int GROUP_DIM = WARP_SIZE> template <typename T, int BLOCK_DIM>
struct BlockBroadcastReduce { struct BlockBroadcastReduce {
using TempStorage = T[std::max(BLOCK_DIM / WARP_SIZE, 1)]; static_assert(WARP_SIZE <= BLOCK_DIM && BLOCK_DIM <= WARP_SIZE * WARP_SIZE);
static_assert(BLOCK_DIM % WARP_SIZE == 0);
using TempStorage = T[BLOCK_DIM / WARP_SIZE];
cg::thread_block& block; cg::thread_block& block;
TempStorage& temp; TempStorage& temp;
template <typename Op> template <typename Op>
__device__ T Reduce(const T& input, const Op& op, const T& init_value) { __device__ T Reduce(const T& input, const Op& op, const T& init_value) {
auto warp = cg::tiled_partition<GROUP_DIM>(block); auto warp = cg::tiled_partition<WARP_SIZE>(block);
T x = cg::reduce(warp, input, op); T x = cg::reduce(warp, input, op);
if constexpr (BLOCK_DIM > GROUP_DIM) { if (warp.thread_rank() == 0) {
if (warp.thread_rank() == 0) { temp[warp.meta_group_rank()] = x;
temp[warp.meta_group_rank()] = x;
}
block.sync();
x = warp.thread_rank() < warp.meta_group_size() ? temp[warp.thread_rank()]
: init_value;
return cg::reduce(warp, x, op);
} else {
return x;
} }
block.sync();
x = warp.thread_rank() < warp.meta_group_size() ? temp[warp.thread_rank()]
: init_value;
return cg::reduce(warp, x, op);
} }
__device__ T Sum(const T& input) { __device__ T Sum(const T& input) {
@@ -51,52 +49,6 @@ struct BlockBroadcastReduce {
} }
}; };
template <typename T, int BLOCK_DIM, int REDUCE_DIM, int N_READS = 4>
__global__ void rms_norm_small(
const T* x,
const T* w,
T* out,
float eps,
uint32_t axis_size,
uint32_t n_rows,
int64_t w_stride) {
auto grid = cg::this_grid();
auto block = cg::this_thread_block();
using BlockReduceT = BlockBroadcastReduce<float, BLOCK_DIM, REDUCE_DIM>;
__shared__ typename BlockReduceT::TempStorage temp;
auto row =
(grid.block_rank() * block.dim_threads().y) + block.thread_index().y;
if (row >= n_rows) {
return;
}
x += row * axis_size;
out += row * axis_size;
// Normalizer.
float normalizer = 0;
auto index = block.thread_index().x;
auto xn = load_vector<N_READS>(x, index, axis_size, T(0));
#pragma unroll
for (int i = 0; i < N_READS; ++i) {
float t = static_cast<float>(xn[i]);
normalizer += t * t;
}
normalizer = BlockReduceT{block, temp}.Sum(normalizer);
normalizer = rsqrt(normalizer / axis_size + eps);
// Outputs.
auto wn = load_vector<N_READS>(w, index, axis_size, w_stride, T(0));
#pragma unroll
for (int i = 0; i < N_READS; ++i) {
float y = static_cast<float>(xn[i]) * normalizer;
xn[i] = wn[i] * static_cast<T>(y);
}
store_vector<N_READS>(out, index, xn, axis_size);
}
template <typename T, int BLOCK_DIM, int N_READS = 4> template <typename T, int BLOCK_DIM, int N_READS = 4>
__global__ void rms_norm( __global__ void rms_norm(
const T* x, const T* x,
@@ -142,74 +94,6 @@ __global__ void rms_norm(
} }
} }
template <
typename T,
bool HAS_W,
int BLOCK_DIM,
int REDUCE_DIM,
int N_READS = 4>
__global__ void rms_norm_vjp_small(
const T* x,
const T* w,
const T* g,
T* gx,
T* gw,
float eps,
int32_t axis_size,
int32_t n_rows,
int64_t w_stride) {
auto grid = cg::this_grid();
auto block = cg::this_thread_block();
using BlockReduceF2 = BlockBroadcastReduce<float2, BLOCK_DIM, REDUCE_DIM>;
__shared__ typename BlockReduceF2::TempStorage temp;
auto row =
(grid.block_rank() * block.dim_threads().y) + block.thread_index().y;
if (row >= n_rows) {
return;
}
x += row * axis_size;
g += row * axis_size;
gx += row * axis_size;
gw += row * axis_size;
// Normalizer.
float2 factors = {};
auto index = block.thread_index().x;
auto xn = load_vector<N_READS>(x, index, axis_size, T(0));
auto gn = load_vector<N_READS>(g, index, axis_size, T(0));
auto wn = load_vector<N_READS>(w, index, axis_size, w_stride, T(0));
for (int i = 0; i < N_READS; i++) {
float t = static_cast<float>(xn[i]);
float wi = wn[i];
float gi = gn[i];
float wg = wi * gi;
factors = plus_f2(factors, {wg * t, t * t});
}
factors = BlockReduceF2{block, temp}.Reduce(factors, plus_f2, {});
float meangwx = factors.x / axis_size;
float normalizer = rsqrt(factors.y / axis_size + eps);
float normalizer3 = normalizer * normalizer * normalizer;
// Outputs.
for (int i = 0; i < N_READS; i++) {
float xi = xn[i];
float wi = wn[i];
float gi = gn[i];
xn[i] = static_cast<T>(normalizer * wi * gi - xi * meangwx * normalizer3);
if constexpr (HAS_W) {
wn[i] = static_cast<T>(gi * xi * normalizer);
}
}
store_vector<N_READS>(gx, index, xn, axis_size);
if constexpr (HAS_W) {
store_vector<N_READS>(gw, index, wn, axis_size);
}
}
template <typename T, bool HAS_W, int BLOCK_DIM, int N_READS = 4> template <typename T, bool HAS_W, int BLOCK_DIM, int N_READS = 4>
__global__ void rms_norm_vjp( __global__ void rms_norm_vjp(
const T* x, const T* x,
@@ -223,8 +107,12 @@ __global__ void rms_norm_vjp(
auto grid = cg::this_grid(); auto grid = cg::this_grid();
auto block = cg::this_thread_block(); auto block = cg::this_thread_block();
using BlockReduceF = BlockBroadcastReduce<float, BLOCK_DIM>;
using BlockReduceF2 = BlockBroadcastReduce<float2, BLOCK_DIM>; using BlockReduceF2 = BlockBroadcastReduce<float2, BLOCK_DIM>;
__shared__ typename BlockReduceF2::TempStorage temp; __shared__ union {
typename BlockReduceF::TempStorage f;
typename BlockReduceF2::TempStorage f2;
} temp;
x += grid.block_rank() * axis_size; x += grid.block_rank() * axis_size;
g += grid.block_rank() * axis_size; g += grid.block_rank() * axis_size;
@@ -246,7 +134,7 @@ __global__ void rms_norm_vjp(
factors = plus_f2(factors, {wg * t, t * t}); factors = plus_f2(factors, {wg * t, t * t});
} }
} }
factors = BlockReduceF2{block, temp}.Reduce(factors, plus_f2, {}); factors = BlockReduceF2{block, temp.f2}.Reduce(factors, plus_f2, {});
float meangwx = factors.x / axis_size; float meangwx = factors.x / axis_size;
float normalizer = rsqrt(factors.y / axis_size + eps); float normalizer = rsqrt(factors.y / axis_size + eps);
float normalizer3 = normalizer * normalizer * normalizer; float normalizer3 = normalizer * normalizer * normalizer;
@@ -281,43 +169,6 @@ bool RMSNorm::use_fallback(Stream s) {
return s.device == Device::cpu; return s.device == Device::cpu;
} }
template <int n_per_thread, typename F>
void dispatch_group_dim(int axis_size, F&& f) {
if (axis_size <= n_per_thread * 8) {
f(std::integral_constant<int, 8>{},
std::integral_constant<int, 1>(),
std::integral_constant<int, 16>());
} else if (axis_size <= n_per_thread * 16) {
f(std::integral_constant<int, 16>{},
std::integral_constant<int, 1>(),
std::integral_constant<int, 8>());
} else if (axis_size <= n_per_thread * 32) {
f(std::integral_constant<int, 32>{},
std::integral_constant<int, 1>(),
std::integral_constant<int, 4>());
} else if (axis_size <= n_per_thread * 32 * 2) {
f(std::integral_constant<int, 32>{},
std::integral_constant<int, 2>(),
std::integral_constant<int, 2>());
} else if (axis_size <= n_per_thread * 32 * 4) {
f(std::integral_constant<int, 32>{},
std::integral_constant<int, 4>(),
std::integral_constant<int, 1>());
} else if (axis_size <= n_per_thread * 32 * 8) {
f(std::integral_constant<int, 32>{},
std::integral_constant<int, 8>(),
std::integral_constant<int, 1>());
} else if (axis_size <= n_per_thread * 32 * 16) {
f(std::integral_constant<int, 32>{},
std::integral_constant<int, 16>(),
std::integral_constant<int, 1>());
} else {
f(std::integral_constant<int, 32>{},
std::integral_constant<int, 32>(),
std::integral_constant<int, 1>());
}
}
// TODO: There are duplicate code with backend/metal/normalization.cpp // TODO: There are duplicate code with backend/metal/normalization.cpp
void RMSNorm::eval_gpu( void RMSNorm::eval_gpu(
const std::vector<array>& inputs, const std::vector<array>& inputs,
@@ -339,7 +190,7 @@ void RMSNorm::eval_gpu(
out.copy_shared_buffer(x); out.copy_shared_buffer(x);
} else { } else {
out.set_data( out.set_data(
cu::malloc_async(x.data_size() * x.itemsize(), encoder), cu::malloc_async(x.data_size() * x.itemsize(), encoder.stream()),
x.data_size(), x.data_size(),
x.strides(), x.strides(),
x.flags()); x.flags());
@@ -365,33 +216,12 @@ void RMSNorm::eval_gpu(
dispatch_float_types(out.dtype(), "rms_norm", [&](auto type_tag) { dispatch_float_types(out.dtype(), "rms_norm", [&](auto type_tag) {
using DataType = cuda_type_t<MLX_GET_TYPE(type_tag)>; using DataType = cuda_type_t<MLX_GET_TYPE(type_tag)>;
constexpr int N_READS = 16 / sizeof(DataType); constexpr int N_READS = 16 / sizeof(DataType);
if (axis_size <= N_READS * 1024) { dispatch_block_dim(cuda::ceil_div(axis_size, N_READS), [&](auto block_dim) {
dispatch_group_dim<N_READS>( auto kernel = cu::rms_norm<DataType, block_dim(), N_READS>;
axis_size, [&](auto group_dim, auto n_groups, auto groups_per_block) {
constexpr int block_dim = n_groups() * group_dim();
auto kernel =
cu::rms_norm_small<DataType, block_dim, group_dim(), N_READS>;
auto n_blocks =
(n_rows + groups_per_block() - 1) / groups_per_block();
encoder.add_kernel_node(
kernel,
n_blocks,
{block_dim, groups_per_block()},
0,
gpu_ptr<DataType>(x),
gpu_ptr<DataType>(w),
gpu_ptr<DataType>(out),
eps_,
axis_size,
n_rows,
w_stride);
});
} else {
auto kernel = cu::rms_norm<DataType, 1024, N_READS>;
encoder.add_kernel_node( encoder.add_kernel_node(
kernel, kernel,
n_rows, n_rows,
1024, block_dim(),
0, 0,
gpu_ptr<DataType>(x), gpu_ptr<DataType>(x),
gpu_ptr<DataType>(w), gpu_ptr<DataType>(w),
@@ -399,7 +229,7 @@ void RMSNorm::eval_gpu(
eps_, eps_,
axis_size, axis_size,
w_stride); w_stride);
} });
}); });
} }
@@ -444,7 +274,7 @@ void RMSNormVJP::eval_gpu(
gx.copy_shared_buffer(g); gx.copy_shared_buffer(g);
g_in_gx = true; g_in_gx = true;
} else { } else {
gx.set_data(cu::malloc_async(gx.nbytes(), encoder)); gx.set_data(cu::malloc_async(gx.nbytes(), encoder.stream()));
} }
if (g_copied && !g_in_gx) { if (g_copied && !g_in_gx) {
encoder.add_temporary(g); encoder.add_temporary(g);
@@ -462,7 +292,7 @@ void RMSNormVJP::eval_gpu(
if (!g_in_gx && donate_g) { if (!g_in_gx && donate_g) {
gw_temp.copy_shared_buffer(g); gw_temp.copy_shared_buffer(g);
} else { } else {
gw_temp.set_data(cu::malloc_async(gw_temp.nbytes(), encoder)); gw_temp.set_data(cu::malloc_async(gw_temp.nbytes(), encoder.stream()));
encoder.add_temporary(gw_temp); encoder.add_temporary(gw_temp);
} }
} }
@@ -476,51 +306,27 @@ void RMSNormVJP::eval_gpu(
dispatch_bool(has_w, [&](auto has_w_constant) { dispatch_bool(has_w, [&](auto has_w_constant) {
using DataType = cuda_type_t<MLX_GET_TYPE(type_tag)>; using DataType = cuda_type_t<MLX_GET_TYPE(type_tag)>;
constexpr int N_READS = 16 / sizeof(DataType); constexpr int N_READS = 16 / sizeof(DataType);
if (axis_size <= N_READS * 1024) { dispatch_block_dim(
dispatch_group_dim<N_READS>( cuda::ceil_div(axis_size, N_READS), [&](auto block_dim) {
axis_size, auto kernel = cu::rms_norm_vjp<
[&](auto group_dim, auto n_groups, auto groups_per_block) { DataType,
constexpr int block_dim = group_dim() * n_groups(); has_w_constant.value,
auto kernel = cu::rms_norm_vjp_small< block_dim(),
DataType, N_READS>;
has_w_constant.value, encoder.add_kernel_node(
block_dim, kernel,
group_dim(), n_rows,
N_READS>; block_dim(),
auto n_blocks = 0,
(n_rows + groups_per_block() - 1) / groups_per_block(); gpu_ptr<DataType>(x),
encoder.add_kernel_node( gpu_ptr<DataType>(w),
kernel, gpu_ptr<DataType>(g),
n_blocks, gpu_ptr<DataType>(gx),
{block_dim, groups_per_block()}, gpu_ptr<DataType>(gw_temp),
0, eps_,
gpu_ptr<DataType>(x), axis_size,
gpu_ptr<DataType>(w), w_stride);
gpu_ptr<DataType>(g), });
gpu_ptr<DataType>(gx),
gpu_ptr<DataType>(gw_temp),
eps_,
axis_size,
n_rows,
w_stride);
});
} else {
auto kernel =
cu::rms_norm_vjp<DataType, has_w_constant.value, 1024, N_READS>;
encoder.add_kernel_node(
kernel,
n_rows,
1024,
0,
gpu_ptr<DataType>(x),
gpu_ptr<DataType>(w),
gpu_ptr<DataType>(g),
gpu_ptr<DataType>(gx),
gpu_ptr<DataType>(gw_temp),
eps_,
axis_size,
w_stride);
}
}); });
}); });
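
For reference, the math the rms_norm kernels above implement (accumulating in float regardless of the storage type) is normalizer = rsqrt(mean(x*x) + eps) and y = w * x * normalizer; the VJP additionally needs mean(w*g*x) and normalizer cubed. Below is a minimal CPU sketch of the forward pass only, not the MLX kernels.

#include <cmath>
#include <cstdio>
#include <vector>

// Reference RMS norm over one row: y[i] = w[i] * x[i] * rsqrt(mean(x*x) + eps).
void rms_norm_ref(const std::vector<float>& x, const std::vector<float>& w,
                  std::vector<float>& y, float eps) {
  float ss = 0.0f;
  for (float v : x) {
    ss += v * v; // accumulate the sum of squares in float
  }
  float normalizer = 1.0f / std::sqrt(ss / x.size() + eps);
  for (size_t i = 0; i < x.size(); ++i) {
    y[i] = w[i] * x[i] * normalizer;
  }
}

int main() {
  std::vector<float> x{1.0f, 2.0f, 3.0f, 4.0f};
  std::vector<float> w(4, 1.0f);
  std::vector<float> y(4);
  rms_norm_ref(x, w, y, 1e-5f);
  std::printf("%f %f %f %f\n", y[0], y[1], y[2], y[3]);
  return 0;
}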

View File

@@ -292,14 +292,14 @@ void RoPE::eval_gpu(
donated = true; donated = true;
out.copy_shared_buffer(in); out.copy_shared_buffer(in);
} else { } else {
out.set_data(cu::malloc_async(out.nbytes(), encoder)); out.set_data(cu::malloc_async(out.nbytes(), encoder.stream()));
} }
strides[0] = mat_size; strides[0] = mat_size;
strides[1] = in.strides()[ndim - 2]; strides[1] = in.strides()[ndim - 2];
strides[2] = in.strides()[ndim - 1]; strides[2] = in.strides()[ndim - 1];
} else if (dispatch_ndim == 3) { } else if (dispatch_ndim == 3) {
// Handle non-contiguous 3D inputs // Handle non-contiguous 3D inputs
out.set_data(cu::malloc_async(out.nbytes(), encoder)); out.set_data(cu::malloc_async(out.nbytes(), encoder.stream()));
strides[0] = in.strides()[ndim - 3]; strides[0] = in.strides()[ndim - 3];
strides[1] = in.strides()[ndim - 2]; strides[1] = in.strides()[ndim - 2];
strides[2] = in.strides()[ndim - 1]; strides[2] = in.strides()[ndim - 1];

View File

@@ -5,13 +5,47 @@
#include "mlx/backend/cuda/lru_cache.h" #include "mlx/backend/cuda/lru_cache.h"
#include "mlx/backend/gpu/copy.h" #include "mlx/backend/gpu/copy.h"
#include "mlx/fast_primitives.h" #include "mlx/fast_primitives.h"
#include "mlx/transforms_impl.h"
#include <nvtx3/nvtx3.hpp> #include <nvtx3/nvtx3.hpp>
namespace mlx::core { namespace mlx::core {
namespace fe = cudnn_frontend;
namespace { namespace {
#define CHECK_CUDNN_FE_ERROR(cmd) \
do { \
auto error = cmd; \
if (!error.is_good()) { \
throw std::runtime_error( \
fmt::format("{} failed: {}.", #cmd, error.get_message())); \
} \
} while (0)
std::vector<int64_t> normalized_strides(const array& x) {
std::vector<int64_t> strides(x.strides().begin(), x.strides().end());
if (!x.flags().row_contiguous || x.ndim() < 2) {
return strides;
}
for (int i = x.ndim() - 2; i >= 0; --i) {
if (x.shape(i) == 1) {
strides[i] = x.shape(i + 1) * strides[i + 1];
}
}
return strides;
}
void set_tensor_attrs(
std::shared_ptr<fe::graph::Tensor_attributes>& tensor,
int64_t uid,
const array& x) {
tensor->set_uid(uid)
.set_dim({x.shape().begin(), x.shape().end()})
.set_stride(normalized_strides(x));
}
array prepare_sdpa_input(const array& x, Stream s) { array prepare_sdpa_input(const array& x, Stream s) {
// SDPA kernel's requirements on inputs: // SDPA kernel's requirements on inputs:
// 1. last dim's stride be 1; // 1. last dim's stride be 1;
@@ -25,43 +59,11 @@ array prepare_sdpa_input(const array& x, Stream s) {
return x; return x;
} }
void malloc_with_same_layout(
cu::CommandEncoder& encoder,
array& o,
const array& q) {
if (q.flags().row_contiguous) {
o.set_data(cu::malloc_async(o.nbytes(), encoder));
return;
}
// fill_order = argsort(q.strides())
Shape fill_order(q.ndim());
std::iota(fill_order.begin(), fill_order.end(), 0);
std::stable_sort(
fill_order.begin(), fill_order.end(), [&q](int idx1, int idx2) {
auto s1 = q.strides(idx1) > 0 ? q.strides(idx1) : 1;
auto s2 = q.strides(idx2) > 0 ? q.strides(idx2) : 1;
return s1 < s2;
});
// Generate o_strides with fill_order
Strides o_strides(q.ndim());
int64_t stride = 1;
for (int i : fill_order) {
o_strides[i] = stride;
stride *= o.shape(i);
}
// o is a transposed contiguous array
o.set_data(
cu::malloc_async(o.nbytes(), encoder),
o.size(),
o_strides,
{true, false, false});
}
constexpr int QKV_NDIM = 4; constexpr int QKV_NDIM = 4;
struct SDPACacheKey { struct SDPACacheKey {
int device_id; int device_id;
fe::DataType_t cudnn_dtype; cudnnDataType_t cudnn_dtype;
std::array<int, QKV_NDIM> q_shape; std::array<int, QKV_NDIM> q_shape;
std::array<int, QKV_NDIM> k_shape; std::array<int, QKV_NDIM> k_shape;
std::array<int, QKV_NDIM> v_shape; std::array<int, QKV_NDIM> v_shape;
@@ -69,50 +71,11 @@ struct SDPACacheKey {
std::array<int64_t, QKV_NDIM> k_strides; std::array<int64_t, QKV_NDIM> k_strides;
std::array<int64_t, QKV_NDIM> v_strides; std::array<int64_t, QKV_NDIM> v_strides;
bool do_causal; bool do_causal;
std::array<int, QKV_NDIM> mask_shape;
std::array<int64_t, QKV_NDIM> mask_strides;
bool output_logsumexp;
}; };
inline BytesKey<SDPACacheKey> build_sdpa_cache_key(
cu::CommandEncoder& encoder,
const array& q,
const array& k,
const array& v,
bool do_causal,
const std::optional<array>& mask_arr,
bool output_logsumexp = true) {
BytesKey<SDPACacheKey> cache_key;
cache_key.pod = {
encoder.device().cuda_device(),
dtype_to_cudnn_type(q.dtype()),
vector_key<QKV_NDIM>(q.shape()),
vector_key<QKV_NDIM>(k.shape()),
vector_key<QKV_NDIM>(v.shape()),
vector_key<QKV_NDIM>(q.strides()),
vector_key<QKV_NDIM>(k.strides()),
vector_key<QKV_NDIM>(v.strides()),
do_causal,
{},
{},
output_logsumexp,
};
if (mask_arr) {
cache_key.pod.mask_shape = vector_key<QKV_NDIM>(mask_arr->shape());
cache_key.pod.mask_strides = vector_key<QKV_NDIM>(mask_arr->strides());
}
return cache_key;
}
auto& sdpa_cache() { auto& sdpa_cache() {
static LRUBytesKeyCache<SDPACacheKey, DnnGraph> cache( static LRUBytesKeyCache<SDPACacheKey, fe::graph::Graph> cache(
"MLX_CUDA_SDPA_CACHE_SIZE", /* default_capacity */ 64); "MLX_CUDA_SDPA_CACHE_SIZE", /* default_capacity */ 128);
return cache;
}
auto& sdpa_backward_cache() {
static LRUBytesKeyCache<SDPACacheKey, DnnGraph> cache(
"MLX_CUDA_SDPA_BACKWARD_CACHE_SIZE", /* default_capacity */ 64);
return cache; return cache;
} }
@@ -121,106 +84,59 @@ enum UIDS {
K, K,
V, V,
SCALE, SCALE,
BIAS,
O, O,
STATS,
// Backward graph:
D_Q,
D_K,
D_V,
D_O,
}; };
DnnGraph build_sdpa_graph( fe::graph::Graph build_sdpa_graph(
cudnnHandle_t handle, cudnnHandle_t handle,
const array& q, const array& q,
const array& k, const array& k,
const array& v, const array& v,
bool do_causal, bool do_causal,
const std::optional<array>& mask_arr, const array& o) {
bool output_logsumexp, auto dtype = fe::DataType_t::HALF;
const array& o, if (q.dtype() == bfloat16) {
const array& stats) { dtype = fe::DataType_t::BFLOAT16;
DnnGraph graph(handle, q.dtype());
auto q_ = graph.tensor("Q", Q, q);
auto k_ = graph.tensor("K", K, k);
auto v_ = graph.tensor("V", V, v);
auto options = fe::graph::SDPA_attributes()
.set_name("sdpa_cudnn")
.set_attn_scale(graph.scalar("Scale", SCALE, float32))
.set_generate_stats(output_logsumexp);
if (do_causal) {
if (q.shape(2) > k.shape(2)) {
options.set_causal_mask(do_causal);
} else {
options.set_causal_mask_bottom_right(do_causal);
}
}
if (mask_arr) {
options.set_bias(graph.tensor("BIAS", BIAS, *mask_arr));
} }
auto [o_, stats_] = graph.sdpa(q_, k_, v_, options); fe::graph::Graph graph;
graph.tensor(o_, O, o)->set_output(true); graph.set_io_data_type(dtype)
if (output_logsumexp) { .set_intermediate_data_type(fe::DataType_t::FLOAT)
graph.tensor(stats_, STATS, stats)->set_output(true); .set_compute_data_type(fe::DataType_t::FLOAT);
}
CHECK_CUDNN_FE_ERROR(graph.prepare()); auto q_ = graph.tensor(fe::graph::Tensor_attributes().set_name("Q"));
auto k_ = graph.tensor(fe::graph::Tensor_attributes().set_name("K"));
auto v_ = graph.tensor(fe::graph::Tensor_attributes().set_name("V"));
set_tensor_attrs(q_, Q, q);
set_tensor_attrs(k_, K, k);
set_tensor_attrs(v_, V, v);
auto scale = graph.tensor(fe::graph::Tensor_attributes()
.set_name("Scale")
.set_uid(SCALE)
.set_dim({1, 1, 1, 1})
.set_stride({1, 1, 1, 1})
.set_is_pass_by_value(true)
.set_data_type(fe::DataType_t::FLOAT));
auto sdpa_options = fe::graph::SDPA_attributes()
.set_name("sdpa_cudnn")
.set_attn_scale(scale)
.set_causal_mask(do_causal)
.set_generate_stats(false);
auto [o_, _] = graph.sdpa(q_, k_, v_, sdpa_options);
o_->set_output(true);
set_tensor_attrs(o_, O, o);
CHECK_CUDNN_FE_ERROR(graph.validate());
CHECK_CUDNN_FE_ERROR(graph.build_operation_graph(handle));
CHECK_CUDNN_FE_ERROR(graph.create_execution_plans({fe::HeurMode_t::A}));
graph.select_behavior_notes( graph.select_behavior_notes(
{fe::BehaviorNote_t::SUPPORTS_CUDA_GRAPH_NATIVE_API}); {fe::BehaviorNote_t::SUPPORTS_CUDA_GRAPH_NATIVE_API});
CHECK_CUDNN_FE_ERROR(graph.build()); CHECK_CUDNN_FE_ERROR(graph.check_support(handle));
return graph; CHECK_CUDNN_FE_ERROR(graph.build_plans(handle));
}
DnnGraph build_sdpa_backward_graph(
cudnnHandle_t handle,
const array& q,
const array& k,
const array& v,
bool do_causal,
const std::optional<array>& mask_arr,
const array& o,
const array& d_o,
const array& stats,
array& d_q,
array& d_k,
array& d_v) {
DnnGraph graph(handle, q.dtype());
auto q_ = graph.tensor("Q", Q, q);
auto k_ = graph.tensor("K", K, k);
auto v_ = graph.tensor("V", V, v);
auto o_ = graph.tensor("O", O, o);
auto d_o_ = graph.tensor("D_O", D_O, d_o);
auto stats_ = graph.tensor("STATS", STATS, stats);
auto options = fe::graph::SDPA_backward_attributes()
.set_name("sdpa_backward_cudnn")
.set_attn_scale(graph.scalar("Scale", SCALE, float32));
if (do_causal) {
if (q.shape(2) > k.shape(2)) {
options.set_causal_mask(do_causal);
} else {
options.set_causal_mask_bottom_right(do_causal);
}
}
if (mask_arr) {
options.set_bias(graph.tensor("BIAS", BIAS, *mask_arr));
}
auto [d_q_, d_k_, d_v_] =
graph.sdpa_backward(q_, k_, v_, o_, d_o_, stats_, options);
graph.tensor(d_q_, D_Q, d_q)->set_output(true);
graph.tensor(d_k_, D_K, d_k)->set_output(true);
graph.tensor(d_v_, D_V, d_v)->set_output(true);
CHECK_CUDNN_FE_ERROR(graph.prepare());
graph.select_behavior_notes(
{fe::BehaviorNote_t::SUPPORTS_CUDA_GRAPH_NATIVE_API});
CHECK_CUDNN_FE_ERROR(graph.build());
return graph; return graph;
} }
@@ -230,6 +146,7 @@ bool supports_sdpa_cudnn(
const array& q, const array& q,
const array& k, const array& k,
const array& v, const array& v,
bool has_mask,
bool do_causal, bool do_causal,
Stream s) { Stream s) {
static bool enabled = env::get_var("MLX_CUDA_USE_CUDNN_SPDA", 1); static bool enabled = env::get_var("MLX_CUDA_USE_CUDNN_SPDA", 1);
@@ -242,8 +159,19 @@ bool supports_sdpa_cudnn(
return false; return false;
} }
// Only use cuDNN for prefilling (T_q > 1) and training (T_q == T_kv). if (has_mask) {
if ((q.shape(2) == 1) && (q.shape(2) != k.shape(2))) { // TODO: Support array masks.
if (!do_causal) {
return false;
}
// FIXME: Causal mask generates wrong results when L_Q != L_K.
if (q.shape(2) != k.shape(2)) {
return false;
}
}
// Only use cuDNN for prefilling.
if (q.shape(2) != k.shape(2)) {
return false; return false;
} }
@@ -263,115 +191,66 @@ void sdpa_cudnn(
const array& v, const array& v,
float scale, float scale,
array& o, array& o,
array& stats,
bool do_causal, bool do_causal,
const std::optional<array>& mask_arr,
bool output_logsumexp,
Stream s) { Stream s) {
auto& encoder = cu::get_command_encoder(s); auto& encoder = cu::get_command_encoder(s);
auto handle = encoder.device().cudnn_handle(); // TODO: Handle donation.
// TODO: Make O use same memory layout with Q.
malloc_with_same_layout(encoder, o, q); o.set_data(cu::malloc_async(o.nbytes(), encoder.stream()));
encoder.set_input_array(q); encoder.set_input_array(q);
encoder.set_input_array(k); encoder.set_input_array(k);
encoder.set_input_array(v); encoder.set_input_array(v);
encoder.set_output_array(o); encoder.set_output_array(o);
if (mask_arr) {
encoder.set_input_array(*mask_arr); auto handle = encoder.device().cudnn_handle();
} cudnnSetStream(handle, encoder.stream());
if (output_logsumexp) {
stats.set_data(cu::malloc_async(stats.nbytes(), encoder));
encoder.set_output_array(stats);
}
// Search cache. // Search cache.
auto cache_key = build_sdpa_cache_key( BytesKey<SDPACacheKey> cache_key;
encoder, q, k, v, do_causal, mask_arr, output_logsumexp); cache_key.pod = {
encoder.device().cuda_device(),
dtype_to_cudnn_type(q.dtype()),
vector_key<QKV_NDIM>(q.shape()),
vector_key<QKV_NDIM>(k.shape()),
vector_key<QKV_NDIM>(v.shape()),
vector_key<QKV_NDIM>(q.strides()),
vector_key<QKV_NDIM>(k.strides()),
vector_key<QKV_NDIM>(v.strides()),
do_causal,
};
auto it = sdpa_cache().find(cache_key); auto it = sdpa_cache().find(cache_key);
if (it == sdpa_cache().end()) { if (it == sdpa_cache().end()) {
auto graph = build_sdpa_graph( it =
handle, q, k, v, do_causal, mask_arr, output_logsumexp, o, stats); sdpa_cache()
it = sdpa_cache().emplace(cache_key, std::move(graph)).first; .emplace(cache_key, build_sdpa_graph(handle, q, k, v, do_causal, o))
.first;
} }
auto& graph = it->second; auto& graph = it->second;
std::unordered_map<int64_t, void*> variant_pack{ std::unordered_map<int64_t, void*> variant_pack{
{Q, gpu_ptr<void>(q)}, {Q, const_cast<void*>(gpu_ptr<void>(q))},
{K, gpu_ptr<void>(k)}, {K, const_cast<void*>(gpu_ptr<void>(k))},
{V, gpu_ptr<void>(v)}, {V, const_cast<void*>(gpu_ptr<void>(v))},
{SCALE, &scale}, {SCALE, &scale},
{O, gpu_ptr<void>(o)}}; {O, gpu_ptr<void>(o)}};
if (mask_arr) {
variant_pack[BIAS] = gpu_ptr<void>(*mask_arr); int64_t workspace_size = 0;
} CHECK_CUDNN_FE_ERROR(graph.get_workspace_size(workspace_size));
if (output_logsumexp) { void* workspace_ptr = nullptr;
variant_pack[STATS] = gpu_ptr<void>(stats); if (workspace_size > 0) {
array workspace(
cu::malloc_async(workspace_size, encoder.stream()),
{static_cast<int>(workspace_size)},
uint8);
encoder.add_temporary(workspace);
workspace_ptr = gpu_ptr<void>(workspace);
} }
CHECK_CUDNN_FE_ERROR(graph.encode_graph(encoder, std::move(variant_pack))); CudaGraph cuda_graph(encoder.device());
} CHECK_CUDNN_FE_ERROR(graph.populate_cuda_graph(
handle, variant_pack, workspace_ptr, cuda_graph));
void sdpa_backward_cudnn( encoder.add_graph_node(cuda_graph);
const array& q,
const array& k,
const array& v,
float scale,
const array& o,
const array& stats,
bool do_causal,
const std::optional<array>& mask_arr,
const array& d_o,
array& d_q,
array& d_k,
array& d_v,
Stream s) {
auto& encoder = cu::get_command_encoder(s);
auto handle = encoder.device().cudnn_handle();
malloc_with_same_layout(encoder, d_q, q);
malloc_with_same_layout(encoder, d_k, k);
malloc_with_same_layout(encoder, d_v, v);
encoder.set_input_array(q);
encoder.set_input_array(k);
encoder.set_input_array(v);
encoder.set_input_array(o);
encoder.set_input_array(stats);
encoder.set_input_array(d_o);
encoder.set_output_array(d_q);
encoder.set_output_array(d_k);
encoder.set_output_array(d_v);
if (mask_arr) {
encoder.set_input_array(*mask_arr);
}
// Search cache.
auto cache_key = build_sdpa_cache_key(encoder, q, k, v, do_causal, mask_arr);
auto it = sdpa_backward_cache().find(cache_key);
if (it == sdpa_backward_cache().end()) {
auto graph = build_sdpa_backward_graph(
handle, q, k, v, do_causal, mask_arr, o, d_o, stats, d_q, d_k, d_v);
it = sdpa_backward_cache().emplace(cache_key, std::move(graph)).first;
}
auto& graph = it->second;
std::unordered_map<int64_t, void*> variant_pack{
{Q, gpu_ptr<void>(q)},
{K, gpu_ptr<void>(k)},
{V, gpu_ptr<void>(v)},
{SCALE, &scale},
{O, gpu_ptr<void>(o)},
{STATS, gpu_ptr<void>(stats)},
{D_O, gpu_ptr<void>(d_o)},
{D_Q, gpu_ptr<void>(d_q)},
{D_K, gpu_ptr<void>(d_k)},
{D_V, gpu_ptr<void>(d_v)}};
if (mask_arr) {
variant_pack[BIAS] = gpu_ptr<void>(*mask_arr);
}
CHECK_CUDNN_FE_ERROR(graph.encode_graph(encoder, std::move(variant_pack)));
} }
// Defined in scaled_dot_product_attention.cu file. // Defined in scaled_dot_product_attention.cu file.
@@ -381,8 +260,7 @@ bool supports_sdpa_vector(
const array& v, const array& v,
bool has_mask, bool has_mask,
bool has_arr_mask, bool has_arr_mask,
bool do_causal, bool do_causal);
bool output_logsumexp);
void sdpa_vector( void sdpa_vector(
const array& q, const array& q,
const array& k, const array& k,
@@ -402,25 +280,21 @@ bool ScaledDotProductAttention::use_fallback(
bool has_mask, bool has_mask,
bool has_arr_mask, bool has_arr_mask,
bool do_causal, bool do_causal,
bool is_training,
bool output_logsumexp,
Stream s) { Stream s) {
if (detail::in_grad_tracing()) {
return true;
}
if (s.device == Device::cpu) { if (s.device == Device::cpu) {
return true; return true;
} }
return !supports_sdpa_vector( return !supports_sdpa_vector(q, k, v, has_mask, has_arr_mask, do_causal) &&
q, k, v, has_mask, has_arr_mask, do_causal, output_logsumexp) && !supports_sdpa_cudnn(q, k, v, has_mask, do_causal, s);
!supports_sdpa_cudnn(q, k, v, do_causal, s);
}
bool ScaledDotProductAttention::supports_bool_mask() {
return false;
} }
void ScaledDotProductAttention::eval_gpu( void ScaledDotProductAttention::eval_gpu(
const std::vector<array>& inputs, const std::vector<array>& inputs,
std::vector<array>& outputs) { array& out) {
nvtx3::scoped_range r("ScaledDotProductAttention::eval_gpu"); nvtx3::scoped_range r("ScaledDotProductAttention::eval_gpu");
auto& s = stream(); auto& s = stream();
@@ -428,79 +302,20 @@ void ScaledDotProductAttention::eval_gpu(
array q = prepare_sdpa_input(inputs[0], s); array q = prepare_sdpa_input(inputs[0], s);
array k = prepare_sdpa_input(inputs[1], s); array k = prepare_sdpa_input(inputs[1], s);
array v = prepare_sdpa_input(inputs[2], s); array v = prepare_sdpa_input(inputs[2], s);
auto& out = outputs[0];
auto& stats = outputs[1];
bool has_mask = inputs.size() - has_sinks_ > 3; bool has_mask = inputs.size() - has_sinks_ > 3;
bool has_arr_mask = has_mask && !do_causal_; bool has_arr_mask = has_mask && !do_causal_;
std::optional<array> mask_arr; if (supports_sdpa_vector(q, k, v, has_mask, has_arr_mask, do_causal_)) {
if (has_arr_mask) {
mask_arr = prepare_sdpa_input(inputs[3], s);
}
if (supports_sdpa_vector(
q, k, v, has_mask, has_arr_mask, do_causal_, output_logsumexp_)) {
if (has_sinks_) { if (has_sinks_) {
sdpa_vector(q, k, v, scale_, out, do_causal_, inputs.back(), s); sdpa_vector(q, k, v, scale_, out, do_causal_, inputs.back(), s);
} else { } else {
sdpa_vector(q, k, v, scale_, out, do_causal_, std::nullopt, s); sdpa_vector(q, k, v, scale_, out, do_causal_, std::nullopt, s);
} }
} else { } else {
sdpa_cudnn( sdpa_cudnn(q, k, v, scale_, out, do_causal_, s);
q,
k,
v,
scale_,
out,
stats,
do_causal_,
mask_arr,
output_logsumexp_,
s);
} }
} }
bool ScaledDotProductAttentionVJP::use_fallback(const array& q, Stream s) {
// The frontend adds a padding mask when sequence length is not a multiple of
// tile size.
if (q.shape(2) % 128 != 0) {
return true;
}
return s.device == Device::cpu;
}
void ScaledDotProductAttentionVJP::eval_gpu(
const std::vector<array>& inputs,
std::vector<array>& outputs) {
nvtx3::scoped_range r("ScaledDotProductAttentionVJP::eval_gpu");
auto& s = stream();
assert(inputs.size() >= 6);
int primals_size = inputs.size() - 3;
bool has_arr_mask = primals_size > 3 + has_sinks_;
array q = prepare_sdpa_input(inputs[0], s);
array k = prepare_sdpa_input(inputs[1], s);
array v = prepare_sdpa_input(inputs[2], s);
array o = prepare_sdpa_input(inputs[primals_size], s);
array stats = prepare_sdpa_input(inputs[primals_size + 1], s);
array d_o = prepare_sdpa_input(inputs[primals_size + 2], s);
std::optional<array> mask_arr;
if (has_arr_mask) {
mask_arr = prepare_sdpa_input(inputs[3], s);
}
assert(outputs.size() == 3);
auto& d_q = outputs[0];
auto& d_k = outputs[1];
auto& d_v = outputs[2];
sdpa_backward_cudnn(
q, k, v, scale_, o, stats, do_causal_, mask_arr, d_o, d_q, d_k, d_v, s);
}
} // namespace fast } // namespace fast
} // namespace mlx::core } // namespace mlx::core
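
The file above routes scaled dot-product attention either to a vector kernel or to a cuDNN frontend graph cached in an LRU keyed on device, dtype, Q/K/V shapes and strides, and the causal flag. For orientation, here is a single-head CPU reference of the computation itself, O = softmax(scale * Q K^T, optionally causal) V; it is a sketch with illustrative names, not MLX's kernel or the cuDNN path.

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

// Q is Lq x D, K and V are Lk x D, O is Lq x D, all row-major.
void sdpa_ref(const std::vector<float>& Q, const std::vector<float>& K,
              const std::vector<float>& V, int Lq, int Lk, int D,
              float scale, bool causal, std::vector<float>& O) {
  std::vector<float> row(Lk);
  for (int i = 0; i < Lq; ++i) {
    float maxv = -INFINITY;
    for (int j = 0; j < Lk; ++j) {
      float s = 0.0f;
      for (int d = 0; d < D; ++d) s += Q[i * D + d] * K[j * D + d];
      s *= scale;
      if (causal && j > i + (Lk - Lq)) s = -INFINITY; // bottom-right aligned mask
      row[j] = s;
      maxv = std::max(maxv, s);
    }
    float denom = 0.0f;
    for (int j = 0; j < Lk; ++j) {
      row[j] = std::exp(row[j] - maxv); // stable softmax
      denom += row[j];
    }
    for (int d = 0; d < D; ++d) {
      float acc = 0.0f;
      for (int j = 0; j < Lk; ++j) acc += (row[j] / denom) * V[j * D + d];
      O[i * D + d] = acc;
    }
  }
}

int main() {
  int Lq = 2, Lk = 2, D = 2;
  std::vector<float> Q(Lq * D, 1.0f), K(Lk * D, 1.0f), V{1, 2, 3, 4}, O(Lq * D);
  sdpa_ref(Q, K, V, Lq, Lk, D, 1.0f / std::sqrt((float)D), true, O);
  std::printf("%f %f\n", O[0], O[1]);
  return 0;
}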

View File

@@ -561,9 +561,10 @@ void sdpa_vector_2pass_fallback(
array sums(intermediate_shape, float32, nullptr, {}); array sums(intermediate_shape, float32, nullptr, {});
array maxs(std::move(intermediate_shape), float32, nullptr, {}); array maxs(std::move(intermediate_shape), float32, nullptr, {});
intermediate.set_data(cu::malloc_async(intermediate.nbytes(), encoder)); intermediate.set_data(
sums.set_data(cu::malloc_async(sums.nbytes(), encoder)); cu::malloc_async(intermediate.nbytes(), encoder.stream()));
maxs.set_data(cu::malloc_async(maxs.nbytes(), encoder)); sums.set_data(cu::malloc_async(sums.nbytes(), encoder.stream()));
maxs.set_data(cu::malloc_async(maxs.nbytes(), encoder.stream()));
encoder.add_temporary(intermediate); encoder.add_temporary(intermediate);
encoder.add_temporary(sums); encoder.add_temporary(sums);
@@ -664,12 +665,7 @@ bool supports_sdpa_vector(
const array& v, const array& v,
bool has_mask, bool has_mask,
bool has_arr_mask, bool has_arr_mask,
bool do_causal, bool do_causal) {
bool output_logsumexp) {
if (output_logsumexp) {
return false;
}
const int value_head_dim = v.shape(-1); const int value_head_dim = v.shape(-1);
const int query_head_dim = q.shape(-1); const int query_head_dim = q.shape(-1);
const int query_sequence_length = q.shape(2); const int query_sequence_length = q.shape(2);
@@ -773,7 +769,7 @@ void sdpa_vector(
}; };
o.set_data( o.set_data(
cu::malloc_async(o.nbytes(), encoder), cu::malloc_async(o.nbytes(), encoder.stream()),
o.size(), o.size(),
{str_oB, str_oH, str_oL, str_oD}, {str_oB, str_oH, str_oL, str_oD},
flags); flags);
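
The 2-pass fallback above keeps, per block, a partial output plus running `sums` and `maxs` so the attention softmax can be merged across blocks without overflow. A tiny sketch of that merge rule (rescale both partials to a common max, then add) follows; the struct and values are illustrative.

#include <algorithm>
#include <cmath>
#include <cstdio>

struct Partial {
  float acc;  // sum of exp(score - maxs) * value
  float sum;  // sum of exp(score - maxs)
  float maxs; // running max of the scores seen by this block
};

Partial merge(Partial a, Partial b) {
  float m = std::max(a.maxs, b.maxs);
  float fa = std::exp(a.maxs - m), fb = std::exp(b.maxs - m);
  return {a.acc * fa + b.acc * fb, a.sum * fa + b.sum * fb, m};
}

int main() {
  // Two blocks that each saw one (score, value) pair: (2, 1) and (4, 3).
  Partial a{std::exp(0.0f) * 1.0f, std::exp(0.0f), 2.0f};
  Partial b{std::exp(0.0f) * 3.0f, std::exp(0.0f), 4.0f};
  Partial m = merge(a, b);
  std::printf("softmax-weighted value = %f\n", m.acc / m.sum);
  return 0;
}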

View File

@@ -374,7 +374,7 @@ void Scan::eval_gpu(const std::vector<array>& inputs, array& out) {
out.copy_shared_buffer(in); out.copy_shared_buffer(in);
} else { } else {
out.set_data( out.set_data(
cu::malloc_async(in.data_size() * out.itemsize(), encoder), cu::malloc_async(in.data_size() * out.itemsize(), encoder.stream()),
in.data_size(), in.data_size(),
in.strides(), in.strides(),
in.flags()); in.flags());

View File

@@ -24,7 +24,7 @@ void concatenate_gpu(
std::partial_sum(sizes.cbegin(), sizes.cend(), sizes.begin()); std::partial_sum(sizes.cbegin(), sizes.cend(), sizes.begin());
auto& encoder = cu::get_command_encoder(s); auto& encoder = cu::get_command_encoder(s);
out.set_data(cu::malloc_async(out.nbytes(), encoder)); out.set_data(cu::malloc_async(out.nbytes(), encoder.stream()));
auto strides = out.strides(); auto strides = out.strides();
auto flags = out.flags(); auto flags = out.flags();
@@ -89,7 +89,7 @@ array compute_dynamic_offset(
if (donate) { if (donate) {
offset.copy_shared_buffer(indices); offset.copy_shared_buffer(indices);
} else { } else {
offset.set_data(cu::malloc_async(offset.itemsize(), encoder)); offset.set_data(cu::malloc_async(offset.itemsize(), encoder.stream()));
} }
encoder.add_temporary(offset); encoder.add_temporary(offset);

View File

@@ -118,7 +118,7 @@ void Softmax::eval_gpu(const std::vector<array>& inputs, array& out) {
out.copy_shared_buffer(x); out.copy_shared_buffer(x);
} else { } else {
out.set_data( out.set_data(
cu::malloc_async(x.data_size() * x.itemsize(), encoder), cu::malloc_async(x.data_size() * x.itemsize(), encoder.stream()),
x.data_size(), x.data_size(),
x.strides(), x.strides(),
x.flags()); x.flags());

View File

@@ -49,12 +49,14 @@ void gpu_sort(const Stream& s, array in, array& out_, int axis, bool argsort) {
array trans = swapaxes_in_eval(in, axis, last_dim); array trans = swapaxes_in_eval(in, axis, last_dim);
in = contiguous_copy_gpu(trans, s); in = contiguous_copy_gpu(trans, s);
encoder.add_temporary(in); encoder.add_temporary(in);
out = out = array(
array(cu::malloc_async(out.nbytes(), encoder), in.shape(), out.dtype()); cu::malloc_async(out.nbytes(), encoder.stream()),
in.shape(),
out.dtype());
encoder.add_temporary(out); encoder.add_temporary(out);
} else { } else {
out.set_data( out.set_data(
cu::malloc_async(in.data_size() * out.itemsize(), encoder), cu::malloc_async(in.data_size() * out.itemsize(), encoder.stream()),
in.data_size(), in.data_size(),
in.strides(), in.strides(),
in.flags()); in.flags());
@@ -72,13 +74,17 @@ void gpu_sort(const Stream& s, array in, array& out_, int axis, bool argsort) {
if (argsort) { if (argsort) {
// Indices in the sorted dimension. // Indices in the sorted dimension.
array indices( array indices(
cu::malloc_async(out.nbytes(), encoder), in.shape(), out.dtype()); cu::malloc_async(out.nbytes(), encoder.stream()),
in.shape(),
out.dtype());
encoder.add_temporary(indices); encoder.add_temporary(indices);
// In argsort though we don't need the result of sorted values, the // In argsort though we don't need the result of sorted values, the
// API requires us to provide an array to store it. // API requires us to provide an array to store it.
array discard( array discard(
cu::malloc_async(in.nbytes(), encoder), in.shape(), in.dtype()); cu::malloc_async(in.nbytes(), encoder.stream()),
in.shape(),
in.dtype());
encoder.add_temporary(discard); encoder.add_temporary(discard);
size_t size; size_t size;
@@ -98,7 +104,9 @@ void gpu_sort(const Stream& s, array in, array& out_, int axis, bool argsort) {
stream)); stream));
array temp( array temp(
cu::malloc_async(size, encoder), {static_cast<int>(size)}, uint8); cu::malloc_async(size, encoder.stream()),
{static_cast<int>(size)},
uint8);
encoder.add_temporary(temp); encoder.add_temporary(temp);
// Start capturing after allocations // Start capturing after allocations
@@ -140,7 +148,9 @@ void gpu_sort(const Stream& s, array in, array& out_, int axis, bool argsort) {
stream)); stream));
array temp( array temp(
cu::malloc_async(size, encoder), {static_cast<int>(size)}, uint8); cu::malloc_async(size, encoder.stream()),
{static_cast<int>(size)},
uint8);
encoder.add_temporary(temp); encoder.add_temporary(temp);
// Start capturing after allocations // Start capturing after allocations
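
The argsort path above allocates an index array alongside a throwaway value buffer because the segmented pair sort orders (value, index) pairs and also needs a scratch `temp` buffer whose size is queried up front. The same idea on the CPU, for a single segment, looks like this (illustrative, not the CUB call):

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <numeric>
#include <vector>

std::vector<uint32_t> argsort(const std::vector<float>& values) {
  std::vector<uint32_t> indices(values.size());
  std::iota(indices.begin(), indices.end(), 0u); // indices in the sorted dimension
  std::stable_sort(indices.begin(), indices.end(), [&](uint32_t a, uint32_t b) {
    return values[a] < values[b];
  });
  return indices;
}

int main() {
  std::vector<float> v{3.0f, 1.0f, 2.0f};
  for (uint32_t i : argsort(v)) std::printf("%u ", i); // 1 2 0
  std::printf("\n");
  return 0;
}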

View File

@@ -3,10 +3,31 @@
#pragma once #pragma once
#include "mlx/backend/cuda/steel/utils.cuh" #include "mlx/backend/cuda/steel/utils.cuh"
#include "mlx/backend/cuda/vector_types.cuh"
namespace mlx::core::cu { namespace mlx::core::cu {
// Map types to their vector of 2 type float -> float2, double -> double2 etc
template <typename T>
struct Vector2;
template <>
struct Vector2<double> {
using type = double2;
};
template <>
struct Vector2<float> {
using type = float2;
};
template <>
struct Vector2<__half> {
using type = __half2;
};
template <>
struct Vector2<__nv_bfloat16> {
using type = __nv_bfloat162;
};
template <typename T>
using Vector2_t = typename Vector2<T>::type;
/** /**
* The basic building block for Ampere mmas. A 16x16 tile distributed across * The basic building block for Ampere mmas. A 16x16 tile distributed across
* the warp. * the warp.
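
The Vector2 trait moved into this header maps a scalar type to its packed two-lane CUDA type (float to float2, __half to __half2, and so on) so templated kernels can pick the paired type generically. A minimal stand-alone illustration of the same trait pattern, with made-up pair structs standing in for the CUDA vector types:

#include <cstdio>
#include <type_traits>

struct float2_sim { float x, y; };
struct double2_sim { double x, y; };

template <typename T> struct Vector2;
template <> struct Vector2<float>  { using type = float2_sim; };
template <> struct Vector2<double> { using type = double2_sim; };

template <typename T>
using Vector2_t = typename Vector2<T>::type;

int main() {
  Vector2_t<float> v{1.0f, 2.0f}; // resolves to float2_sim
  static_assert(std::is_same_v<Vector2_t<double>, double2_sim>);
  std::printf("%f %f\n", v.x, v.y);
  return 0;
}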

View File

@@ -257,8 +257,9 @@ void ternary_op_gpu(
auto& c = inputs[2]; auto& c = inputs[2];
auto topt = get_ternary_op_type(a, b, c); auto topt = get_ternary_op_type(a, b, c);
auto& encoder = cu::get_command_encoder(s); auto& encoder = cu::get_command_encoder(s);
set_ternary_op_output_data( set_ternary_op_output_data(a, b, c, out, topt, [&](auto n) {
a, b, c, out, topt, [&](auto n) { return cu::malloc_async(n, encoder); }); return cu::malloc_async(n, encoder.stream());
});
ternary_op_gpu_inplace<Op>(inputs, out, s); ternary_op_gpu_inplace<Op>(inputs, out, s);
} }

View File

@@ -208,8 +208,9 @@ void unary_op_gpu(
const char* op, const char* op,
const Stream& s) { const Stream& s) {
auto& encoder = cu::get_command_encoder(s); auto& encoder = cu::get_command_encoder(s);
set_unary_output_data( set_unary_output_data(inputs[0], out, [&](auto n) {
inputs[0], out, [&](auto n) { return cu::malloc_async(n, encoder); }); return cu::malloc_async(n, encoder.stream());
});
unary_op_gpu_inplace<Op>(inputs, out, op, s); unary_op_gpu_inplace<Op>(inputs, out, op, s);
} }

View File

@@ -5,7 +5,6 @@
#include "mlx/dtype_utils.h" #include "mlx/dtype_utils.h"
#include <fmt/format.h> #include <fmt/format.h>
#include <vector>
namespace mlx::core { namespace mlx::core {
@@ -32,13 +31,6 @@ void check_cuda_error(const char* name, CUresult err) {
} }
} }
void check_cudnn_error(const char* name, cudnnStatus_t err) {
if (err != CUDNN_STATUS_SUCCESS) {
throw std::runtime_error(
fmt::format("{} failed: {}.", name, cudnnGetErrorString(err)));
}
}
const char* dtype_to_cuda_type(const Dtype& dtype) { const char* dtype_to_cuda_type(const Dtype& dtype) {
switch (dtype) { switch (dtype) {
case bool_: case bool_:
@@ -68,7 +60,7 @@ const char* dtype_to_cuda_type(const Dtype& dtype) {
case float64: case float64:
return "double"; return "double";
case complex64: case complex64:
return "mlx::core::cu::complex64_t"; return "complex64_t";
default: default:
return "unknown"; return "unknown";
} }
@@ -80,6 +72,7 @@ CudaGraph::CudaGraph(cu::Device& device) {
} }
void CudaGraph::end_capture(cudaStream_t stream) { void CudaGraph::end_capture(cudaStream_t stream) {
assert(handle_ == nullptr);
CHECK_CUDA_ERROR(cudaStreamEndCapture(stream, &handle_)); CHECK_CUDA_ERROR(cudaStreamEndCapture(stream, &handle_));
} }
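
check_cuda_error above (and the removed check_cudnn_error) turn a C-style status code into a std::runtime_error that names the failing call. The general pattern, sketched with a made-up status enum rather than the CUDA or cuDNN APIs:

#include <cstdio>
#include <stdexcept>
#include <string>

enum Status { OK = 0, OUT_OF_MEMORY = 2 };

const char* status_string(Status s) {
  return s == OK ? "ok" : "out of memory";
}

void check_status(const char* name, Status s) {
  if (s != OK) {
    throw std::runtime_error(
        std::string(name) + " failed: " + status_string(s) + ".");
  }
}

// Stringify the call so the error message names it.
#define CHECK_STATUS(cmd) check_status(#cmd, (cmd))

Status fake_alloc() { return OUT_OF_MEMORY; }

int main() {
  try {
    CHECK_STATUS(fake_alloc());
  } catch (const std::exception& e) {
    std::printf("%s\n", e.what());
  }
  return 0;
}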

View File

@@ -31,10 +31,8 @@ inline T* gpu_ptr(array& arr) {
arr.offset()); arr.offset());
} }
// For const array, keep constness in pointer unless it is untyped.
template <typename T> template <typename T>
inline std::conditional_t<std::is_same_v<T, void>, void*, const T*> gpu_ptr( inline const T* gpu_ptr(const array& arr) {
const array& arr) {
return gpu_ptr<T>(const_cast<array&>(arr)); return gpu_ptr<T>(const_cast<array&>(arr));
} }
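
The gpu_ptr change above uses std::conditional_t so that a const array yields a const T* for typed access but a plain void* when the caller asks for an untyped pointer. A small self-contained illustration of that return-type trick, with a hypothetical Buffer type and only the float and void instantiations used below:

#include <cstdio>
#include <type_traits>

struct Buffer { float data[4] = {1, 2, 3, 4}; };

// const T* for typed access, void* (no constness) for untyped access.
template <typename T>
std::conditional_t<std::is_same_v<T, void>, void*, const T*> ptr(const Buffer& b) {
  return const_cast<float*>(b.data); // only meant for T = float or void here
}

int main() {
  const float* p = ptr<float>(Buffer{}); // const-qualified typed pointer
  Buffer b;
  void* raw = ptr<void>(b);              // untyped pointer drops constness
  std::printf("%f %p\n", ptr<float>(b)[1], raw);
  (void)p;
  return 0;
}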

View File

@@ -1,48 +0,0 @@
// Copyright © 2025 Apple Inc.
#pragma once
#include <cuda_bf16.h>
#include <cuda_fp16.h>
namespace mlx::core::cu {
template <typename T>
struct Vector2;
template <>
struct Vector2<double> {
using type = double2;
};
template <>
struct Vector2<float> {
using type = float2;
};
template <>
struct Vector2<__half> {
using type = __half2;
};
template <>
struct Vector2<__nv_bfloat16> {
using type = __nv_bfloat162;
};
template <typename T>
using Vector2_t = typename Vector2<T>::type;
template <typename T>
struct Vector4 {
T x, y, z, w;
};
template <typename T>
using Vector4_t = Vector4<T>;
using bf16x4 = Vector4_t<__nv_bfloat16>;
using fp16x4 = Vector4_t<__half>;
using fp32x4 = Vector4_t<float>;
} // namespace mlx::core::cu

View File

@@ -7,6 +7,8 @@
namespace mlx::core { namespace mlx::core {
void copy_gpu(const array& in, array& out, CopyType ctype, const Stream& s);
void copy_gpu(const array& in, array& out, CopyType ctype) { void copy_gpu(const array& in, array& out, CopyType ctype) {
copy_gpu(in, out, ctype, out.primitive().stream()); copy_gpu(in, out, ctype, out.primitive().stream());
} }

View File

@@ -28,7 +28,6 @@ make_jit_source(binary_ops)
make_jit_source(ternary_ops) make_jit_source(ternary_ops)
make_jit_source(reduce_utils kernels/atomic.h kernels/reduction/ops.h) make_jit_source(reduce_utils kernels/atomic.h kernels/reduction/ops.h)
make_jit_source(indexing/scatter kernels/indexing/indexing.h) make_jit_source(indexing/scatter kernels/indexing/indexing.h)
make_jit_source(indexing/masked_scatter)
make_jit_source(indexing/gather kernels/indexing/indexing.h) make_jit_source(indexing/gather kernels/indexing/indexing.h)
make_jit_source(indexing/gather_front kernels/indexing/indexing.h) make_jit_source(indexing/gather_front kernels/indexing/indexing.h)
make_jit_source(indexing/gather_axis) make_jit_source(indexing/gather_axis)

View File

@@ -149,9 +149,7 @@ Buffer MetalAllocator::malloc(size_t size) {
buf = device_->newBuffer(size, resource_options); buf = device_->newBuffer(size, resource_options);
} }
if (!buf) { if (!buf) {
std::ostringstream msg; return Buffer{nullptr};
msg << "[malloc] Unable to allocate " << size << " bytes.";
throw std::runtime_error(msg.str());
} }
lk.lock(); lk.lock();
num_resources_++; num_resources_++;
@@ -203,32 +201,6 @@ size_t MetalAllocator::size(Buffer buffer) const {
return static_cast<MTL::Buffer*>(buffer.ptr())->length(); return static_cast<MTL::Buffer*>(buffer.ptr())->length();
} }
Buffer MetalAllocator::make_buffer(void* ptr, size_t size) {
auto buf = device_->newBuffer(ptr, size, resource_options, nullptr);
if (!buf) {
return Buffer{nullptr};
}
std::unique_lock lk(mutex_);
residency_set_.insert(buf);
active_memory_ += buf->length();
peak_memory_ = std::max(peak_memory_, active_memory_);
num_resources_++;
return Buffer{static_cast<void*>(buf)};
}
void MetalAllocator::release(Buffer buffer) {
auto buf = static_cast<MTL::Buffer*>(buffer.ptr());
if (buf == nullptr) {
return;
}
std::unique_lock lk(mutex_);
active_memory_ -= buf->length();
num_resources_--;
lk.unlock();
auto pool = metal::new_scoped_memory_pool();
buf->release();
}
MetalAllocator& allocator() { MetalAllocator& allocator() {
// By creating the |allocator_| on heap, the destructor of MetalAllocator // By creating the |allocator_| on heap, the destructor of MetalAllocator
// will not be called on exit and buffers in the cache will be leaked. This // will not be called on exit and buffers in the cache will be leaked. This

View File

@@ -21,9 +21,6 @@ class MetalAllocator : public allocator::Allocator {
virtual Buffer malloc(size_t size) override; virtual Buffer malloc(size_t size) override;
virtual void free(Buffer buffer) override; virtual void free(Buffer buffer) override;
virtual size_t size(Buffer buffer) const override; virtual size_t size(Buffer buffer) const override;
virtual Buffer make_buffer(void* ptr, size_t size) override;
virtual void release(Buffer buffer) override;
size_t get_active_memory() { size_t get_active_memory() {
return active_memory_; return active_memory_;
}; };

View File

@@ -382,8 +382,11 @@ MTL::CommandQueue* Device::get_queue(Stream stream) {
bool Device::command_buffer_needs_commit(int index) { bool Device::command_buffer_needs_commit(int index) {
auto& stream = get_stream_(index); auto& stream = get_stream_(index);
return (stream.buffer_ops > max_ops_per_buffer_) || if (stream.buffer_ops > max_ops_per_buffer_ ||
((stream.buffer_sizes >> 20) > max_mb_per_buffer_); (stream.buffer_sizes >> 20) > max_mb_per_buffer_) {
return true;
}
return false;
} }
MTL::CommandBuffer* Device::get_command_buffer(int index) { MTL::CommandBuffer* Device::get_command_buffer(int index) {

View File

@@ -265,19 +265,4 @@ Device& device(mlx::core::Device);
std::unique_ptr<void, std::function<void(void*)>> new_scoped_memory_pool(); std::unique_ptr<void, std::function<void(void*)>> new_scoped_memory_pool();
inline bool is_nax_available() {
auto _check_nax = []() {
bool can_use_nax = false;
if (__builtin_available(
macOS 26.2, iOS 26.2, tvOS 26.2, visionOS 26.2, *)) {
can_use_nax = true;
}
can_use_nax &=
metal::device(mlx::core::Device::gpu).get_architecture_gen() >= 17;
return can_use_nax;
};
static bool is_nax_available_ = _check_nax();
return is_nax_available_;
}
} // namespace mlx::core::metal } // namespace mlx::core::metal

View File

@@ -1,5 +1,4 @@
// Copyright © 2023-2024 Apple Inc. // Copyright © 2023-2024 Apple Inc.
#include <fmt/format.h> #include <fmt/format.h>
#include "mlx/backend/common/compiled.h" #include "mlx/backend/common/compiled.h"
@@ -9,9 +8,7 @@
#include "mlx/backend/metal/jit/includes.h" #include "mlx/backend/metal/jit/includes.h"
#include "mlx/backend/metal/jit/indexing.h" #include "mlx/backend/metal/jit/indexing.h"
#include "mlx/backend/metal/kernels.h" #include "mlx/backend/metal/kernels.h"
#include "mlx/backend/metal/scan.h"
#include "mlx/backend/metal/utils.h" #include "mlx/backend/metal/utils.h"
#include "mlx/dtype.h"
#include "mlx/primitives.h" #include "mlx/primitives.h"
#include "mlx/utils.h" #include "mlx/utils.h"
@@ -644,84 +641,4 @@ void ScatterAxis::eval_gpu(const std::vector<array>& inputs, array& out) {
compute_encoder.dispatch_threads(grid_dims, group_dims); compute_encoder.dispatch_threads(grid_dims, group_dims);
} }
void MaskedScatter::eval_gpu(const std::vector<array>& inputs, array& out) {
const array& dst = inputs[0];
const array& mask = inputs[1];
const array& src = inputs[2];
auto& s = stream();
auto& d = metal::device(s.device);
const size_t total = mask.size();
const CopyType ct = (total == 1)
? CopyType::Scalar
: (dst.flags().row_contiguous ? CopyType::Vector : CopyType::General);
copy_gpu(dst, out, ct, s);
if (total == 0) {
return;
}
array mask_flat = flatten_in_eval(mask, 1, -1, s);
if (mask_flat.data<void>() != mask.data<void>()) {
d.add_temporary(mask_flat, s.index);
}
if (!mask_flat.flags().row_contiguous) {
mask_flat = contiguous_copy_gpu(mask_flat, s);
d.add_temporary(mask_flat, s.index);
}
// Prefix (exclusive) of mask → scatter_offsets
array scatter_offsets(mask_flat.shape(), uint32, nullptr, {});
scatter_offsets.set_data(allocator::malloc(scatter_offsets.nbytes()));
d.add_temporary(scatter_offsets, s.index);
scan_gpu_inplace(
mask_flat,
scatter_offsets,
Scan::Sum,
/*axis=*/1,
/*reverse=*/false,
/*inclusive=*/false,
s);
// Kernel selection/build
static constexpr std::string_view kBaseName = "masked_assign";
const std::string dtype_tag = type_to_name(out.dtype());
const std::string value_type = get_type_string(out.dtype());
const std::string contiguous =
(src.flags().row_contiguous) ? "true" : "false";
const std::string kernel_name =
fmt::format("{}_{}_{}", kBaseName, dtype_tag, contiguous);
auto lib = d.get_library(kernel_name, [&]() {
std::string source = metal::utils();
source += metal::masked_scatter();
source += fmt::format(
std::string(masked_assign_kernel), kernel_name, value_type, contiguous);
return source;
});
auto kernel = d.get_kernel(kernel_name, lib);
// Binding
int bind_idx = 0;
const int ndim = static_cast<int>(src.ndim());
auto& compute_encoder = d.get_command_encoder(s.index);
compute_encoder.set_compute_pipeline_state(kernel);
compute_encoder.set_input_array(mask_flat, bind_idx++);
compute_encoder.set_input_array(scatter_offsets, bind_idx++);
compute_encoder.set_input_array(src, bind_idx++);
compute_encoder.set_output_array(out, bind_idx++);
compute_encoder.set_vector_bytes(src.shape(), bind_idx++);
compute_encoder.set_vector_bytes(src.strides(), bind_idx++);
compute_encoder.set_bytes(ndim, bind_idx++);
compute_encoder.set_bytes(src.size() / src.shape(0), bind_idx++);
compute_encoder.set_bytes(mask_flat.size() / mask.shape(0), bind_idx++);
// Dispatch
auto group_dims = get_block_dims(total, 1, 1);
MTL::Size grid_dims(total, 1, 1);
compute_encoder.dispatch_threads(grid_dims, group_dims);
}
} // namespace mlx::core } // namespace mlx::core
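
The removed MaskedScatter path above copies the destination, runs an exclusive prefix sum (scan) over the flattened boolean mask to get a source offset for every true position, and then launches a masked-assign kernel. The same algorithm on the CPU, as a sketch with simplified shapes and no batching:

#include <cstdio>
#include <vector>

void masked_scatter(std::vector<float>& out, const std::vector<bool>& mask,
                    const std::vector<float>& src) {
  // Exclusive scan of the mask: offsets[i] counts true entries before i.
  std::vector<unsigned> offsets(mask.size(), 0);
  unsigned running = 0;
  for (size_t i = 0; i < mask.size(); ++i) {
    offsets[i] = running;
    running += mask[i] ? 1u : 0u;
  }
  // Assign src[offsets[i]] wherever mask[i] is true.
  for (size_t i = 0; i < mask.size(); ++i) {
    if (mask[i] && offsets[i] < src.size()) {
      out[i] = src[offsets[i]];
    }
  }
}

int main() {
  std::vector<float> out{0, 0, 0, 0};
  masked_scatter(out, {true, false, true, true}, {10.0f, 20.0f, 30.0f});
  std::printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]); // 10 0 20 30
  return 0;
}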

View File

@@ -11,7 +11,6 @@ const char* ternary_ops();
const char* reduce_utils(); const char* reduce_utils();
const char* gather(); const char* gather();
const char* scatter(); const char* scatter();
const char* masked_scatter();
const char* arange(); const char* arange();
const char* unary(); const char* unary();

View File

@@ -70,7 +70,3 @@ constexpr std::string_view scatter_kernels = R"(
gid); gid);
}} }}
)"; )";
constexpr std::string_view masked_assign_kernel = R"(
template [[host_name("{0}")]] [[kernel]] decltype(masked_assign_impl<{1}, {2}>) masked_assign_impl<{1}, {2}>;
)";

View File

@@ -9,14 +9,7 @@ set(BASE_HEADERS
utils.h) utils.h)
function(build_kernel_base TARGET SRCFILE DEPS) function(build_kernel_base TARGET SRCFILE DEPS)
set(METAL_FLAGS set(METAL_FLAGS -Wall -Wextra -fno-fast-math -Wno-c++17-extensions)
-x
metal
-Wall
-Wextra
-fno-fast-math
-Wno-c++17-extensions
-Wno-c++20-extensions)
if(MLX_METAL_DEBUG) if(MLX_METAL_DEBUG)
set(METAL_FLAGS ${METAL_FLAGS} -gline-tables-only -frecord-sources) set(METAL_FLAGS ${METAL_FLAGS} -gline-tables-only -frecord-sources)
endif() endif()
@@ -127,30 +120,6 @@ if(NOT MLX_METAL_JIT)
build_kernel(gemv_masked steel/utils.h) build_kernel(gemv_masked steel/utils.h)
endif() endif()
if((MLX_METAL_VERSION GREATER_EQUAL 400) AND (MACOS_SDK_VERSION GREATER_EQUAL
26.2))
set(STEEL_NAX_HEADERS
steel/defines.h
steel/utils.h
steel/gemm/transforms.h
steel/gemm/nax.h
steel/gemm/gemm_nax.h
steel/utils/type_traits.h
steel/utils/integral_constant.h)
build_kernel(steel/gemm/kernels/steel_gemm_fused_nax ${STEEL_NAX_HEADERS})
build_kernel(steel/gemm/kernels/steel_gemm_gather_nax ${STEEL_NAX_HEADERS})
build_kernel(quantized_nax quantized_nax.h ${STEEL_NAX_HEADERS})
build_kernel(fp_quantized_nax fp_quantized_nax.h ${STEEL_NAX_HEADERS})
set(STEEL_NAX_ATTN_HEADERS
steel/defines.h steel/utils.h steel/attn/nax.h steel/utils/type_traits.h
steel/utils/integral_constant.h)
build_kernel(steel/attn/kernels/steel_attention_nax ${STEEL_NAX_ATTN_HEADERS})
endif()
add_custom_command( add_custom_command(
OUTPUT ${MLX_METAL_PATH}/mlx.metallib OUTPUT ${MLX_METAL_PATH}/mlx.metallib
COMMAND xcrun -sdk macosx metallib ${KERNEL_AIR} -o COMMAND xcrun -sdk macosx metallib ${KERNEL_AIR} -o

File diff suppressed because it is too large.

View File

@@ -1,74 +0,0 @@
// Copyright © 2025 Apple Inc.
// clang-format off
#include "mlx/backend/metal/kernels/utils.h"
#include "mlx/backend/metal/kernels/steel/gemm/gemm.h"
#include "mlx/backend/metal/kernels/quantized_utils.h"
#include "mlx/backend/metal/kernels/steel/gemm/nax.h"
#include "mlx/backend/metal/kernels/fp_quantized_nax.h"
#define instantiate_quantized_batched(mode, name, type, bm, bn, bk, wm, wn, batched) \
instantiate_kernel( \
#mode "_" #name "_" #type "_gs_32_b_4_bm" #bm "_bn" #bn "_bk" #bk "_wm" #wm "_wn" #wn "_batch_" #batched, \
fp_ ## name, \
type, \
32, \
4, \
batched)
#define instantiate_quantized_aligned(mode, name, type, bm, bn, bk, wm, wn, aligned) \
instantiate_kernel( \
#mode "_" #name "_" #type "_gs_32_b_4_bm" #bm "_bn" #bn "_bk" #bk "_wm" #wm "_wn" #wn "_alN_" #aligned, \
fp_ ## name, \
type, \
32, \
4, \
aligned)
#define instantiate_quantized_aligned_batched(mode, name, type, bm, bn, bk, wm, wn, aligned, batched) \
instantiate_kernel( \
#mode "_" #name "_" #type "_gs_32_b_4_bm" #bm "_bn" #bn "_bk" #bk "_wm" #wm "_wn" #wn "_alN_" #aligned "_batch_" #batched, \
fp_ ## name, \
type, \
32, \
4, \
aligned, \
batched)
#define instantiate_gather_qmm_rhs(func, name, type, bm, bn, bk, wm, wn, transpose) \
instantiate_kernel( \
#name "_" #type "_gs_32_b_4_bm_" #bm "_bn_" #bn "_bk_" #bk "_wm_" #wm "_wn_" #wn, \
func, \
type, \
32, \
4, \
bm, \
bn, \
bk, \
wm, \
wn, \
transpose)
#define instantiate_quantized_all_aligned(type) \
instantiate_quantized_aligned(mxfp4, gather_qmm_t_nax, type, 64, 64, 64, 2, 2, true) \
instantiate_quantized_aligned(mxfp4, gather_qmm_t_nax, type, 64, 64, 64, 2, 2, false) \
instantiate_quantized_aligned_batched(mxfp4, qmm_t_nax, type, 64, 64, 64, 2, 2, true, 1) \
instantiate_quantized_aligned_batched(mxfp4, qmm_t_nax, type, 64, 64, 64, 2, 2, true, 0) \
instantiate_quantized_aligned_batched(mxfp4, qmm_t_nax, type, 64, 64, 64, 2, 2, false, 1) \
instantiate_quantized_aligned_batched(mxfp4, qmm_t_nax, type, 64, 64, 64, 2, 2, false, 0)
#define instantiate_quantized_all_rhs(type) \
instantiate_gather_qmm_rhs(fp_gather_qmm_rhs_nax, mxfp4_gather_qmm_rhs_nax_nt, type, 64, 64, 64, 2, 2, true) \
instantiate_gather_qmm_rhs(fp_gather_qmm_rhs_nax, mxfp4_gather_qmm_rhs_nax_nn, type, 64, 64, 64, 2, 2, false)
#define instantiate_quantized_types(type) \
instantiate_quantized_all_aligned(type) \
instantiate_quantized_all_rhs(type)
instantiate_quantized_types(float)
instantiate_quantized_types(bfloat16_t)
instantiate_quantized_types(float16_t)
// clang-format on

View File

@@ -1,38 +0,0 @@
// Copyright © 2025 Apple Inc.
#pragma once
template <typename T, bool src_contiguous>
[[kernel]] void masked_assign_impl(
const device bool* mask [[buffer(0)]],
const device uint* scatter_offsets [[buffer(1)]],
const device T* src [[buffer(2)]],
device T* out [[buffer(3)]],
const constant int* src_shapes [[buffer(4)]],
const constant int64_t* src_strides [[buffer(5)]],
const constant int& src_ndim [[buffer(6)]],
const constant int64_t& src_batch_size [[buffer(7)]],
const constant int64_t& mask_batch_size [[buffer(8)]],
uint idx [[thread_position_in_grid]]) {
const bool mask_value = mask[idx];
if (!mask_value) {
return;
}
const uint src_index = scatter_offsets[idx];
if (src_index >= src_batch_size) {
return;
}
const uint batch_idx = idx / mask_batch_size;
if (src_contiguous) {
out[idx] = src[batch_idx * src_batch_size + src_index];
} else {
out[idx] = src[elem_to_loc<uint>(
batch_idx * src_batch_size + src_index,
src_shapes,
src_strides,
src_ndim)];
}
}

Some files were not shown because too many files have changed in this diff.