CMakeLists.txt update

WIP (python)
WIP
2025-12-16 01:49:05 +08:00 · 2025-10-31 16:55:04 -07:00 · 2025-10-31 16:24:51 -07:00 · 2025-10-31 16:24:35 -07:00 · 2025-10-31 16:24:21 -07:00 · 2025-10-31 16:24:09 -07:00
312 changed files with 4880 additions and 17285 deletions
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -0,0 +1,579 @@
+version: 2.1
+
+orbs:
+  apple: ml-explore/pr-approval@0.1.0
+
+parameters:
+  nightly_build:
+    type: boolean
+    default: false
+  test_release:
+    type: boolean
+    default: false
+
+jobs:
+  build_documentation:
+    parameters:
+      upload-docs:
+        type: boolean
+        default: false
+    macos:
+      xcode: "26.0.0"
+    resource_class: m4pro.medium
+    steps:
+      - checkout
+      - run:
+          name: Install
+          command: |
+            xcodebuild -downloadComponent MetalToolchain
+            brew install python@3.10
+            brew install doxygen
+            python3.10 -m venv env
+            source env/bin/activate
+            pip install --upgrade pip
+            pip install --upgrade cmake
+            pip install -r docs/requirements.txt
+            pip install . -v
+      - when:  # upload-docs is false: only build the docs, nothing is pushed
+          condition:
+            not: << parameters.upload-docs >>
+          steps:
+            - run:
+                name: Build documentation
+                command: |
+                  source env/bin/activate
+                  cd docs && doxygen && make html O=-W
+      - when:  # upload-docs is true: rebuild the docs on gh-pages and force-push
+          condition: << parameters.upload-docs >>
+          steps:
+            - add_ssh_keys:
+                fingerprints:
+                  - "SHA256:OhcVVMovbT0pkgMeiVRyxMnjV9R2t+hKBsNcuxq9h+0"
+            - run:
+                name: Upload documentation
+                command: |
+                  source env/bin/activate
+                  git config user.email "mlx@group.apple.com"
+                  git config user.name "CircleCI Docs"
+                  git checkout gh-pages
+                  git rebase main
+                  cd docs
+                  git rm -rf build/html
+                  doxygen && make html O=-W
+                  git add -f build/html
+                  git commit -m "rebase"
+                  git push -f origin gh-pages
+
+  linux_build_and_test:
+    machine:
+      image: ubuntu-2204:current
+      resource_class: large
+    steps:
+      - checkout
+      - run:
+          name: Run style checks
+          command: |
+            pip install pre-commit
+            pre-commit run --all
+            if ! git diff --quiet; then echo 'Style checks failed, please install pre-commit and run pre-commit run --all and push the change'; exit 1; fi
+      - run:  # system packages (BLAS/LAPACK, OpenMPI) plus the uv installer
+          name: Install dependencies
+          command: |
+            export DEBIAN_FRONTEND=noninteractive
+            export NEEDRESTART_MODE=a
+            sudo apt-get update
+            sudo apt-get install -y libblas-dev liblapack-dev liblapacke-dev
+            sudo apt-get install -y openmpi-bin openmpi-common libopenmpi-dev
+            curl -LsSf https://astral.sh/uv/install.sh | sh
+      - run:
+          name: Install Python package
+          command: |
+            uv venv
+            uv pip install cmake
+            DEBUG=1 CMAKE_ARGS="-DCMAKE_COMPILE_WARNING_AS_ERROR=ON" \
+              uv pip install -e ".[dev]" -v
+      - run:
+          name: Generate package stubs
+          command: |
+            uv pip install typing_extensions
+            uv run --no-project setup.py generate_stubs
+      - run:  # unit tests, MPI test, then ring test; a [WARN] on stderr fails the step
+          name: Run Python tests
+          command: |
+            source .venv/bin/activate
+            python -m unittest discover python/tests -v
+            mpirun --bind-to none -host localhost:8 -np 8 python python/tests/mpi_test_distributed.py
+            mlx.launch --verbose -n 8 python/tests/ring_test_distributed.py -v 2> >(tee -a stderr.log >&2)
+            if grep -Fq '[WARN]' stderr.log; then echo "Distributed ring test failed"; exit 1; fi
+      - run:
+          name: Build CPP only
+          command: |
+            source .venv/bin/activate
+            mkdir -p build && cd build
+            cmake .. -DMLX_BUILD_METAL=OFF -DCMAKE_BUILD_TYPE=DEBUG
+            make -j `nproc`
+      - run:
+          name: Run CPP tests
+          command: ./build/tests/tests
+
+  mac_build_and_test:
+    parameters:
+      xcode_version:
+        type: string
+        default: "26.0.0"
+      macosx_deployment_target:
+        type: string
+        default: ""
+    macos:
+      xcode: << parameters.xcode_version >>
+    environment:
+      MACOSX_DEPLOYMENT_TARGET: << parameters.macosx_deployment_target >>
+    resource_class: m4pro.medium
+    steps:
+      - checkout
+      - run:
+          name: Install dependencies
+          command: |
+            xcodebuild -downloadComponent MetalToolchain
+            HOMEBREW_NO_AUTO_UPDATE=1 HOMEBREW_NO_INSTALL_CLEANUP=1 \
+              brew install openmpi uv
+      - run:
+          name: Install Python package
+          command: |
+            uv venv --python 3.10
+            uv pip install \
+              nanobind==2.4.0 \
+              cmake \
+              numpy \
+              torch \
+              tensorflow \
+              unittest-xml-reporting
+            DEBUG=1 CMAKE_ARGS="-DCMAKE_COMPILE_WARNING_AS_ERROR=ON" \
+              uv pip install -e . -v
+      - run:
+          name: Generate package stubs
+          command: |
+            uv pip install typing_extensions
+            uv run --no-project setup.py generate_stubs
+      - run:  # CPU and GPU unit tests, MPI test, ring test; a [WARN] on stderr fails the step
+          name: Run Python tests
+          command: |
+            source .venv/bin/activate
+            LOW_MEMORY=1 DEVICE=cpu python -m xmlrunner discover -v python/tests -o test-results/cpu
+            LOW_MEMORY=1 DEVICE=gpu METAL_DEVICE_WRAPPER_TYPE=1 METAL_DEBUG_ERROR_MODE=0 python -m xmlrunner discover -v python/tests -o test-results/gpu
+            mpirun --bind-to none -host localhost:8 -np 8 -x DYLD_LIBRARY_PATH=/opt/homebrew/lib/ python python/tests/mpi_test_distributed.py
+            mlx.launch --verbose -n 8 python/tests/ring_test_distributed.py -v 2> >(tee -a stderr.log >&2)
+            if grep -Fq '[WARN]' stderr.log; then echo "Distributed ring test failed"; exit 1; fi
+      - run:
+          name: Build example extension
+          command: |
+            source .venv/bin/activate
+            cd examples/extensions
+            uv pip install -r requirements.txt
+            uv run --no-project setup.py build_ext --inplace
+            uv run --no-project python test.py
+      - store_test_results:
+          path: test-results
+      - run:
+          name: Build CPP only
+          command: |
+            source .venv/bin/activate
+            mkdir -p build && cd build && cmake .. && make -j `sysctl -n hw.ncpu`
+      - run:
+          name: Run CPP tests
+          command: |
+            DEVICE=gpu METAL_DEVICE_WRAPPER_TYPE=1 METAL_DEBUG_ERROR_MODE=0 ./build/tests/tests
+      - run:
+          name: Build small binary
+          command: |
+            source .venv/bin/activate
+            cd build/
+            cmake .. -DCMAKE_BUILD_TYPE=MinSizeRel \
+              -DBUILD_SHARED_LIBS=ON \
+              -DMLX_BUILD_CPU=OFF \
+              -DMLX_BUILD_SAFETENSORS=OFF \
+              -DMLX_BUILD_GGUF=OFF \
+              -DMLX_METAL_JIT=ON
+            make -j `sysctl -n hw.ncpu`
+      - run:
+          name: Run Python tests with JIT
+          command: |
+            CMAKE_ARGS="-DMLX_METAL_JIT=ON" \
+              uv pip install -e . -v
+            LOW_MEMORY=1 DEVICE=gpu METAL_DEVICE_WRAPPER_TYPE=1 \
+              METAL_DEBUG_ERROR_MODE=0 \
+              uv run --no-project python -m xmlrunner discover \
+                -v python/tests \
+                -o test-results/gpu_jit
+
+  cuda_build_and_test:
+    parameters:
+      image_date:
+        type: string
+        default: "2023.11.1"
+    machine:
+      image: "linux-cuda-12:<< parameters.image_date >>"
+      resource_class: gpu.nvidia.small.gen2
+    steps:
+      - checkout
+      - restore_cache:
+          keys:
+            - cuda-<< parameters.image_date >>-{{ arch }}-
+      - run:  # cuDNN, BLAS/LAPACK, NCCL, a pinned ccache binary, and the uv installer
+          name: Install dependencies
+          command: |
+            sudo apt-get update
+            sudo apt-get install -y libcudnn9-dev-cuda-12
+            sudo apt-get install -y libblas-dev liblapack-dev liblapacke-dev
+            sudo apt-get install -y libnccl2 libnccl-dev
+            curl -sL https://github.com/ccache/ccache/releases/download/v4.11.3/ccache-4.11.3-linux-x86_64.tar.xz | tar xJf -
+            sudo mv ccache-4.11.3-linux-x86_64/ccache /usr/bin/ccache
+            rm -rf ccache-4.11.3-linux-x86_64
+            curl -LsSf https://astral.sh/uv/install.sh | sh
+      - run:
+          name: Set CCache size
+          command: ccache --max-size 1G
+      - run:
+          name: Install Python package
+          command: |
+            uv venv
+            uv pip install cmake
+            DEBUG=1 CMAKE_ARGS="-DMLX_BUILD_CUDA=ON -DCMAKE_COMPILE_WARNING_AS_ERROR=ON -DCMAKE_CUDA_COMPILER=`which nvcc`" \
+              uv pip install -e ".[dev]" -v
+      - run:
+          name: Run Python tests
+          command: |
+            source .venv/bin/activate
+            LOW_MEMORY=1 DEVICE=cpu python -m unittest discover python/tests -v
+            LOW_MEMORY=1 DEVICE=gpu python -m tests discover python/tests -v
+      - run:
+          name: Build CPP only
+          command: |
+            source .venv/bin/activate
+            cmake . -B build \
+              -DMLX_BUILD_CUDA=ON \
+              -DCMAKE_CUDA_COMPILER=`which nvcc` \
+              -DCMAKE_BUILD_TYPE=DEBUG
+            cmake --build build -j `nproc`
+      - run:
+          name: Run CPP tests
+          command: ./build/tests/tests -sfe="*fft_tests.cpp,*linalg_tests.cpp"
+      - run:
+          name: CCache report
+          command: |
+            ccache --show-stats
+            ccache --zero-stats
+            ccache --cleanup
+      - save_cache:
+          key: cuda-<< parameters.image_date >>-{{ arch }}-{{ epoch }}
+          paths:
+            - /home/circleci/.cache/ccache
+
+  build_release:
+    parameters:
+      python_version:
+        type: string
+        default: "3.10"
+      xcode_version:
+        type: string
+        default: "26.0.0"
+      build_env:
+        type: string
+        default: ""
+      macosx_deployment_target:
+        type: string
+        default: ""
+    macos:
+      xcode: << parameters.xcode_version >>
+    resource_class: m4pro.medium
+    environment:
+      MACOSX_DEPLOYMENT_TARGET: << parameters.macosx_deployment_target >>
+    steps:
+      - checkout
+      - run:
+          name: Install dependencies
+          command: |
+            xcodebuild -downloadComponent MetalToolchain
+            mkdir -p ~/miniconda3
+            curl https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-arm64.sh -o ~/miniconda3/miniconda.sh
+            bash ~/miniconda3/miniconda.sh -b -u -p ~/miniconda3
+            rm ~/miniconda3/miniconda.sh
+            source ~/miniconda3/bin/activate
+            conda init --all
+            conda create -n env python=<< parameters.python_version >> -y
+            conda activate env
+            pip install --upgrade cmake
+            pip install nanobind==2.4.0
+            pip install --upgrade setuptools
+            pip install numpy
+            pip install twine
+            pip install build
+      - run:
+          name: Install Python package
+          command: |
+            conda activate env
+            env -u MACOSX_DEPLOYMENT_TARGET DEV_RELEASE=1 \
+              pip install . -v
+      - run:
+          name: Generate package stubs
+          command: |
+            conda activate env
+            pip install typing_extensions
+            python setup.py generate_stubs
+      - run:
+          name: Build Python package
+          command: |
+            conda activate env
+            python setup.py clean --all
+            << parameters.build_env >> MLX_BUILD_STAGE=1 python -m build -w
+      - when:
+          condition:
+            equal: ["3.10", << parameters.python_version >>]
+          steps:
+            - run:
+                name: Build common package
+                command: |
+                  conda activate env
+                  python setup.py clean --all
+                  << parameters.build_env >> MLX_BUILD_STAGE=2 python -m build -w
+      - when:
+          condition: << parameters.build_env >>
+          steps:
+            - run:
+                name: Upload package
+                command: |
+                  conda activate env
+                  twine upload dist/*
+      - store_artifacts:
+          path: dist/
+
+  build_linux_release:
+    parameters:
+      python_version:
+        type: string
+        default: "3.10"
+      build_env:
+        type: string
+        default: ""
+    machine:
+      image: ubuntu-2204:current
+      resource_class: large
+    steps:
+      - checkout
+      - run:
+          name: Build wheel
+          command: |
+            PYTHON=python<< parameters.python_version >>
+            export DEBIAN_FRONTEND=noninteractive
+            export NEEDRESTART_MODE=a
+            sudo apt-get update
+            TZ=Etc/UTC sudo apt-get -y install tzdata
+            sudo add-apt-repository -y ppa:deadsnakes/ppa
+            sudo apt-get install -y $PYTHON $PYTHON-dev $PYTHON-full
+            sudo apt-get install -y libblas-dev liblapack-dev liblapacke-dev
+            $PYTHON -m venv env
+            source env/bin/activate
+            pip install --upgrade pip
+            pip install --upgrade cmake
+            pip install auditwheel
+            pip install patchelf
+            pip install build
+            pip install twine
+            << parameters.build_env >> pip install ".[dev]" -v
+            pip install typing_extensions
+            python setup.py generate_stubs
+            python setup.py clean --all
+            MLX_BUILD_STAGE=1 << parameters.build_env >> python -m build -w
+            bash python/scripts/repair_linux.sh
+      - when:
+          condition:
+            equal: ["3.10", << parameters.python_version >>]
+          steps:
+            - run:
+                name: Build common package
+                command: |
+                  source env/bin/activate
+                  python setup.py clean --all
+                  << parameters.build_env >> MLX_BUILD_STAGE=2 \
+                    python -m build -w
+                  auditwheel repair dist/mlx_cpu*.whl --plat manylinux_2_35_x86_64
+      - when:
+          condition: << parameters.build_env >>
+          steps:
+            - run:
+                name: Upload packages
+                command: |
+                  source env/bin/activate
+                  twine upload wheelhouse/*.whl
+      - store_artifacts:
+          path: wheelhouse/
+
+  build_cuda_release:
+    parameters:
+      build_env:
+        type: string
+        default: ""
+    machine:
+      image: ubuntu-2204:current
+      resource_class: xlarge
+    steps:
+      - checkout
+      - run:  # install the CUDA toolchain, build the stage-2 wheel, then repair it
+          name: Build wheel
+          command: |
+            export DEBIAN_FRONTEND=noninteractive
+            export NEEDRESTART_MODE=a
+            wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/x86_64/cuda-keyring_1.1-1_all.deb
+            sudo dpkg -i cuda-keyring_1.1-1_all.deb
+            sudo apt-get update
+            sudo apt-get install -y cuda-toolkit-12-9 libcudnn9-dev-cuda-12
+            sudo apt-get install -y libblas-dev liblapack-dev liblapacke-dev
+            sudo apt-get install -y zip
+            pip install auditwheel
+            pip install patchelf
+            pip install build
+            pip install twine
+            export PATH=/usr/local/cuda/bin${PATH:+:${PATH}}
+            export LD_LIBRARY_PATH=/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
+            << parameters.build_env >> MLX_BUILD_STAGE=2 \
+              CMAKE_ARGS="-DMLX_BUILD_CUDA=ON -DCMAKE_CUDA_COMPILER=`which nvcc`" \
+              python -m build -w
+            bash python/scripts/repair_cuda.sh
+      - when:
+          condition: << parameters.build_env >>
+          steps:
+            - run:
+                name: Upload package
+                command: |
+                  twine upload wheelhouse/*.whl
+      - store_artifacts:
+          path: wheelhouse/
+
+workflows:
+  build_and_test:  # branch pushes (not pull/...) when neither pipeline flag is set
+    when:
+      and:
+        - matches:
+            pattern: "^(?!pull/)[-\\w]+$"
+            value: << pipeline.git.branch >>
+        - not: << pipeline.parameters.nightly_build >>
+        - not: << pipeline.parameters.test_release >>
+    jobs:
+      - mac_build_and_test:
+          matrix:
+            parameters:
+              macosx_deployment_target: ["13.5", "15.0"]
+      - linux_build_and_test
+      - cuda_build_and_test:
+          matrix:
+            parameters:
+              image_date: ["2023.11.1", "2025.05.1"]
+      - build_documentation
+
+  build_pypi_release:
+    when:
+      and:
+        - not: << pipeline.parameters.nightly_build >>
+        - not: << pipeline.parameters.test_release >>
+    jobs:
+      - build_release:
+          filters:
+            tags:
+              only: /^v.*/
+            branches:
+              ignore: /.*/
+          matrix:
+            parameters:
+              python_version: ["3.10", "3.11", "3.12", "3.13", "3.14"]
+              macosx_deployment_target: ["13.5", "14.0", "15.0"]
+              build_env: ["PYPI_RELEASE=1"]
+              xcode_version: ["26.0.0"]
+      - build_documentation:
+          filters:
+            tags:
+              only: /^v.*/
+            branches:
+              ignore: /.*/
+          upload-docs: true
+      - build_linux_release:
+          filters:
+            tags:
+              only: /^v.*/
+            branches:
+              ignore: /.*/
+          matrix:
+            parameters:
+              python_version: ["3.10", "3.11", "3.12", "3.13", "3.14"]
+              build_env: ["PYPI_RELEASE=1"]
+      - build_cuda_release:
+          filters:
+            tags:
+              only: /^v.*/
+            branches:
+              ignore: /.*/
+          matrix:
+            parameters:
+              build_env: ["PYPI_RELEASE=1"]
+
+  prb:
+    when:
+      matches:
+        pattern: "^pull/\\d+(/head)?$"
+        value: << pipeline.git.branch >>
+    jobs:
+      - hold:
+          type: approval
+      - apple/authenticate:
+          context: pr-approval
+      - mac_build_and_test:
+          requires: [ hold ]
+          matrix:
+            parameters:
+              macosx_deployment_target: ["13.5", "15.0"]
+      - linux_build_and_test:
+          requires: [ hold ]
+      - cuda_build_and_test:
+          requires: [ hold ]
+          matrix:
+            parameters:
+              image_date: ["2023.11.1", "2025.05.1"]
+  nightly_build:
+    when:
+      and:
+        - equal: [ main, << pipeline.git.branch >> ]
+        - << pipeline.parameters.nightly_build >>
+    jobs:
+      - build_release:
+          matrix:
+            parameters:
+              python_version: ["3.10", "3.11", "3.12", "3.13", "3.14"]
+              macosx_deployment_target: ["13.5", "14.0", "15.0"]
+              xcode_version: ["26.0.0"]
+      - build_linux_release:
+          matrix:
+            parameters:
+              python_version: ["3.10", "3.11", "3.12", "3.13", "3.14"]
+      - build_cuda_release
+
+  build_dev_release:
+    when:
+      and:
+        - equal: [ main, << pipeline.git.branch >> ]
+        - << pipeline.parameters.test_release >>
+    jobs:
+      - build_release:
+          matrix:
+            parameters:
+              python_version: ["3.10", "3.11", "3.12", "3.13", "3.14"]
+              macosx_deployment_target: ["13.5", "14.0", "15.0"]
+              build_env: ["DEV_RELEASE=1"]
+              xcode_version: ["26.0.0"]
+      - build_linux_release:
+          matrix:
+            parameters:
+              python_version: ["3.10", "3.11", "3.12", "3.13", "3.14"]
+              build_env: ["DEV_RELEASE=1"]
+      - build_cuda_release:
+          matrix:
+            parameters:
+              build_env: ["DEV_RELEASE=1"]
--- a/.github/actions/build-cuda-release/action.yml
+++ b/.github/actions/build-cuda-release/action.yml
@@ -1,24 +0,0 @@
-name: 'Build CUDA wheel'
-description: 'Build CUDA wheel'
-
-inputs:
-  arch:
-    description: 'Platform architecture tag'
-    required: true
-    type: choice
-    options:
-      - x86_64
-      - aarch64
-
-runs:
-  using: "composite"
-  steps:
-    - name: Build package
-      shell: bash
-      env:
-        CMAKE_ARGS: -DMLX_BUILD_CUDA=ON
-      run: |
-        pip install auditwheel build patchelf setuptools
-        python setup.py clean --all
-        MLX_BUILD_STAGE=2 python -m build -w
-        bash python/scripts/repair_cuda.sh ${{ inputs.arch }}
--- a/.github/actions/build-docs/action.yml
+++ b/.github/actions/build-docs/action.yml
@@ -1,38 +0,0 @@
-name: 'Build Documentation'
-description: 'Build documentation'
-
-runs:
-  using: "composite"
-  steps:
-    - name: Setup machine
-      uses: ./.github/actions/setup-linux
-
-    - name: Install dependencies
-      shell: bash
-      run: |
-        sudo apt-get install -y doxygen
-        source .venv/bin/activate
-        pip install -r docs/requirements.txt
-        pip install . -v
-  
-    - name: Build documentation
-      shell: bash
-      run: |
-        source .venv/bin/activate
-        cd docs
-        doxygen
-        make html O=-W
-    
-    - name: Create artifact tar
-      shell: bash
-      run: tar -cf artifact.tar -C docs --dereference build/html index.html
-
-    # Do it manually because upload-pages-artifact requires gtar
-    - name: Upload artifact
-      id: upload-artifact
-      uses: actions/upload-artifact@v5
-      with:
-        name: github-pages
-        path: artifact.tar
-        retention-days: 1
-        if-no-files-found: error
--- a/.github/actions/build-linux-release/action.yml
+++ b/.github/actions/build-linux-release/action.yml
@@ -1,40 +0,0 @@
-name: 'Build Linux wheel'
-description: 'Build Linux wheel'
-
-inputs:
-  build-backend:
-    description: 'Build the backend mlx-cpu package'
-    type: boolean
-    required: false
-    default: false
-  arch:
-    description: 'Platform architecture tag'
-    required: true
-    type: choice
-    options:
-      - x86_64
-      - aarch64
-
-runs:
-  using: "composite"
-  steps:
-    - name: Generate package stubs
-      shell: bash
-      run: |
-        pip install -e ".[dev]" -v
-        pip install typing_extensions
-        python setup.py generate_stubs
-    - name: Build Python package
-      shell: bash
-      run: |
-        pip install auditwheel patchelf build
-        python setup.py clean --all
-        MLX_BUILD_STAGE=1 python -m build -w
-        bash python/scripts/repair_linux.sh ${{ inputs.arch }}
-    - name: Build backend package
-      if: ${{ inputs.build-backend }}
-      shell: bash
-      run: |
-        python setup.py clean --all
-        MLX_BUILD_STAGE=2 python -m build -w
-        auditwheel repair dist/mlx_cpu*.whl --plat manylinux_2_35_${{ inputs.arch }}
--- a/.github/actions/build-linux/action.yml
+++ b/.github/actions/build-linux/action.yml
@@ -1,41 +0,0 @@
-name: 'Build and Test on Linux'
-
-inputs:
-  toolkit:
-    description: 'The toolkit to build with'
-    required: false
-    default: 'cpu'
-
-runs:
-  using: "composite"
-  steps:
-    - name: Install Python package
-      id: python_build
-      shell: sh
-      env:
-        DEBUG: 1
-        CMAKE_ARGS: >-
-          -DCMAKE_COMPILE_WARNING_AS_ERROR=ON
-          -DMLX_BUILD_CUDA=${{ startsWith(inputs.toolkit, 'cuda') && 'ON' || 'OFF' }}
-      run: |
-        if ${{ startsWith(inputs.toolkit, 'cuda') && runner.arch == 'arm64' }} ; then
-          # There is no GPU in arm64 runner, use a common arch.
-          CMAKE_ARGS="$CMAKE_ARGS -DMLX_CUDA_ARCHITECTURES=90a"
-          # Can not build tests when the built executables can not run.
-          CMAKE_ARGS="$CMAKE_ARGS -DMLX_BUILD_TESTS=OFF"
-        fi
-        pip install --no-build-isolation -e ".[dev]" -v
-        # Pass the CMAKE_ARGS to following steps.
-        echo CMAKE_ARGS="$CMAKE_ARGS" >> $GITHUB_OUTPUT
-
-    - name: Generate package stubs
-      shell: sh
-      run: |
-        pip install typing_extensions
-        python setup.py generate_stubs
-
-    - name: Build CPP only
-      shell: bash
-      run: |
-        cmake . -B build -DCMAKE_BUILD_TYPE=Debug ${{ steps.python_build.outputs.CMAKE_ARGS }}
-        cmake --build build -j $(nproc)
--- a/.github/actions/build-macos-release/action.yml
+++ b/.github/actions/build-macos-release/action.yml
@@ -1,34 +0,0 @@
-name: 'Build macOS release'
-description: 'Build MLX releases macOS'
-
-inputs:
-  macos-target:
-    description: 'macOS build target'
-    required: false
-    default: '15.0'
-  build-backend:
-    description: 'Build the backend mlx-metal package'
-    type: boolean
-    required: false
-    default: false
-
-runs:
-  using: "composite"
-  steps:
-    - name: Build Python package
-      shell: bash -l {0}
-      env:
-        MACOSX_DEPLOYMENT_TARGET: ${{ inputs.macos-target }}
-      run: |
-        pip install build
-        python setup.py clean --all
-        MLX_BUILD_STAGE=1 python -m build -w
-
-    - name: Build backend package
-      if: ${{ inputs.build-backend }}
-      shell: bash -l {0}
-      env:
-        MACOSX_DEPLOYMENT_TARGET: ${{ inputs.macos-target }}
-      run: |
-        python setup.py clean --all
-        MLX_BUILD_STAGE=2 python -m build -w
--- a/.github/actions/build-macos/action.yml
+++ b/.github/actions/build-macos/action.yml
@@ -1,88 +0,0 @@
-name: 'Build and Test on macOS'
-description: 'Build and test MLX on macOS'
-
-runs:
-  using: "composite"
-  steps:
-    - name: Install dependencies
-      env:
-        DEBUG: 1
-        CMAKE_ARGS: "-DCMAKE_COMPILE_WARNING_AS_ERROR=ON"
-      shell: bash -l {0}
-      run: |
-        pip install --upgrade pip
-        pip install cmake setuptools nanobind==2.10.2
-        pip install -e . -v
-
-    - name: Generate package stubs
-      shell: bash -l {0}
-      run: |
-        pip install typing_extensions
-        python setup.py generate_stubs
-
-    - name: Install tests dependencies
-      shell: bash -l {0}
-      run: |
-        pip install numpy torch tensorflow unittest-xml-reporting
-
-    - name: Run Python tests
-      shell: bash -l {0}
-      env:
-        LOW_MEMORY: 1
-      run: |
-        DEVICE=cpu python -m xmlrunner discover -v python/tests -o test-results/cpu
-        DEVICE=gpu METAL_DEVICE_WRAPPER_TYPE=1 METAL_DEBUG_ERROR_MODE=0 python -m xmlrunner discover -v python/tests -o test-results/gpu
-        mpirun --bind-to none -host localhost:8 -np 8 -x DYLD_LIBRARY_PATH=/opt/homebrew/lib/ python python/tests/mpi_test_distributed.py
-        mlx.launch --verbose -n 8 python/tests/ring_test_distributed.py -v 2> >(tee -a stderr.log >&2)
-        if $(grep "\[WARN\]" stderr.log); then echo "Distributed ring test failed"; exit 1; fi
-    
-    - name: Build example extension
-      shell: bash -l {0}
-      run: |
-        cd examples/extensions
-        pip install -r requirements.txt
-        python setup.py build_ext --inplace
-        python test.py
-    
-    - name: Build CPP only
-      shell: bash -l {0}
-      run: |
-        mkdir -p build
-        cd build
-        cmake ..
-        make -j $(sysctl -n hw.ncpu)
-    
-    - name: Run CPP tests
-      shell: bash -l {0}
-      env:
-        DEVICE: gpu
-        METAL_DEVICE_WRAPPER_TYPE: 1
-        METAL_DEBUG_ERROR_MODE: 0
-      run: ./build/tests/tests
-    
-    - name: Build small binary with JIT
-      shell: bash -l {0}
-      run: |
-        mkdir -p build
-        cd build
-        cmake .. -DCMAKE_BUILD_TYPE=MinSizeRel \
-          -DBUILD_SHARED_LIBS=ON \
-          -DMLX_BUILD_CPU=OFF \
-          -DMLX_BUILD_SAFETENSORS=OFF \
-          -DMLX_BUILD_GGUF=OFF \
-          -DMLX_METAL_JIT=ON
-        make -j $(sysctl -n hw.ncpu)
-    
-    - name: Run Python tests with JIT
-      shell: bash -l {0}
-      env:
-        LOW_MEMORY: 1
-        DEVICE: gpu
-        METAL_DEVICE_WRAPPER_TYPE: 1
-        METAL_DEBUG_ERROR_MODE: 0
-      run: |
-        CMAKE_ARGS="-DMLX_METAL_JIT=ON" \
-          pip install -e . -v
-        python -m xmlrunner discover \
-            -v python/tests \
-            -o test-results/gpu_jit
--- a/.github/actions/setup-linux/action.yml
+++ b/.github/actions/setup-linux/action.yml
@@ -1,93 +0,0 @@
-name: 'Setup Linux Environment'
-description: 'Install dependencies for Linux builds'
-
-inputs:
-  toolkit:
-    description: 'Which toolkit to install'
-    required: false
-    default: 'cpu'
-  python-version:
-    description: 'Version of python to set up'
-    required: false
-    default: '3.10'
-  use-ccache:
-    description: 'Whether to enable ccache'
-    required: false
-    default: 'true'
-
-runs:
-  using: "composite"
-  steps:
-    - name: Install common dependencies
-      shell: bash
-      run: |
-        sudo apt-get update
-        sudo apt-get install -y libblas-dev liblapack-dev liblapacke-dev zip
-
-    - name: Use ccache
-      if: ${{ inputs.use-ccache == 'true' }}
-      uses: hendrikmuhs/ccache-action@v1.2
-      with:
-        key: ccache-${{ runner.os }}-${{ runner.arch }}-${{ inputs.toolkit }}
-        max-size: 1GB
-        # ccache-action bug: running "apt-get update" fails on large arm runner.
-        update-package-index: false
-
-    - uses: actions/setup-python@v6
-      with:
-        python-version: ${{ inputs.python-version }}
-
-    - name: Setup Python venv
-      shell: bash
-      run: |
-        python -m venv .venv
-        source .venv/bin/activate
-        pip install setuptools cmake nanobind==2.10.2
-        echo PATH=$PATH >> $GITHUB_ENV
-        # Make cmake search .venv for nanobind
-        echo PYTHONPATH=`python -c 'import sys; print(sys.path[-1])'` >> $GITHUB_ENV
-
-    - name: Install MPI
-      shell: bash
-      run: sudo apt-get install -y openmpi-bin openmpi-common libopenmpi-dev
-
-    - name: Install CUDA toolkit
-      if: ${{ startsWith(inputs.toolkit, 'cuda') }}
-      shell: bash
-      env:
-        # Note: the CI machine does not meet CUDA 13's driver requirement.
-        # Compatibility matrix:
-        # https://docs.nvidia.com/deeplearning/cudnn/backend/latest/reference/support-matrix.html
-        PACKAGES: |
-          {
-            "cuda-12.6": "libcudnn9-dev-cuda-12 cuda-toolkit-12-6",
-            "cuda-12.9": "libcudnn9-dev-cuda-12 cuda-toolkit-12-9",
-            "cuda-13.0": "libcudnn9-dev-cuda-13 cuda-toolkit-13-0"
-          }
-      run: |
-        # The CUDA binaries are hosted in the "sbsa" repo, the "arm64" repo is
-        # Jetson specific. SBSA means Arm Server Base System Architecture.
-        ARCH=${{ runner.arch == 'arm64' && 'sbsa' || 'x86_64' }}
-        wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/$ARCH/cuda-keyring_1.1-1_all.deb
-        sudo dpkg -i cuda-keyring_1.1-1_all.deb
-        sudo apt-get update
-        sudo apt-get install -y \
-            libnccl2 libnccl-dev \
-            ${{ fromJson(env.PACKAGES)[inputs.toolkit] }}
-        echo "/usr/local/${{ inputs.toolkit }}/bin" >> $GITHUB_PATH
-
-    - name: CUDA packages and driver report
-      if: ${{ startsWith(inputs.toolkit, 'cuda') }}
-      shell: bash
-      run: |
-        sudo apt-get install -y ubuntu-drivers-common dkms
-        echo "NVIDIA Driver Packages Available:"
-        sudo ubuntu-drivers list --gpgpu
-        echo "NVIDIA Driver Version:"
-        cat /proc/driver/nvidia/version || echo "nvidia driver not found"
-        echo "Installed NVIDIA and CUDA packages:"
-        dpkg -l | egrep "cuda|nvidia" -i
-        echo "DKMS Status:"
-        dkms status || echo "dkms not found"
-        echo "NVIDIA-SMI Status:"
-        nvidia-smi || echo "nvidia-smi not found"
--- a/.github/actions/setup-macos/action.yml
+++ b/.github/actions/setup-macos/action.yml
@@ -1,24 +0,0 @@
-name: 'Setup macOS Environment'
-description: 'Install dependencies for macOS builds'
-
-inputs:
-  python-version:
-    description: 'Python version to use'
-    required: false
-    default: '3.10'
-
-runs:
-  using: "composite"
-  steps:
-    - name: Install Homebrew packages
-      shell: sh
-      run: /opt/homebrew/bin/brew install openmpi
-    
-    - name: Verify MetalToolchain installed
-      shell: bash
-      run: xcodebuild -showComponent MetalToolchain
-
-    - uses: conda-incubator/setup-miniconda@v3
-      with:
-        miniconda-version: "latest"
-        python-version: ${{ inputs.python-version }}
--- a/.github/actions/test-linux/action.yml
+++ b/.github/actions/test-linux/action.yml
@@ -1,69 +0,0 @@
-name: 'Run Linux tests'
-
-inputs:
-  has-gpu:
-    description: 'Run GPU tests'
-    required: false
-    default: false
-
-runs:
-  using: "composite"
-  steps:
-    - name: Run MPI tests
-      shell: bash
-      run: |
-        echo "::group::MPI tests"
-        mpirun --bind-to none --allow-run-as-root -host localhost:8 -np 8 python python/tests/mpi_test_distributed.py
-        echo "::endgroup::"
-
-    - name: Run distributed tests
-      if: ${{ inputs.has-gpu == 'false' }}
-      shell: bash
-      run: |
-        echo "::group::Distributed tests"
-        mlx.launch --verbose -n 8 python/tests/ring_test_distributed.py -v 2> >(tee -a stderr.log >&2)
-        if grep -Fq '[WARN]' stderr.log ; then
-          grep -F '[WARN]' stderr.log
-          echo "Distributed ring test failed";
-          exit 1;
-        fi
-        echo "::endgroup::"
-
-    - name: Run Python tests - CPU
-      if: ${{ inputs.has-gpu == 'false' }}
-      shell: bash
-      env:
-        DEVICE: cpu
-      run: |
-        echo "::group::Python tests - CPU"
-        python -m unittest discover python/tests -v
-        echo "::endgroup::"
-
-    - name: Run Python tests - GPU
-      if: ${{ inputs.has-gpu == 'true' }}
-      shell: bash
-      env:
-        DEVICE: gpu
-      run: |
-        echo "::group::Python tests - GPU"
-        python -m tests discover python/tests -v
-        echo "::endgroup::"
-
-    - name: Run CPP tests - CPU
-      shell: bash
-      env:
-        DEVICE: cpu
-      run: |
-        echo "::group::CPP tests - CPU"
-        ./build/tests/tests
-        echo "::endgroup::"
-
-    - name: Run CPP tests - GPU
-      if: ${{ inputs.has-gpu == 'true' }}
-      shell: bash
-      env:
-        DEVICE: gpu
-      run: |
-        echo "::group::CPP tests - GPU"
-        ./build/tests/tests -sfe="*fft_tests.cpp,*linalg_tests.cpp"
-        echo "::endgroup::"
--- a/.github/dependabot.yml
+++ b/.github/dependabot.yml
@@ -1,6 +0,0 @@
-version: 2
-updates:
-  - package-ecosystem: "github-actions"
-    directory: "/"
-    schedule:
-      interval: "weekly"
--- a/.github/scripts/setup+build-cpp-linux-fedora-container.sh
+++ b/.github/scripts/setup+build-cpp-linux-fedora-container.sh
@@ -1,27 +0,0 @@
-#!/bin/bash
-set -ex
-
-# [Setup] Install dependencies inside the container.
-dnf update -y
-dnf install -y \
-  blas-devel \
-  lapack-devel \
-  openblas-devel \
-  make \
-  cmake \
-  clang \
-  git
-dnf clean all
-
-# [C++] CI Build Sanity Check: Verifies code compilation, not for release.
-export CMAKE_ARGS="-DCMAKE_COMPILE_WARNING_AS_ERROR=ON"
-export DEBUG=1
-export CMAKE_C_COMPILER=/usr/bin/clang
-export CMAKE_CXX_COMPILER=/usr/bin/clang++
-
-mkdir -p build
-pushd build
-cmake .. -DMLX_BUILD_METAL=OFF -DCMAKE_BUILD_TYPE=DEBUG
-make -j $(nproc)
-./tests/tests
-popd
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -1,108 +0,0 @@
-name: Build and Test
-
-on:
-  pull_request:
-  push:
-    branches:
-      - main
-      # For testing CI without starting a pull request:
-      - test/*
-
-permissions:
-  contents: read
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
-  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
-
-jobs:
-  check_lint:
-    name: Check Lint
-    runs-on: ubuntu-22.04
-    steps:
-      - uses: actions/checkout@v6
-      - uses: pre-commit/action@v3.0.1
-
-  linux_build_and_test:
-    name: Linux (cpu, ${{ matrix.arch }})
-    needs: check_lint
-    strategy:
-      fail-fast: false
-      matrix:
-        arch: ['x86_64', 'aarch64']
-    runs-on: ${{ matrix.arch == 'x86_64' && 'ubuntu-22.04' || 'ubuntu-22.04-arm' }}
-    steps:
-      - uses: actions/checkout@v6
-      - uses: ./.github/actions/setup-linux
-      - uses: ./.github/actions/build-linux
-      - uses: ./.github/actions/test-linux
-
-  cuda_build_and_test:
-    name: Linux (${{ matrix.toolkit }}, ${{ matrix.arch }})
-    if: github.repository == 'ml-explore/mlx'
-    needs: check_lint
-    strategy:
-      fail-fast: false
-      matrix:
-        arch: ['x86_64', 'aarch64']
-        toolkit: ['cuda-12.6', 'cuda-12.9']
-    runs-on: ${{ matrix.arch == 'x86_64' && 'gpu-t4-4-core' || 'ubuntu-22.04-arm' }}
-    steps:
-      - uses: actions/checkout@v6
-      - uses: ./.github/actions/setup-linux
-        with:
-          toolkit: ${{ matrix.toolkit }}
-      - uses: ./.github/actions/build-linux
-        with:
-          toolkit: ${{ matrix.toolkit }}
-      - uses: ./.github/actions/test-linux
-        if: matrix.arch == 'x86_64'
-        with:
-          has-gpu: true
-
-  mac_build_and_test:
-    name: macOS (${{ matrix.macos-target }})
-    if: github.repository == 'ml-explore/mlx'
-    strategy:
-      matrix:
-        macos-target: ["14.0", "15.0"]
-    runs-on: [self-hosted, macos]
-    env:
-      MACOSX_DEPLOYMENT_TARGET: ${{ matrix.macos-target }}
-    needs: check_lint
-    steps:
-      - uses: actions/checkout@v6
-      - uses: ./.github/actions/setup-macos
-      - uses: ./.github/actions/build-macos
-
-  build_documentation:
-    name: Build Documentation
-    if: github.repository == 'ml-explore/mlx'
-    runs-on: ubuntu-22.04
-    needs: check_lint
-    steps:
-      - uses: actions/checkout@v6
-      - uses: ./.github/actions/build-docs
-
-  linux_fedora_build_cpp:
-    name: Linux Fedora (${{ matrix.arch }})
-    needs: check_lint
-    strategy:
-      fail-fast: false
-      matrix:
-        include:
-          - host: ubuntu-22.04
-            arch: x86_64
-          - host: ubuntu-22.04-arm
-            arch: aarch64
-
-    runs-on: ${{ matrix.host }}
-    container:
-      image: fedora:42
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v6
-
-      - name: CPP Build Test - No Release
-        run: |
-          bash ./.github/scripts/setup+build-cpp-linux-fedora-container.sh
--- a/.github/workflows/documentation.yml
+++ b/.github/workflows/documentation.yml
@@ -1,28 +0,0 @@
-name: Documentation
-
-on:
-  workflow_dispatch:
-
-permissions:
-  contents: read
-
-jobs:
-  build:
-    runs-on: ubuntu-22.04
-    steps:
-      - uses: actions/checkout@v6
-      - uses: ./.github/actions/build-docs
-      
-  deploy:
-    needs: build
-    permissions:
-      pages: write
-      id-token: write
-    runs-on: ubuntu-latest
-    environment:
-      name: github-pages
-      url: ${{ steps.deployment.outputs.page_url }}
-    steps:
-      - name: Deploy to GitHub Pages
-        id: deployment
-        uses: actions/deploy-pages@v4
--- a/.github/workflows/nightly.yml
+++ b/.github/workflows/nightly.yml
@@ -1,96 +0,0 @@
-name: Nightly Build
-
-on:
-  schedule:
-    - cron: 33 6 * * 1-5
-  workflow_dispatch:
-
-permissions:
-  contents: read
-
-jobs:
-  build_linux_release:
-    strategy:
-      fail-fast: false
-      matrix:
-        python_version: ["3.10", "3.14"]
-    runs-on: ubuntu-22.04
-    steps:
-      - uses: actions/checkout@v6
-      - uses: ./.github/actions/setup-linux
-      - uses: ./.github/actions/build-linux-release
-        with:
-          build-backend: ${{ matrix.python-version == '3.10' }}
-          arch: "x86_64"
-      - name: Upload mlx artifacts
-        uses: actions/upload-artifact@v6
-        with:
-          name: linux-wheels-${{ matrix.python_version }}
-          path: wheelhouse/mlx-*.whl
-          retention-days: 7
-      - name: Upload mlx-cpu artifacts
-        if: matrix.python_version == '3.10'
-        uses: actions/upload-artifact@v6
-        with:
-          name: mlx-cpu
-          path: wheelhouse/mlx_cpu-*.whl
-          retention-days: 7
-
-  build_linux_with_tests:
-    strategy:
-      fail-fast: false
-      matrix:
-        python_version: ["3.11", "3.12", "3.13", "3.14"]
-        runner:
-          - ubuntu-22.04
-          - ubuntu-22.04-arm
-    runs-on: ${{ matrix.runner }}
-    steps:
-      - uses: actions/checkout@v6
-      - uses: ./.github/actions/setup-linux
-        with:
-          python-version: ${{ matrix.python_version }}
-      - uses: ./.github/actions/build-linux
-      - uses: ./.github/actions/test-linux
-
-  build_mac_release:
-    if: github.repository == 'ml-explore/mlx'
-    strategy:
-      matrix:
-        python-version: ["3.10", "3.13"]
-    runs-on: [self-hosted, macos]
-    steps:
-      - uses: actions/checkout@v6
-      - uses: ./.github/actions/setup-macos
-        with:
-          python-version: ${{ matrix.python-version }}
-      - uses: ./.github/actions/build-macos
-      - name: Build macOS 15 package
-        uses: ./.github/actions/build-macos-release
-        with:
-          macos-target: 15.0
-          build-backend: ${{ matrix.python-version == '3.10' }}
-      - name: Build macOS 14 package
-        uses: ./.github/actions/build-macos-release
-        with:
-          macos-target: 14.0
-          build-backend: ${{ matrix.python-version == '3.10' }}
-
-  build_cuda_release:
-    if: github.repository == 'ml-explore/mlx'
-    runs-on: ubuntu-22-large
-    steps:
-      - uses: actions/checkout@v6
-      - uses: ./.github/actions/setup-linux
-        with:
-          toolkit: 'cuda-12.9'
-      - name: Build Python package
-        uses: ./.github/actions/build-cuda-release
-        with:
-          toolkit: 'cuda-12.9'
-      - name: Upload artifacts
-        uses: actions/upload-artifact@v6
-        with:
-          name: mlx-cuda
-          path: wheelhouse/mlx_cuda-*.whl
-          retention-days: 7
--- a/.github/workflows/pull_request.yml
+++ b/.github/workflows/pull_request.yml
@@ -0,0 +1,20 @@
+on:  # trigger only for pull requests that target main
+  pull_request:
+    branches:
+      - main
+
+jobs:
+  check_lint:  # runs the repository's pre-commit hooks over all files
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v4
+        with:
+          python-version: '3.8'  # quoted so YAML keeps the version a string
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install pre-commit black isort clang-format
+      - name: Run lint
+        run: |
+          pre-commit run --all-files
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -1,246 +0,0 @@
-name: PyPI Release
-
-on:
-  push:
-    tags:
-      - 'v*'
-  workflow_dispatch:
-    inputs:
-      dev_release:
-        description: "Do a dev release or regular release"
-        required: true
-        default: "false"
-
-permissions:
-  contents: read
-
-jobs:
-  setup:
-    runs-on: ubuntu-latest
-    steps:
-      - name: Set publishing variables
-        run: echo "Publishing setup complete"
-
-  build_documentation:
-    if: github.repository == 'ml-explore/mlx'
-    runs-on: ubuntu-22.04
-    steps:
-      - uses: actions/checkout@v6
-      - uses: ./.github/actions/build-docs
-    
-  deploy_documentation:
-    needs: build_documentation
-    permissions:
-      pages: write
-      id-token: write
-    runs-on: ubuntu-latest
-    environment:
-      name: github-pages
-      url: ${{ steps.deployment.outputs.page_url }}
-    steps:
-      - name: Deploy to GitHub Pages
-        id: deployment
-        uses: actions/deploy-pages@v4
-
-  build_linux_release:
-    if: github.repository == 'ml-explore/mlx'
-    strategy:
-      matrix:
-        python_version: ["3.10", "3.11", "3.12", "3.13", "3.14"]
-        arch: ['x86_64', 'aarch64']
-    runs-on: ${{ matrix.arch == 'x86_64' && 'ubuntu-22.04' || 'ubuntu-22.04-arm' }}
-    env:
-      PYPI_RELEASE: 1
-      DEV_RELEASE: ${{ github.event.inputs.dev_release == 'true' && 1 || 0 }}
-    steps:
-      - uses: actions/checkout@v6
-      - uses: ./.github/actions/setup-linux
-        with:
-          python-version: ${{ matrix.python_version }}
-          use-ccache: false
-      - uses: ./.github/actions/build-linux-release
-        with:
-          build-backend: ${{ matrix.python-version == '3.10' }}
-          arch: ${{ matrix.arch }}
-      - name: Upload MLX artifacts
-        uses: actions/upload-artifact@v6
-        with:
-          overwrite: true
-          name: linux-wheels-${{ matrix.python_version }}-${{ matrix.arch }}
-          path: wheelhouse/mlx-*.whl
-      - name: Upload CPU artifacts
-        if: matrix.python_version == '3.10'
-        uses: actions/upload-artifact@v6
-        with:
-          overwrite: true
-          name: mlx-cpu-${{ matrix.arch }}
-          path: wheelhouse/mlx_cpu-*.whl
-  
-  build_mac_release:
-    if: github.repository == 'ml-explore/mlx'
-    strategy:
-      matrix:
-        python-version: ["3.10", "3.11", "3.12", "3.13", "3.14"]
-    runs-on: [self-hosted, macos]
-    env:
-      PYPI_RELEASE: 1
-      DEV_RELEASE: ${{ github.event.inputs.dev_release == 'true' && 1 || 0 }}
-
-    steps:
-      - uses: actions/checkout@v6
-      - uses: ./.github/actions/setup-macos
-        with:
-          python-version: ${{ matrix.python-version }}
-
-      - name: Install dependencies
-        shell: bash -l {0}
-        run: |
-          pip install --upgrade pip
-          pip install cmake setuptools nanobind==2.10.2
-          pip install -e . -v
-      - name: Generate package stubs
-        shell: bash -l {0}
-        run: |
-          pip install typing_extensions
-          python setup.py generate_stubs
-      - name: Build macOS 14 package
-        uses: ./.github/actions/build-macos-release
-        with:
-          macos-target: 14.0
-          build-backend: ${{ matrix.python-version == '3.10' }}
-      - name: Build macOS 15 package
-        uses: ./.github/actions/build-macos-release
-        with:
-          macos-target: 15.0
-          build-backend: ${{ matrix.python-version == '3.10' }}
-      - name: Upload MLX artifacts
-        uses: actions/upload-artifact@v6
-        with:
-          overwrite: true
-          name: mac-wheels-${{ matrix.python-version }}
-          path: dist/mlx-*.whl
-      - name: Upload Metal artifacts
-        if: matrix.python-version == '3.10'
-        uses: actions/upload-artifact@v6
-        with:
-          overwrite: true
-          name: mlx-metal
-          path: dist/mlx_metal-*.whl
-
-  build_cuda_release:
-    if: github.repository == 'ml-explore/mlx'
-    strategy:
-      matrix:
-        arch: ['x86_64', 'aarch64']
-        toolkit: ['cuda-12.9', 'cuda-13.0']
-    runs-on: ${{ matrix.arch == 'x86_64' && 'ubuntu-22-large' || 'ubuntu-22-large-arm' }}
-    env:
-      PYPI_RELEASE: 1
-      DEV_RELEASE: ${{ github.event.inputs.dev_release == 'true' && 1 || 0 }}
-    steps:
-      - uses: actions/checkout@v6
-      - uses: ./.github/actions/setup-linux
-        with:
-          toolkit: ${{ matrix.toolkit }}
-          use-ccache: false
-      - name: Build Python package
-        uses: ./.github/actions/build-cuda-release
-        with:
-          arch: ${{ matrix.arch }}
-      - name: Upload artifacts
-        uses: actions/upload-artifact@v6
-        with:
-          overwrite: true
-          name: mlx-cuda
-          path: wheelhouse/mlx_cuda-*.whl
-
-  pypi-publish:
-    name: Upload release to PyPI
-    runs-on: ubuntu-latest
-    needs: [setup, build_linux_release, build_mac_release]
-    permissions:
-      id-token: write
-    environment:
-      name: pypi
-      url: https://pypi.org/p/mlx
-    steps:
-      - uses: actions/download-artifact@v7
-        with:
-          pattern: linux-wheels-*
-          merge-multiple: true
-          path: dist
-      - uses: actions/download-artifact@v7
-        with:
-          pattern: mac-wheels-*
-          merge-multiple: true
-          path: dist
-      - name: Display structure of downloaded files
-        run: ls -R dist
-      - name: Publish package distributions to PyPI
-        uses: pypa/gh-action-pypi-publish@release/v1
-        with:
-          repository-url: https://upload.pypi.org/legacy/
-  
-  pypi-publish-cuda:
-    name: Upload CUDA release to PyPI
-    runs-on: ubuntu-latest
-    needs: [setup, build_cuda_release]
-    permissions:
-      id-token: write
-    environment:
-      name: pypi
-      url: https://pypi.org/p/mlx-cuda
-    steps:
-      - uses: actions/download-artifact@v7
-        with:
-          name: mlx-cuda
-          path: dist
-      - name: Display structure of downloaded files
-        run: ls -R dist
-      - name: Publish package distributions to PyPI
-        uses: pypa/gh-action-pypi-publish@release/v1
-        with:
-          repository-url: https://upload.pypi.org/legacy/
-
-  pypi-publish-cpu:
-    name: Upload CPU release to PyPI
-    runs-on: ubuntu-latest
-    needs: [setup, build_linux_release]
-    permissions:
-      id-token: write
-    environment:
-      name: pypi
-      url: https://pypi.org/p/mlx-cpu
-    steps:
-      - uses: actions/download-artifact@v7
-        with:
-          pattern: mlx-cpu-*
-          merge-multiple: true
-          path: dist
-      - name: Display structure of downloaded files
-        run: ls -R dist
-      - name: Publish package distributions to PyPI
-        uses: pypa/gh-action-pypi-publish@release/v1
-        with:
-          repository-url: https://upload.pypi.org/legacy/
-
-  pypi-publish-metal:
-    name: Upload Metal release to PyPI
-    runs-on: ubuntu-latest
-    needs: [setup, build_mac_release]
-    permissions:
-      id-token: write
-    environment:
-      name: pypi
-      url: https://pypi.org/p/mlx-metal
-    steps:
-      - uses: actions/download-artifact@v7
-        with:
-          name: mlx-metal
-          path: dist
-      - name: Display structure of downloaded files
-        run: ls -R dist
-      - name: Publish package distributions to PyPI
-        uses: pypa/gh-action-pypi-publish@release/v1
-        with:
-          repository-url: https://upload.pypi.org/legacy/
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,10 +1,4 @@
 repos:
-   repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v6.0.0
-    hooks:
-    -   id: check-yaml
-    # -   id: end-of-file-fixer
-    # -   id: trailing-whitespace
 -   repo: https://github.com/pre-commit/mirrors-clang-format
    rev: v19.1.7
    hooks:
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -20,9 +20,13 @@ project(
  LANGUAGES C CXX
  VERSION ${MLX_PROJECT_VERSION})

+if(CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang")
+  add_compile_options(-Wall -Wextra)
+endif()
+
 # ----------------------------- Setup -----------------------------
 set(CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake")
-set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD 20)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 set(CMAKE_INSTALL_MESSAGE NEVER)
@@ -74,7 +78,6 @@ endif()
 if(MLX_USE_CCACHE)
  find_program(CCACHE_PROGRAM ccache)
  if(CCACHE_PROGRAM)
-    message(STATUS "Found CCache: ${CCACHE_PROGRAM}")
    set(CMAKE_C_COMPILER_LAUNCHER "${CCACHE_PROGRAM}")
    set(CMAKE_CXX_COMPILER_LAUNCHER "${CCACHE_PROGRAM}")
    set(CMAKE_CUDA_COMPILER_LAUNCHER "${CCACHE_PROGRAM}")
@@ -89,11 +92,6 @@ cmake_policy(SET CMP0135 NEW)

 add_library(mlx)

-# Supress warnings: note: parameter passing for argument of type
-# ‘std::pair<float, float>’ when C++17 is enabled changed to match C++14 in GCC
-# 10.1
-target_compile_options(mlx PRIVATE -Wno-psabi)
-
 if(MLX_BUILD_CUDA)
  enable_language(CUDA)
 endif()
@@ -128,12 +126,9 @@ if(MLX_BUILD_METAL)
  message(STATUS "Building with macOS SDK version ${MACOS_SDK_VERSION}")

  set(METAL_CPP_URL
-      https://developer.apple.com/metal/cpp/files/metal-cpp_26.zip)
+      https://developer.apple.com/metal/cpp/files/metal-cpp_macOS15_iOS18.zip)

  if(NOT CMAKE_OSX_DEPLOYMENT_TARGET STREQUAL "")
-    if(${CMAKE_OSX_DEPLOYMENT_TARGET} LESS 14.0)
-      message(FATAL_ERROR "MLX requires macOS >= 14.0")
-    endif()
    set(XCRUN_FLAGS "-mmacosx-version-min=${CMAKE_OSX_DEPLOYMENT_TARGET}")
  endif()
  execute_process(
@@ -142,6 +137,7 @@ if(MLX_BUILD_METAL)
      "echo \"__METAL_VERSION__\" | xcrun -sdk macosx metal ${XCRUN_FLAGS} -E -x metal -P - | tail -1 | tr -d '\n'"
    OUTPUT_VARIABLE MLX_METAL_VERSION COMMAND_ERROR_IS_FATAL ANY)
  FetchContent_Declare(metal_cpp URL ${METAL_CPP_URL})
+
  FetchContent_MakeAvailable(metal_cpp)
  target_include_directories(
    mlx PUBLIC $<BUILD_INTERFACE:${metal_cpp_SOURCE_DIR}>
@@ -273,7 +269,7 @@ target_link_libraries(mlx PRIVATE $<BUILD_INTERFACE:fmt::fmt-header-only>)
 if(MLX_BUILD_PYTHON_BINDINGS)
  message(STATUS "Building Python bindings.")
  find_package(
-    Python 3.10
+    Python 3.8
    COMPONENTS Interpreter Development.Module
    REQUIRED)
  execute_process(
--- a/benchmarks/cpp/irregular_strides.cpp
+++ b/benchmarks/cpp/irregular_strides.cpp
@@ -75,7 +75,7 @@ void time_irregular_binary_ops_3D() {

 void time_irregular_binary_ops_4D() {
  auto device = mx::default_device();
-  mx::Shape shape = {8, 8, 512, 512};
+  std::vector<int> shape = {8, 8, 512, 512};
  auto a = mx::random::uniform(shape);
  auto b = mx::random::uniform(shape);

@@ -115,7 +115,7 @@ void time_irregular_binary_ops_4D() {

 void time_irregular_reshape() {
  auto device = mx::default_device();
-  mx::Shape shape;
+  std::vector<int> shape;
  auto reshape_fn = [&shape, device](const mx::array& a) {
    return mx::reshape(a, shape, device);
  };
@@ -170,7 +170,7 @@ void time_irregular_astype_1D() {
 void time_irregular_astype_2D() {
  auto device = mx::default_device();
  int size = 2048;
-  mx::Shape shape = {size, size};
+  std::vector<int> shape = {size, size};

  auto a = mx::random::uniform(shape);
  TIMEM("2D regular", mx::astype, a, mx::int32, device);
--- a/benchmarks/python/blas/bench_gemv.py
+++ b/benchmarks/python/blas/bench_gemv.py
@@ -1,5 +1,6 @@
 # Copyright © 2023 Apple Inc.

+import argparse
 import os
 import subprocess
 import time
--- a/benchmarks/python/masked_scatter.py
+++ b/benchmarks/python/masked_scatter.py
@@ -1,212 +0,0 @@
-import math
-import os
-import subprocess
-import time
-from copy import copy
-from functools import partial
-
-import matplotlib.pyplot as plt
-import mlx.core as mx
-import numpy as np
-import torch
-from matplotlib.ticker import FuncFormatter
-
-RESULTS_DIR = "./results"
-
-
-if not os.path.isdir(RESULTS_DIR):
-    os.mkdir(RESULTS_DIR)
-
-DEVICE_NAME = subprocess.check_output(["sysctl", "-n", "machdep.cpu.brand_string"])
-DEVICE_NAME = DEVICE_NAME.decode("utf-8").strip("\n")
-
-TORCH_DEVICE = torch.device(
-    "mps"
-    if torch.backends.mps.is_available()
-    else ("cuda" if torch.cuda.is_available() else "cpu")
-)
-
-
-N_WARMUP = 5
-N_ITER_BENCH = 50
-N_ITER_FUNC = 20
-
-VECTOR_LENGTHS = [4096 * (2**i) for i in range(10)]
-MASK_DENSITIES = [0.01, 0.1, 0.25, 0.5]
-D_TYPES = ("float32", "float16")
-
-
-def _power_of_two_formatter(value, _position):
-    if value <= 0:
-        return ""
-    exponent = int(round(math.log2(value)))
-    if abs(value - (1 << exponent)) / value > 1e-6:
-        return f"{value:g}"
-    return f"$2^{{{exponent}}}$"
-
-
-def torch_sync():
-    if TORCH_DEVICE.type == "cuda":
-        torch.cuda.synchronize()
-    elif TORCH_DEVICE.type == "mps":
-        torch.mps.synchronize()
-
-
-def masked_scatter_mlx(self_arr, mask_arr, src_arr):
-    outs = []
-    for _ in range(N_ITER_FUNC):
-        out = copy(self_arr)
-        out[mask_arr] = src_arr
-        outs.append(out)
-    mx.eval(outs)
-    return outs
-
-
-@torch.no_grad()
-def masked_scatter_torch(self_tensor, mask_tensor, src_tensor):
-    outs = []
-    for _ in range(N_ITER_FUNC):
-        out = self_tensor.clone()
-        out.masked_scatter_(mask_tensor, src_tensor)
-        outs.append(out)
-    torch_sync()
-    return outs
-
-
-def measure(fn):
-    for _ in range(N_WARMUP):
-        fn()
-    start = time.perf_counter_ns()
-    for _ in range(N_ITER_BENCH):
-        fn()
-    end = time.perf_counter_ns()
-    return (end - start) * 1e-9
-
-
-def bytes_touched(length, true_count, item_size):
-    mask_bytes = length
-    self_bytes = length * item_size * 2  # read + write
-    src_bytes = true_count * item_size
-    return (mask_bytes + self_bytes + src_bytes) * N_ITER_FUNC * N_ITER_BENCH
-
-
-def build_case(length, density, np_dtype, torch_dtype):
-    true_count = max(1, int(round(length * density)))
-
-    rng = np.random.default_rng()
-    self_np = rng.normal(0.0, 1.0, length).astype(np_dtype)
-    mask_np = np.zeros(length, dtype=bool)
-    mask_np[:true_count] = True
-    rng.shuffle(mask_np)
-    src_np = rng.normal(0.0, 1.0, true_count).astype(np_dtype)
-
-    self_mlx = mx.array(self_np)
-    mask_mlx = mx.array(mask_np)
-    src_mlx = mx.array(src_np)
-
-    self_torch = torch.from_numpy(self_np).to(device=TORCH_DEVICE, dtype=torch_dtype)
-    mask_torch = torch.from_numpy(mask_np).to(device=TORCH_DEVICE)
-    src_torch = torch.from_numpy(src_np).to(device=TORCH_DEVICE, dtype=torch_dtype)
-
-    # Correctness check once per configuration
-    mx_out = mx.array(self_np)
-    mx_out[mask_mlx] = src_mlx
-    mx.eval(mx_out)
-    torch_out = self_torch.clone()
-    torch_out.masked_scatter_(mask_torch, src_torch)
-
-    atol = 5e-3 if np_dtype == np.float16 else 1e-5
-    if not np.allclose(np.array(mx_out), torch_out.cpu().numpy(), atol=atol):
-        raise AssertionError("masked_scatter results diverged between MLX and Torch")
-
-    return (self_mlx, mask_mlx, src_mlx, self_torch, mask_torch, src_torch, true_count)
-
-
-def bench_case(length, density, dtype):
-    np_dtype = getattr(np, dtype)
-    torch_dtype = getattr(torch, dtype)
-    (
-        self_mlx,
-        mask_mlx,
-        src_mlx,
-        self_torch,
-        mask_torch,
-        src_torch,
-        true_count,
-    ) = build_case(length, density, np_dtype, torch_dtype)
-
-    time_mlx = measure(partial(masked_scatter_mlx, self_mlx, mask_mlx, src_mlx))
-    time_torch = measure(
-        partial(masked_scatter_torch, self_torch, mask_torch, src_torch)
-    )
-
-    total_bytes = bytes_touched(length, true_count, np_dtype().itemsize)
-    bytes_per_gb = float(1024**3)
-    mlx_gbps = (total_bytes / bytes_per_gb) / time_mlx
-    torch_gbps = (total_bytes / bytes_per_gb) / time_torch
-
-    return time_mlx, time_torch, mlx_gbps, torch_gbps
-
-
-def plot_density(ax_perf, ax_speedup, density, dtype):
-    mlx_gbps = []
-    torch_gbps = []
-    mlx_times = []
-    torch_times = []
-
-    for length in VECTOR_LENGTHS:
-        t_mlx, t_torch, gbps_mlx, gbps_torch = bench_case(length, density, dtype)
-        mlx_gbps.append(gbps_mlx)
-        torch_gbps.append(gbps_torch)
-        mlx_times.append(t_mlx)
-        torch_times.append(t_torch)
-
-    ax_perf.plot(VECTOR_LENGTHS, mlx_gbps, "tab:blue", label="MLX")
-    ax_perf.plot(VECTOR_LENGTHS, torch_gbps, "tab:red", label="Torch")
-    ax_perf.set_xscale("log", base=2)
-    ax_perf.set_xticks(VECTOR_LENGTHS)
-    formatter = FuncFormatter(_power_of_two_formatter)
-    ax_perf.xaxis.set_major_formatter(formatter)
-    ax_perf.set_title(f"density={density:.2f}")
-    ax_perf.set_ylabel("GB/s")
-    ax_perf.grid(True, which="both", linestyle=":", alpha=0.4)
-    ax_perf.legend()
-
-    speedup = np.array(torch_times) / np.array(mlx_times)
-    ax_speedup.plot(VECTOR_LENGTHS, speedup, "tab:green")
-    ax_speedup.axhline(1.0, color="tab:gray", linestyle="--")
-    ax_speedup.set_xscale("log", base=2)
-    ax_speedup.set_xticks(VECTOR_LENGTHS)
-    ax_speedup.xaxis.set_major_formatter(formatter)
-    ax_speedup.set_ylabel("Speedup (Torch_t / MLX_t)")
-    ax_speedup.grid(True, which="both", linestyle=":", alpha=0.4)
-
-
-def main():
-    for dtype in D_TYPES:
-        fig, axs = plt.subplots(
-            len(MASK_DENSITIES),
-            2,
-            figsize=(10, 12),
-            layout="constrained",
-            sharex=True,
-        )
-
-        for i, density in enumerate(MASK_DENSITIES):
-            plot_density(axs[i][0], axs[i][1], density, dtype)
-            axs[i][0].set_xlabel("vector length")
-            axs[i][1].set_xlabel("vector length")
-
-        fig.suptitle(
-            f"{DEVICE_NAME.replace('Apple ', '')} ({TORCH_DEVICE.type}) | dtype={dtype}"
-        )
-        output_path = os.path.join(
-            RESULTS_DIR,
-            f"{DEVICE_NAME.replace(' ', '_')}_masked_scatter_{dtype}.pdf",
-        )
-        fig.savefig(output_path)
-        plt.close(fig)
-
-
-if __name__ == "__main__":
-    main()
--- a/cmake/Findnvpl.cmake
+++ b/cmake/Findnvpl.cmake
@@ -1,3 +0,0 @@
-# This file does nothing but to suppress the cmake warning: "By not providing
-# Findnvpl.cmake in CMAKE_MODULE_PATH...", which is caused by the
-# find_package(nvpl) from cmake's builtin FindLAPACK.cmake module.
--- a/docs/src/install.rst
+++ b/docs/src/install.rst
@@ -17,10 +17,11 @@ To install from PyPI your system must meet the following requirements:

 - Using an M series chip (Apple silicon)
 - Using a native Python >= 3.10
- macOS >= 14.0
+- macOS >= 13.5

 .. note::
-    MLX is only available on devices running macOS >= 14.0 and higher.
+    MLX is only available on devices running macOS >= 13.5.
+    It is highly recommended to use macOS 14 (Sonoma).

 CUDA
 ^^^^
@@ -29,20 +30,17 @@ MLX has a CUDA backend which you can install with:

 .. code-block:: shell

-    pip install mlx[cuda12]
-
+    pip install mlx[cuda]

 To install the CUDA package from PyPi your system must meet the following
 requirements:

- Nvidia architecture >= SM 7.5
+- Nvidia architecture >= SM 7.0 (Volta)
 - Nvidia driver >= 550.54.14
 - CUDA toolkit >= 12.0
 - Linux distribution with glibc >= 2.35
 - Python >= 3.10

-For CUDA 13 use ``pip install mlx[cuda13]``. The CUDA 13 package requires
-an Nvidia driver >= 580 or an appropriate CUDA compatibility package.

 CPU-only (Linux)
 ^^^^^^^^^^^^^^^^
--- a/docs/src/usage/distributed.rst
+++ b/docs/src/usage/distributed.rst
@@ -7,13 +7,12 @@ Distributed Communication

 MLX supports distributed communication operations that allow the computational cost
 of training or inference to be shared across many physical machines. At the
-moment we support three different communication backends:
+moment we support two different communication backends:

 * `MPI <https://en.wikipedia.org/wiki/Message_Passing_Interface>`_ a
  full-featured and mature distributed communications library
-* A **ring** backend of our own that uses native TCP sockets. It should be
-  faster for thunderbolt connections, but it also works over Ethernet.
-* `nccl <https://developer.nvidia.com/nccl>`_, for use in CUDA environments.
+* A **ring** backend of our own that uses native TCP sockets and should be
+  faster for Thunderbolt connections.

 The list of all currently supported operations and their documentation can be
 seen in the :ref:`API docs<distributed>`.
@@ -85,8 +84,9 @@ Selecting Backend
 ^^^^^^^^^^^^^^^^^

 You can select the backend you want to use when calling :func:`init` by passing
-one of ``{'any', 'ring', 'mpi', 'nccl'}``. When passing ``any``, MLX will try all
-available backends. If they all fail then a singleton group is created.
+one of ``{'any', 'ring', 'mpi'}``. When passing ``any``, MLX will try to
+initialize the ``ring`` backend and if it fails the ``mpi`` backend. If they
+both fail then a singleton group is created.

 .. note::
   After a distributed backend is successfully initialized :func:`init` will
@@ -220,7 +220,7 @@ print 4 etc.
 Installing MPI
 ^^^^^^^^^^^^^^

-MPI can be installed with Homebrew, pip, using the Anaconda package manager, or
+MPI can be installed with Homebrew, using the Anaconda package manager or
 compiled from source. Most of our testing is done using ``openmpi`` installed
 with the Anaconda package manager as follows:

@@ -228,16 +228,14 @@ with the Anaconda package manager as follows:

    $ conda install conda-forge::openmpi

-Installing with Homebrew or pip requires specifying the location of ``libmpi.dyld``
+Installing with Homebrew may require specifying the location of ``libmpi.dylib``
 so that MLX can find it and load it at runtime. This can simply be achieved by
 passing the ``DYLD_LIBRARY_PATH`` environment variable to ``mpirun`` and it is
-done automatically by ``mlx.launch``. Some environments use a non-standard
-library filename that can be specified using the ``MPI_LIBNAME`` environment
-variable. This is automatically taken care of by ``mlx.launch`` as well.
+done automatically by ``mlx.launch``.

 .. code:: shell

-    $ mpirun -np 2 -x DYLD_LIBRARY_PATH=/opt/homebrew/lib/ -x MPI_LIBNAME=libmpi.40.dylib python test.py
+    $ mpirun -np 2 -x DYLD_LIBRARY_PATH=/opt/homebrew/lib/ python test.py
    $ # or simply
    $ mlx.launch -n 2 test.py

--- a/docs/src/usage/indexing.rst
+++ b/docs/src/usage/indexing.rst
@@ -70,8 +70,7 @@ Differences from NumPy

  * Indexing does not perform bounds checking. Indexing out of bounds is
    undefined behavior.
-  * Boolean mask based indexing is supported for assignment only (see
-    :ref:`boolean-mask-assignment`).
+  * Boolean mask based indexing is not yet supported.

 The reason for the lack of bounds checking is that exceptions cannot propagate
 from the GPU. Performing bounds checking for array indices before launching the
@@ -144,51 +143,3 @@ expected. For example:

 In the above ``dfdx`` will have the correct gradient, namely zeros at ``idx``
 and ones elsewhere.
-
-.. _boolean-mask-assignment:
-
-Boolean Mask Assignment
-----------------------
-
-MLX supports boolean indices using NumPy syntax. A mask must already be
-a :class:`bool_` MLX :class:`array` or a NumPy ``ndarray`` with ``dtype=bool``.
-Other index types are routed through the standard scatter code.
-
-.. code-block:: shell
-
-   >>> a = mx.array([1.0, 2.0, 3.0])
-   >>> mask = mx.array([True, False, True])
-   >>> updates = mx.array([5.0, 6.0])
-   >>> a[mask] = updates
-   >>> a
-   array([5.0, 2.0, 6.0], dtype=float32)
-
-Scalar assignments broadcast to every ``True`` entry in ``mask``. For non-scalar
-assignments, ``updates`` must provide at least as many elements as there are
-``True`` entries in ``mask``.
-
-.. code-block:: shell
-
-   >>> a = mx.zeros((2, 3))
-   >>> mask = mx.array([[True, False, True],
-                        [False, False, True]])
-   >>> a[mask] = 1.0
-   >>> a
-   array([[1.0, 0.0, 1.0],
-          [0.0, 0.0, 1.0]], dtype=float32)
-
-Boolean masks follow NumPy semantics:
-
- The mask shape must match the shape of the axes it indexes exactly. The only
-  exception is a scalar boolean mask, which broadcasts to the full array.
- Any axes not covered by the mask are taken in full.
-
-.. code-block:: shell
-
-   >>> a = mx.arange(1000).reshape(10, 10, 10)
-   >>> a[mx.random.normal((10, 10)) > 0.0] = 0  # valid: mask covers axes 0 and 1
-
-The mask of shape ``(10, 10)`` applies to the first two axes, so ``a[mask]``
-selects the 1-D slices ``a[i, j, :]`` where ``mask[i, j]`` is ``True``.
-Shapes such as ``(1, 10, 10)`` or ``(10, 10, 1)`` do not match the indexed
-axes and therefore raise errors.
--- a/examples/cpp/tutorial.cpp
+++ b/examples/cpp/tutorial.cpp
@@ -14,14 +14,17 @@ void array_basics() {
  // Get the value out of it:
  auto s = x.item<float>();
  assert(s == 1.0);
+  (void)s;

  // Scalars have a size of 1:
-  size_t size = x.size();
+  int64_t size = x.size();
  assert(size == 1);
+  (void)size;

  // Scalars have 0 dimensions:
  int ndim = x.ndim();
  assert(ndim == 0);
+  (void)ndim;

  // The shape should be an empty vector:
  auto shape = x.shape();
@@ -30,6 +33,7 @@ void array_basics() {
  // The datatype should be float32:
  auto dtype = x.dtype();
  assert(dtype == mx::float32);
+  (void)dtype;

  // Specify the dtype when constructing the array:
  x = mx::array(1, mx::int32);
--- a/examples/extensions/pyproject.toml
+++ b/examples/extensions/pyproject.toml
@@ -3,6 +3,6 @@ requires = [
  "setuptools>=42",
  "cmake>=3.25",
  "mlx>=0.18.0",
-  "nanobind==2.10.2",
+  "nanobind==2.4.0",
 ]
 build-backend = "setuptools.build_meta"
--- a/examples/extensions/requirements.txt
+++ b/examples/extensions/requirements.txt
@@ -1,4 +1,4 @@
 setuptools>=42
 cmake>=3.25
 mlx>=0.21.0
-nanobind==2.10.2
+nanobind==2.4.0
--- a/mlx/CMakeLists.txt
+++ b/mlx/CMakeLists.txt
@@ -1,6 +1,7 @@
 target_sources(
  mlx
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/array.cpp
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/allocator.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/array.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/compile.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/device.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/dtype.cpp
--- a/mlx/allocator.cpp
+++ b/mlx/allocator.cpp
@@ -0,0 +1,29 @@
+// Copyright © 2023 Apple Inc.
+
+#include <cstdlib>
+#include <sstream>
+#include <stdexcept>
+
+#include "mlx/allocator.h"
+
+namespace mlx::core::allocator {
+
+// Allocate `size` bytes through the allocator returned by allocator().
+// Throws std::runtime_error when a non-zero request yields a buffer whose
+// ptr() is null; a zero-byte request may legitimately return an empty buffer.
+Buffer malloc(size_t size) {
+  auto buffer = allocator().malloc(size);
+  if (size && !buffer.ptr()) {
+    std::ostringstream msg;
+    msg << "[malloc] Unable to allocate " << size << " bytes.";
+    throw std::runtime_error(msg.str());
+  }
+  return buffer;
+}
+
+// Hand `buffer` back to the allocator returned by allocator().
+void free(Buffer buffer) {
+  allocator().free(buffer);
+}
+
+} // namespace mlx::core::allocator
--- a/mlx/allocator.h
+++ b/mlx/allocator.h
@@ -14,7 +14,7 @@ class Buffer {
  void* ptr_;

 public:
-  explicit Buffer(void* ptr) : ptr_(ptr) {};
+  Buffer(void* ptr) : ptr_(ptr) {};

  // Get the raw data pointer from the buffer
  void* raw_ptr();
@@ -28,16 +28,16 @@ class Buffer {
  };
 };

+Buffer malloc(size_t size);
+
+void free(Buffer buffer);
+
 class Allocator {
  /** Abstract base class for a memory allocator. */
 public:
  virtual Buffer malloc(size_t size) = 0;
  virtual void free(Buffer buffer) = 0;
  virtual size_t size(Buffer buffer) const = 0;
-  virtual Buffer make_buffer(void* ptr, size_t size) {
-    return Buffer{nullptr};
-  };
-  virtual void release(Buffer buffer) {}

  Allocator() = default;
  Allocator(const Allocator& other) = delete;
@@ -49,25 +49,4 @@ class Allocator {

 Allocator& allocator();

-inline Buffer malloc(size_t size) {
-  return allocator().malloc(size);
-}
-
-inline void free(Buffer buffer) {
-  allocator().free(buffer);
-}
-
-// Make a Buffer from a raw pointer of the given size without a copy.  If a
-// no-copy conversion is not possible then the returned buffer.ptr() will be
-// nullptr. Any buffer created with this function must be released with
-// release(buffer)
-inline Buffer make_buffer(void* ptr, size_t size) {
-  return allocator().make_buffer(ptr, size);
-};
-
-// Release a buffer from the allocator made with make_buffer
-inline void release(Buffer buffer) {
-  allocator().release(buffer);
-}
-
 } // namespace mlx::core::allocator
--- a/mlx/array.cpp
+++ b/mlx/array.cpp
@@ -44,11 +44,11 @@ std::vector<array> array::make_arrays(
    const std::shared_ptr<Primitive>& primitive,
    const std::vector<array>& inputs) {
  std::vector<array> outputs;
-  for (size_t i = 0; i < shapes.size(); ++i) {
+  for (int i = 0; i < std::ssize(shapes); ++i) {
    outputs.emplace_back(std::move(shapes[i]), dtypes[i], primitive, inputs);
  }
  // For each node in |outputs|, its siblings are the other nodes.
-  for (size_t i = 0; i < outputs.size(); ++i) {
+  for (int i = 0; i < std::ssize(outputs); ++i) {
    auto siblings = outputs;
    siblings.erase(siblings.begin() + i);
    outputs[i].set_siblings(std::move(siblings), i);
@@ -64,7 +64,7 @@ array array::unsafe_weak_copy(const array& other) {
      other.strides(),
      other.flags(),
      [](auto) {});
-  cpy.array_desc_->offset = other.array_desc_->offset;
+  cpy.array_desc_->data_ptr = other.array_desc_->data_ptr;
  return cpy;
 }

@@ -82,28 +82,6 @@ array::array(std::initializer_list<int> data, Dtype dtype)
  init(data.begin());
 }

-array::array(
-    void* data,
-    Shape shape,
-    Dtype dtype,
-    const std::function<void(void*)>& deleter)
-    : array_desc_(std::make_shared<ArrayDesc>(std::move(shape), dtype)) {
-  auto buffer = allocator::make_buffer(data, nbytes());
-  if (buffer.ptr() == nullptr) {
-    set_data(allocator::malloc(nbytes()));
-    auto ptr = static_cast<char*>(data);
-    std::copy(ptr, ptr + nbytes(), this->data<char>());
-    deleter(data);
-  } else {
-    auto wrapped_deleter = [deleter](allocator::Buffer buffer) {
-      auto ptr = buffer.ptr();
-      allocator::release(buffer);
-      return deleter(ptr);
-    };
-    set_data(buffer, std::move(wrapped_deleter));
-  }
-}
-
 /* Build an array from a shared buffer */
 array::array(allocator::Buffer data, Shape shape, Dtype dtype, Deleter deleter)
    : array_desc_(std::make_shared<ArrayDesc>(std::move(shape), dtype)) {
@@ -163,12 +141,13 @@ bool array::is_tracer() const {

 void array::set_data(allocator::Buffer buffer, Deleter d) {
  array_desc_->data = std::make_shared<Data>(buffer, d);
-  array_desc_->offset = 0;
+  array_desc_->data_ptr = buffer.raw_ptr();
  array_desc_->data_size = size();
  array_desc_->flags.contiguous = true;
  array_desc_->flags.row_contiguous = true;
-  auto max_dim = std::max_element(shape().begin(), shape().end());
-  array_desc_->flags.col_contiguous = size() <= 1 || size() == *max_dim;
+  // Deref only when size() > 1; for an empty shape max_dim is end().
+  auto max_dim = std::max_element(shape().begin(), shape().end());
+  array_desc_->flags.col_contiguous = size() <= 1 || size() == *max_dim;
 }

 void array::set_data(
@@ -178,7 +157,7 @@ void array::set_data(
    Flags flags,
    Deleter d) {
  array_desc_->data = std::make_shared<Data>(buffer, d);
-  array_desc_->offset = 0;
+  array_desc_->data_ptr = buffer.raw_ptr();
  array_desc_->data_size = data_size;
  array_desc_->strides = std::move(strides);
  array_desc_->flags = flags;
@@ -189,13 +168,14 @@ void array::copy_shared_buffer(
    const Strides& strides,
    Flags flags,
    size_t data_size,
-    int64_t offset /* = 0 */) {
+    size_t offset /* = 0 */) {
  array_desc_->data = other.array_desc_->data;
  array_desc_->strides = strides;
  array_desc_->flags = flags;
  array_desc_->data_size = data_size;
-  array_desc_->offset =
-      sizeof(char) * itemsize() * offset + other.array_desc_->offset;
+  auto char_offset = sizeof(char) * itemsize() * offset;
+  array_desc_->data_ptr = static_cast<void*>(
+      static_cast<char*>(other.array_desc_->data_ptr) + char_offset);
 }

 void array::copy_shared_buffer(const array& other) {
@@ -213,7 +193,7 @@ array::~array() {
  }

  // Break circular reference for non-detached arrays with siblings
-  if (auto n = siblings().size(); n > 0) {
+  if (auto n = std::ssize(siblings()); n > 0) {
    bool do_detach = true;
    // If all siblings have siblings.size() references except
    // the one we are currently destroying (which has siblings.size() + 1)
@@ -295,7 +275,7 @@ array::ArrayDesc::~ArrayDesc() {
    ad.inputs.clear();
    for (auto& [_, a] : input_map) {
      bool is_deletable =
-          (a.array_desc_.use_count() <= a.siblings().size() + 1);
+          (a.array_desc_.use_count() <= std::ssize(a.siblings()) + 1);
      // An array with siblings is deletable only if all of its siblings
      // are deletable
      for (auto& s : a.siblings()) {
@@ -304,7 +284,7 @@ array::ArrayDesc::~ArrayDesc() {
        }
        int is_input = (input_map.find(s.id()) != input_map.end());
        is_deletable &=
-            s.array_desc_.use_count() <= a.siblings().size() + is_input;
+            s.array_desc_.use_count() <= std::ssize(a.siblings()) + is_input;
      }
      if (is_deletable) {
        for_deletion.push_back(std::move(a.array_desc_));
--- a/mlx/array.h
+++ b/mlx/array.h
@@ -57,16 +57,6 @@ class array {
      Shape shape,
      Dtype dtype = TypeToDtype<T>());

-  /* Build an array from a raw pointer. The constructor will attempt to use the
-   * input data without a copy. The deleter will be called when the array no
-   * longer needs the underlying memory - after the array is destroyed in the
-   * no-copy case and after the copy otherwise. */
-  explicit array(
-      void* data,
-      Shape shape,
-      Dtype dtype,
-      const std::function<void(void*)>& deleter);
-
  /* Build an array from a buffer */
  explicit array(
      allocator::Buffer data,
@@ -91,22 +81,22 @@ class array {
  }

  /** The size of the array's datatype in bytes. */
-  size_t itemsize() const {
+  int itemsize() const {
    return size_of(dtype());
  }

  /** The number of elements in the array. */
-  size_t size() const {
+  int64_t size() const {
    return array_desc_->size;
  }

  /** The number of bytes in the array. */
-  size_t nbytes() const {
+  int64_t nbytes() const {
    return size() * itemsize();
  }

  /** The number of dimensions of the array. */
-  size_t ndim() const {
+  int ndim() const {
    return array_desc_->shape.size();
  }

@@ -304,11 +294,6 @@ class array {
    return array_desc_->siblings;
  }

-  /** The array's position in the sibling list. */
-  int sibling_position() const {
-    return array_desc_->position;
-  }
-
  void set_siblings(std::vector<array> siblings, uint16_t position) {
    array_desc_->siblings = std::move(siblings);
    array_desc_->position = position;
@@ -344,7 +329,7 @@ class array {
   * corresponding to ``arr[-1, -1, ...]``) then ``data_size = last - first``.
   * Note, ``data_size`` is in units of ``item_size`` (not bytes).
   **/
-  size_t data_size() const {
+  int64_t data_size() const {
    return array_desc_->data_size;
  }

@@ -355,7 +340,7 @@ class array {
    return array_desc_->data->buffer;
  }

-  size_t buffer_size() const {
+  int64_t buffer_size() const {
    return allocator::allocator().size(buffer());
  }

@@ -364,23 +349,15 @@ class array {
    return array_desc_->data;
  }

-  // Return a raw pointer to the arrays data. This function may do a copy if
-  // the underlying buffer is not accessible on the CPU. When accessing the
-  // data for GPU kernels, be sure to use the correct method / function for the
-  // given backend to access the GPU pointer.
+  // Return a raw pointer to the arrays data
  template <typename T>
  T* data() {
-    return reinterpret_cast<T*>(
-        (static_cast<char*>(buffer().raw_ptr()) + array_desc_->offset));
+    return static_cast<T*>(array_desc_->data_ptr);
  }

  template <typename T>
  const T* data() const {
-    return const_cast<array&>(*this).data<T>();
-  }
-
-  int64_t offset() const {
-    return array_desc_->offset;
+    return static_cast<T*>(array_desc_->data_ptr);
  }

  enum Status {
@@ -449,7 +426,7 @@ class array {
      const Strides& strides,
      Flags flags,
      size_t data_size,
-      int64_t offset = 0);
+      size_t offset = 0);

  void copy_shared_buffer(const array& other);

@@ -484,8 +461,8 @@ class array {
    // can share the underlying data buffer.
    std::shared_ptr<Data> data;

-    // Offset from beginning of data pointer
-    int64_t offset{0};
+    // Properly offset data pointer
+    void* data_ptr{nullptr};

    // The size in elements of the data buffer the array accesses
    size_t data_size;
@@ -553,7 +530,7 @@ array::array(
    Shape shape,
    Dtype dtype /* = TypeToDtype<T>() */)
    : array_desc_(std::make_shared<ArrayDesc>(std::move(shape), dtype)) {
-  if (data.size() != size()) {
+  if (std::ssize(data) != size()) {
    throw std::invalid_argument(
        "Data size and provided shape mismatch in array construction.");
  }
--- a/mlx/backend/common/binary.h
+++ b/mlx/backend/common/binary.h
@@ -38,20 +38,20 @@ inline void set_binary_op_output_data(
    const array& a,
    const array& b,
    array& out,
-    BinaryOpType bopt,
-    std::function<allocator::Buffer(size_t)> mallocfn = allocator::malloc) {
+    BinaryOpType bopt) {
  bool b_donatable = is_donatable(b, out);
  bool a_donatable = is_donatable(a, out);
  switch (bopt) {
    case BinaryOpType::ScalarScalar:
-      out.set_data(mallocfn(out.itemsize()), 1, a.strides(), a.flags());
+      out.set_data(
+          allocator::malloc(out.itemsize()), 1, a.strides(), a.flags());
      break;
    case BinaryOpType::ScalarVector:
      if (b_donatable) {
        out.copy_shared_buffer(b);
      } else {
        out.set_data(
-            mallocfn(b.data_size() * out.itemsize()),
+            allocator::malloc(b.data_size() * out.itemsize()),
            b.data_size(),
            b.strides(),
            b.flags());
@@ -62,7 +62,7 @@ inline void set_binary_op_output_data(
        out.copy_shared_buffer(a);
      } else {
        out.set_data(
-            mallocfn(a.data_size() * out.itemsize()),
+            allocator::malloc(a.data_size() * out.itemsize()),
            a.data_size(),
            a.strides(),
            a.flags());
@@ -75,7 +75,7 @@ inline void set_binary_op_output_data(
        out.copy_shared_buffer(b);
      } else {
        out.set_data(
-            mallocfn(a.data_size() * out.itemsize()),
+            allocator::malloc(a.data_size() * out.itemsize()),
            a.data_size(),
            a.strides(),
            a.flags());
@@ -88,7 +88,7 @@ inline void set_binary_op_output_data(
          b_donatable && b.flags().row_contiguous && b.size() == out.size()) {
        out.copy_shared_buffer(b);
      } else {
-        out.set_data(mallocfn(out.nbytes()));
+        out.set_data(allocator::malloc(out.nbytes()));
      }
      break;
  }
--- a/mlx/backend/common/broadcasting.cpp
+++ b/mlx/backend/common/broadcasting.cpp
@@ -6,7 +6,7 @@ namespace mlx::core {

 void broadcast(const array& in, array& out) {
  if (out.size() == 0) {
-    out.set_data(allocator::malloc(0));
+    out.set_data(nullptr);
    return;
  }
  Strides strides(out.ndim(), 0);
--- a/mlx/backend/common/common.cpp
+++ b/mlx/backend/common/common.cpp
@@ -21,8 +21,8 @@ void AsStrided::eval(const std::vector<array>& inputs, array& out) {

  // Compute the flags given the shape and strides
  bool row_contiguous = true, col_contiguous = true;
-  size_t r = 1, c = 1;
-  for (int i = strides_.size() - 1, j = 0; i >= 0; i--, j++) {
+  int64_t r = 1, c = 1;
+  for (int i = std::ssize(strides_) - 1, j = 0; i >= 0; i--, j++) {
    row_contiguous &= (r == strides_[i]) || (shape_[i] == 1);
    col_contiguous &= (c == strides_[j]) || (shape_[j] == 1);
    r *= shape_[i];
@@ -60,7 +60,8 @@ void CustomTransforms::eval(
    const std::vector<array>& inputs,
    std::vector<array>& outputs) {
  assert(inputs.size() > outputs.size());
-  for (int i = 0, j = inputs.size() - outputs.size(); i < outputs.size();
+  for (int i = 0, j = std::ssize(inputs) - std::ssize(outputs);
+       i < std::ssize(outputs);
       i++, j++) {
    outputs[i].copy_shared_buffer(inputs[j]);
  }
@@ -70,7 +71,7 @@ void Depends::eval(
    const std::vector<array>& inputs,
    std::vector<array>& outputs) {
  assert(inputs.size() > outputs.size());
-  for (int i = 0; i < outputs.size(); i++) {
+  for (int i = 0; i < std::ssize(outputs); i++) {
    outputs[i].copy_shared_buffer(inputs[i]);
  }
 }
@@ -206,11 +207,11 @@ void Split::eval(

  auto compute_new_flags = [](const auto& shape,
                              const auto& strides,
-                              size_t in_data_size,
+                              int64_t in_data_size,
                              auto flags) {
-    size_t data_size = 1;
-    size_t f_stride = 1;
-    size_t b_stride = 1;
+    int64_t data_size = 1;
+    int64_t f_stride = 1;
+    int64_t b_stride = 1;
    flags.row_contiguous = true;
    flags.col_contiguous = true;
    for (int i = 0, ri = shape.size() - 1; ri >= 0; i++, ri--) {
@@ -240,7 +241,7 @@ void Split::eval(

  std::vector<int> indices(1, 0);
  indices.insert(indices.end(), indices_.begin(), indices_.end());
-  for (int i = 0; i < indices.size(); i++) {
+  for (int i = 0; i < std::ssize(indices); i++) {
    size_t offset = indices[i] * in.strides()[axis_];
    auto [new_flags, data_size] = compute_new_flags(
        outputs[i].shape(), in.strides(), in.data_size(), in.flags());
@@ -254,7 +255,7 @@ void Squeeze::eval(const std::vector<array>& inputs, array& out) {
  const auto& in = inputs[0];
  Strides strides;
  for (int i = 0, j = 0; i < in.ndim(); ++i) {
-    if (j < axes_.size() && i == axes_[j]) {
+    if (j < std::ssize(axes_) && i == axes_[j]) {
      j++;
    } else {
      strides.push_back(in.strides(i));
@@ -272,7 +273,7 @@ void Transpose::eval(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 1);
  Strides out_strides(out.ndim());
  auto& in = inputs[0];
-  for (int ax = 0; ax < axes_.size(); ++ax) {
+  for (int ax = 0; ax < std::ssize(axes_); ++ax) {
    out_strides[ax] = in.strides()[axes_[ax]];
  }

--- a/mlx/backend/common/compiled.cpp
+++ b/mlx/backend/common/compiled.cpp
@@ -114,15 +114,13 @@ void compiled_allocate_outputs(
    const std::vector<array>& inputs,
    std::vector<array>& outputs,
    const std::function<bool(size_t)>& is_constant,
-    bool contiguous,
-    const std::function<allocator::Buffer(size_t)>&
-        mallocfn /* = allocator::malloc */) {
+    bool contiguous) {
  if (contiguous) {
    int o = 0;
    Strides strides;
    size_t data_size;
    array::Flags flags;
-    for (int i = 0; i < inputs.size() && o < outputs.size(); ++i) {
+    for (int i = 0; i < std::ssize(inputs) && o < std::ssize(outputs); ++i) {
      auto& in = inputs[i];
      // Conditions for donation
      // - Correct size
@@ -130,7 +128,7 @@ void compiled_allocate_outputs(
      // - Donatable
      // - Not a constant
      if (in.itemsize() == outputs[o].itemsize() && !is_scalar(in) &&
-          in.is_donatable() && !is_constant(i)) {
+          in.is_donatable() && !is_constant(i)) {
        outputs[o++].copy_shared_buffer(in);
      }
      // Get representative input flags to properly set non-donated outputs
@@ -140,16 +138,16 @@ void compiled_allocate_outputs(
        data_size = in.data_size();
      }
    }
-    for (; o < outputs.size(); ++o) {
+    for (; o < std::ssize(outputs); ++o) {
      outputs[o].set_data(
-          mallocfn(data_size * outputs[o].itemsize()),
+          allocator::malloc(data_size * outputs[o].itemsize()),
          data_size,
          strides,
          flags);
    }
  } else {
    int o = 0;
-    for (int i = 0; i < inputs.size() && o < outputs.size(); ++i) {
+    for (int i = 0; i < std::ssize(inputs) && o < std::ssize(outputs); ++i) {
      auto& in = inputs[i];
      // Conditions for donation
      // - Row contiguous
@@ -158,14 +156,14 @@ void compiled_allocate_outputs(
      // - Not a constant
      if (in.flags().row_contiguous && in.size() == outputs[o].size() &&
          in.itemsize() == outputs[o].itemsize() && in.is_donatable() &&
-          !is_constant(i)) {
+          !is_constant(i)) {
        outputs[o].copy_shared_buffer(
            in, outputs[o].strides(), in.flags(), in.data_size());
        o++;
      }
    }
-    for (; o < outputs.size(); ++o) {
-      outputs[o].set_data(mallocfn(outputs[o].nbytes()));
+    for (; o < std::ssize(outputs); ++o) {
+      outputs[o].set_data(allocator::malloc(outputs[o].nbytes()));
    }
  }
 }
@@ -195,7 +193,7 @@ std::tuple<bool, Shape, std::vector<Strides>> compiled_collapse_contiguous_dims(

    // Broadcast the inputs to the output shape.
    Strides xstrides;
-    size_t j = 0;
+    int j = 0;
    for (; j < shape.size() - x.ndim(); ++j) {
      if (shape[j] == 1) {
        xstrides.push_back(out.strides()[j]);
@@ -203,7 +201,7 @@ std::tuple<bool, Shape, std::vector<Strides>> compiled_collapse_contiguous_dims(
        xstrides.push_back(0);
      }
    }
-    for (size_t i = 0; i < x.ndim(); ++i, ++j) {
+    for (int i = 0; i < x.ndim(); ++i, ++j) {
      if (x.shape(i) == 1) {
        if (shape[j] == 1) {
          xstrides.push_back(out.strides()[j]);
@@ -226,13 +224,13 @@ bool compiled_use_large_index(
    const std::vector<array>& outputs,
    bool contiguous) {
  if (contiguous) {
-    size_t max_size = 0;
+    int64_t max_size = 0;
    for (const auto& in : inputs) {
      max_size = std::max(max_size, in.data_size());
    }
    return max_size > UINT32_MAX;
  } else {
-    size_t max_size = 0;
+    int64_t max_size = 0;
    for (const auto& o : outputs) {
      max_size = std::max(max_size, o.size());
    }
--- a/mlx/backend/common/compiled.h
+++ b/mlx/backend/common/compiled.h
@@ -58,9 +58,7 @@ void compiled_allocate_outputs(
    const std::vector<array>& inputs,
    std::vector<array>& outputs,
    const std::function<bool(size_t)>& is_constant,
-    bool contiguous,
-    const std::function<allocator::Buffer(size_t)>& mallocfn =
-        allocator::malloc);
+    bool contiguous);

 // Collapse contiguous dims ignoring scalars and constants.
 std::tuple<bool, Shape, std::vector<Strides>> compiled_collapse_contiguous_dims(
--- a/mlx/backend/common/copy.h
+++ b/mlx/backend/common/copy.h
@@ -22,11 +22,7 @@ enum class CopyType {
  GeneralGeneral
 };

-inline bool set_copy_output_data(
-    const array& in,
-    array& out,
-    CopyType ctype,
-    std::function<allocator::Buffer(size_t)> mallocfn = allocator::malloc) {
+inline bool set_copy_output_data(const array& in, array& out, CopyType ctype) {
  if (ctype == CopyType::Vector) {
    // If the input is donateable, we are doing a vector copy and the types
    // have the same size, then the input buffer can hold the output.
@@ -35,14 +31,14 @@ inline bool set_copy_output_data(
      return true;
    } else {
      out.set_data(
-          mallocfn(in.data_size() * out.itemsize()),
+          allocator::malloc(in.data_size() * out.itemsize()),
          in.data_size(),
          in.strides(),
          in.flags());
      return false;
    }
  } else {
-    out.set_data(mallocfn(out.nbytes()));
+    out.set_data(allocator::malloc(out.nbytes()));
    return false;
  }
 }
--- a/mlx/backend/common/load.cpp
+++ b/mlx/backend/common/load.cpp
@@ -27,7 +27,7 @@ void swap_endianness(uint8_t* data_bytes, size_t N) {

 namespace mlx::core {

-void Load::eval_cpu(const std::vector<array>& inputs, array& out) {
+void Load::eval_cpu(const std::vector<array>& /* inputs */, array& out) {
  out.set_data(allocator::malloc(out.nbytes()));
  auto read_task = [out_ptr = out.data<char>(),
                    size = out.size(),
--- a/mlx/backend/common/reduce.cpp
+++ b/mlx/backend/common/reduce.cpp
@@ -28,7 +28,7 @@ std::pair<Shape, Strides> shapes_without_reduction_axes(

 ReductionPlan get_reduction_plan(const array& x, const std::vector<int>& axes) {
  // The data is all there and we are reducing over everything
-  if (x.size() == x.data_size() && axes.size() == x.ndim() &&
+  if (x.size() == x.data_size() && std::ssize(axes) == x.ndim() &&
      x.flags().contiguous) {
    return ContiguousAllReduce;
  }
@@ -38,7 +38,7 @@ ReductionPlan get_reduction_plan(const array& x, const std::vector<int>& axes) {
    // Merge consecutive axes
    Shape shape = {x.shape(axes[0])};
    Strides strides = {x.strides()[axes[0]]};
-    for (int i = 1; i < axes.size(); i++) {
+    for (int i = 1; i < std::ssize(axes); i++) {
      if (axes[i] - 1 == axes[i - 1] && x.shape(axes[i]) > 1) {
        shape.back() *= x.shape(axes[i]);
        strides.back() = x.strides()[axes[i]];
--- a/mlx/backend/common/slicing.cpp
+++ b/mlx/backend/common/slicing.cpp
@@ -14,6 +14,10 @@ std::tuple<int64_t, Strides> prepare_slice(
    data_offset += start_indices[i] * in.strides()[i];
    inp_strides[i] = in.strides()[i] * strides[i];
  }
+  // Normalize the offset
+  if (data_offset < 0) {
+    data_offset += in.data_size();
+  }
  return std::make_tuple(data_offset, inp_strides);
 }

@@ -21,7 +25,7 @@ void shared_buffer_slice(
    const array& in,
    const Strides& out_strides,
    int64_t data_offset,
-    size_t data_size,
+    int64_t data_size,
    array& out) {
  // Compute row/col contiguity
  auto [no_bsx_size, is_row_contiguous, is_col_contiguous] =
@@ -41,30 +45,23 @@ void slice(
    const Shape& start_indices,
    const Shape& strides) {
  if (out.size() == 0) {
-    out.set_data(allocator::malloc(0));
+    out.set_data(nullptr);
    return;
  }

  // Calculate out strides, initial offset
  auto [data_offset, inp_strides] = prepare_slice(in, start_indices, strides);
-
-  // Get the location of the end based on the inp strides and out.shape()
-  int64_t low_idx = 0;
-  int64_t high_idx = 0;
-  for (int i = 0; i < inp_strides.size(); ++i) {
-    auto delta = inp_strides[i] * (out.shape()[i] - 1);
-    if (inp_strides[i] > 0) {
-      high_idx += delta;
-    } else {
-      low_idx += delta;
+  int64_t data_end = 1;
+  for (int i = 0; i < start_indices.size(); ++i) {
+    if (in.shape()[i] > 1) {
+      auto end_idx = start_indices[i] + out.shape()[i] * strides[i] - 1;
+      data_end += end_idx * in.strides()[i];
    }
  }
-  int64_t data_size = (high_idx - low_idx) + 1;
-  if (data_size < 0) {
-    std::ostringstream msg;
-    msg << "[slice] Computed invalid data size: " << data_size << ".";
-    throw std::runtime_error(msg.str());
+  if (data_end < 0) {
+    data_end += in.data_size();
  }
+  int64_t data_size = (data_end - data_offset);
  shared_buffer_slice(in, inp_strides, data_offset, data_size, out);
 }

--- a/mlx/backend/common/ternary.h
+++ b/mlx/backend/common/ternary.h
@@ -46,8 +46,7 @@ inline void set_ternary_op_output_data(
    const array& b,
    const array& c,
    array& out,
-    TernaryOpType topt,
-    std::function<allocator::Buffer(size_t)> mallocfn = allocator::malloc) {
+    TernaryOpType topt) {
  auto maybe_donate = [&out](const array& x) {
    if (is_donatable(x, out)) {
      out.copy_shared_buffer(x);
@@ -58,12 +57,13 @@ inline void set_ternary_op_output_data(

  switch (topt) {
    case TernaryOpType::ScalarScalarScalar:
-      out.set_data(mallocfn(out.itemsize()), 1, b.strides(), b.flags());
+      out.set_data(
+          allocator::malloc(out.itemsize()), 1, b.strides(), b.flags());
      break;
    case TernaryOpType::VectorVectorVector:
      if (!(maybe_donate(a) || maybe_donate(b) || maybe_donate(c))) {
        out.set_data(
-            mallocfn(out.itemsize() * b.data_size()),
+            allocator::malloc(out.itemsize() * b.data_size()),
            b.data_size(),
            b.strides(),
            b.flags());
@@ -76,7 +76,7 @@ inline void set_ternary_op_output_data(
      if (!((a.flags().row_contiguous && maybe_donate(a)) ||
            (b.flags().row_contiguous && maybe_donate(b)) ||
            (c.flags().row_contiguous && maybe_donate(c)))) {
-        out.set_data(mallocfn(out.nbytes()));
+        out.set_data(allocator::malloc(out.nbytes()));
      }
      break;
  }
--- a/mlx/backend/common/unary.h
+++ b/mlx/backend/common/unary.h
@@ -7,22 +7,19 @@

 namespace mlx::core {

-inline void set_unary_output_data(
-    const array& in,
-    array& out,
-    std::function<allocator::Buffer(size_t)> mallocfn = allocator::malloc) {
+inline void set_unary_output_data(const array& in, array& out) {
  if (in.flags().contiguous) {
    if (is_donatable(in, out)) {
      out.copy_shared_buffer(in);
    } else {
      out.set_data(
-          mallocfn(in.data_size() * out.itemsize()),
+          allocator::malloc(in.data_size() * out.itemsize()),
          in.data_size(),
          in.strides(),
          in.flags());
    }
  } else {
-    out.set_data(mallocfn(out.nbytes()));
+    out.set_data(allocator::malloc(out.nbytes()));
  }
 }

--- a/mlx/backend/common/utils.cpp
+++ b/mlx/backend/common/utils.cpp
@@ -28,7 +28,7 @@ std::tuple<Shape, std::vector<Strides>> collapse_contiguous_dims(
    if (shape[0] != 1) {
      to_collapse.push_back(0);
    }
-    size_t size = shape[0];
+    int64_t size = shape[0];
    for (int i = 1; i < shape.size(); i++) {
      bool contiguous = true;
      size *= shape[i];
@@ -64,7 +64,7 @@ std::tuple<Shape, std::vector<Strides>> collapse_contiguous_dims(
      current_shape *= shape[to_collapse[k]];
    }
    out_shape.push_back(current_shape);
-    for (int j = 0; j < strides.size(); j++) {
+    for (int j = 0; j < std::ssize(strides); j++) {
      const auto& st = strides[j];
      out_strides[j].push_back(st[to_collapse[k - 1]]);
    }
--- a/mlx/backend/common/utils.h
+++ b/mlx/backend/common/utils.h
@@ -162,7 +162,7 @@ struct ContiguousIterator {
 };

 inline auto check_contiguity(const Shape& shape, const Strides& strides) {
-  size_t no_broadcast_data_size = 1;
+  int64_t no_broadcast_data_size = 1;
  int64_t f_stride = 1;
  int64_t b_stride = 1;
  bool is_row_contiguous = true;
@@ -183,7 +183,7 @@ inline auto check_contiguity(const Shape& shape, const Strides& strides) {
 }

 inline bool is_donatable(const array& in, const array& out) {
-  constexpr size_t donation_extra = 16384;
+  constexpr int64_t donation_extra = 16384;

  return in.is_donatable() && in.itemsize() == out.itemsize() &&
      in.buffer_size() <= out.nbytes() + donation_extra;
--- a/mlx/backend/cpu/arange.h
+++ b/mlx/backend/cpu/arange.h
@@ -10,7 +10,7 @@ namespace mlx::core {
 namespace {

 template <typename T>
-void arange(T start, T next, array& out, size_t size, Stream stream) {
+void arange(T start, T next, array& out, int64_t size, Stream stream) {
  auto ptr = out.data<T>();
  auto step_size = next - start;
  auto& encoder = cpu::get_command_encoder(stream);
--- a/mlx/backend/cpu/arg_reduce.cpp
+++ b/mlx/backend/cpu/arg_reduce.cpp
@@ -19,12 +19,12 @@ void arg_reduce(const array& in, array& out, const OpT& op, int axis) {
  auto in_ptr = in.data<InT>();
  auto out_ptr = out.data<uint32_t>();

-  for (uint32_t i = 0; i < out.size(); ++i) {
+  for (int64_t i = 0; i < out.size(); ++i) {
    auto loc = elem_to_loc(i, shape, strides);
    auto local_in_ptr = in_ptr + loc;
    uint32_t ind_v = 0;
    InT v = (*local_in_ptr);
-    for (uint32_t j = 0; j < axis_size; ++j, local_in_ptr += axis_stride) {
+    for (int64_t j = 0; j < axis_size; ++j, local_in_ptr += axis_stride) {
      op(j, (*local_in_ptr), &ind_v, &v);
    }
    out_ptr[i] = ind_v;
--- a/mlx/backend/cpu/binary.cpp
+++ b/mlx/backend/cpu/binary.cpp
@@ -14,11 +14,238 @@

 namespace mlx::core {

+namespace {
+
+template <typename Op>
+void binary(
+    const array& a,
+    const array& b,
+    array& out,
+    Op /* unused value: only the functor type Op is needed */,
+    Stream stream) {
+  auto kind = get_binary_op_type(a, b);
+  set_binary_op_output_data(a, b, out, kind);
+  // Dispatch via the stream's command encoder; out's dtype picks the type T.
+  auto& enc = cpu::get_command_encoder(stream);
+  enc.set_input_array(a);
+  enc.set_input_array(b);
+  enc.set_output_array(out);
+  enc.dispatch([lhs = array::unsafe_weak_copy(a),
+                rhs = array::unsafe_weak_copy(b),
+                dst = array::unsafe_weak_copy(out),
+                kind]() mutable {
+    switch (dst.dtype()) {
+      case bool_:
+        binary_op<bool, Op>(lhs, rhs, dst, kind);
+        break;
+      case uint8:
+        binary_op<uint8_t, Op>(lhs, rhs, dst, kind);
+        break;
+      case uint16:
+        binary_op<uint16_t, Op>(lhs, rhs, dst, kind);
+        break;
+      case uint32:
+        binary_op<uint32_t, Op>(lhs, rhs, dst, kind);
+        break;
+      case uint64:
+        binary_op<uint64_t, Op>(lhs, rhs, dst, kind);
+        break;
+      case int8:
+        binary_op<int8_t, Op>(lhs, rhs, dst, kind);
+        break;
+      case int16:
+        binary_op<int16_t, Op>(lhs, rhs, dst, kind);
+        break;
+      case int32:
+        binary_op<int32_t, Op>(lhs, rhs, dst, kind);
+        break;
+      case int64:
+        binary_op<int64_t, Op>(lhs, rhs, dst, kind);
+        break;
+      case float16:
+        binary_op<float16_t, Op>(lhs, rhs, dst, kind);
+        break;
+      case float32:
+        binary_op<float, Op>(lhs, rhs, dst, kind);
+        break;
+      case float64:
+        binary_op<double, Op>(lhs, rhs, dst, kind);
+        break;
+      case bfloat16:
+        binary_op<bfloat16_t, Op>(lhs, rhs, dst, kind);
+        break;
+      case complex64:
+        binary_op<complex64_t, Op>(lhs, rhs, dst, kind);
+        break;
+    }
+  });
+}
+
+template <typename Op>
+void comparison_op(
+    const array& a,
+    const array& b,
+    array& out,
+    Op /* unused value: only the functor type Op is needed */,
+    Stream stream) {
+  auto kind = get_binary_op_type(a, b);
+  set_binary_op_output_data(a, b, out, kind);
+  // The first input's dtype picks the input type; results are written as bool.
+  auto& enc = cpu::get_command_encoder(stream);
+  enc.set_input_array(a);
+  enc.set_input_array(b);
+  enc.set_output_array(out);
+  enc.dispatch([lhs = array::unsafe_weak_copy(a),
+                rhs = array::unsafe_weak_copy(b),
+                dst = array::unsafe_weak_copy(out),
+                kind]() mutable {
+    switch (lhs.dtype()) {
+      case bool_:
+        binary_op<bool, bool, Op>(lhs, rhs, dst, kind);
+        break;
+      case uint8:
+        binary_op<uint8_t, bool, Op>(lhs, rhs, dst, kind);
+        break;
+      case uint16:
+        binary_op<uint16_t, bool, Op>(lhs, rhs, dst, kind);
+        break;
+      case uint32:
+        binary_op<uint32_t, bool, Op>(lhs, rhs, dst, kind);
+        break;
+      case uint64:
+        binary_op<uint64_t, bool, Op>(lhs, rhs, dst, kind);
+        break;
+      case int8:
+        binary_op<int8_t, bool, Op>(lhs, rhs, dst, kind);
+        break;
+      case int16:
+        binary_op<int16_t, bool, Op>(lhs, rhs, dst, kind);
+        break;
+      case int32:
+        binary_op<int32_t, bool, Op>(lhs, rhs, dst, kind);
+        break;
+      case int64:
+        binary_op<int64_t, bool, Op>(lhs, rhs, dst, kind);
+        break;
+      case float16:
+        binary_op<float16_t, bool, Op>(lhs, rhs, dst, kind);
+        break;
+      case float32:
+        binary_op<float, bool, Op>(lhs, rhs, dst, kind);
+        break;
+      case float64:
+        binary_op<double, bool, Op>(lhs, rhs, dst, kind);
+        break;
+      case bfloat16:
+        binary_op<bfloat16_t, bool, Op>(lhs, rhs, dst, kind);
+        break;
+      case complex64:
+        binary_op<complex64_t, bool, Op>(lhs, rhs, dst, kind);
+        break;
+    }
+  });
+}
+
+template <typename Op>
+void binary_float(
+    const array& a,
+    const array& b,
+    array& out,
+    Op /* unused value: only the functor type Op is needed */,
+    Stream stream) {
+  auto kind = get_binary_op_type(a, b);
+  set_binary_op_output_data(a, b, out, kind);
+  // Only float16/32/64, bfloat16 and complex64 outputs run; others throw.
+  auto& enc = cpu::get_command_encoder(stream);
+  enc.set_input_array(a);
+  enc.set_input_array(b);
+  enc.set_output_array(out);
+  enc.dispatch([lhs = array::unsafe_weak_copy(a),
+                rhs = array::unsafe_weak_copy(b),
+                dst = array::unsafe_weak_copy(out),
+                kind]() mutable {
+    switch (dst.dtype()) {
+      case float16:
+        binary_op<float16_t, Op>(lhs, rhs, dst, kind);
+        break;
+      case float32:
+        binary_op<float, Op>(lhs, rhs, dst, kind);
+        break;
+      case float64:
+        binary_op<double, Op>(lhs, rhs, dst, kind);
+        break;
+      case bfloat16:
+        binary_op<bfloat16_t, Op>(lhs, rhs, dst, kind);
+        break;
+      case complex64:
+        binary_op<complex64_t, Op>(lhs, rhs, dst, kind);
+        break;
+      default:
+        throw std::runtime_error(
+            "[binary_float] Only supports floating point types.");
+    }
+  });
+}
+
+template <typename Op>
+void binary_int(
+    const array& a,
+    const array& b,
+    array& out,
+    Op /* unused value: only the functor type Op is needed */,
+    Stream stream) {
+  auto bopt = get_binary_op_type(a, b);
+  set_binary_op_output_data(a, b, out, bopt);
+  // bool and integer outputs only; each case breaks so one kernel runs.
+  auto& encoder = cpu::get_command_encoder(stream);
+  encoder.set_input_array(a);
+  encoder.set_input_array(b);
+  encoder.set_output_array(out);
+  encoder.dispatch([a = array::unsafe_weak_copy(a),
+                    b = array::unsafe_weak_copy(b),
+                    out = array::unsafe_weak_copy(out),
+                    bopt]() mutable {
+    switch (out.dtype()) {
+      case bool_:
+        binary_op<bool, Op>(a, b, out, bopt);
+        break;
+      case uint8:
+        binary_op<uint8_t, Op>(a, b, out, bopt);
+        break;
+      case uint16:
+        binary_op<uint16_t, Op>(a, b, out, bopt);
+        break;
+      case uint32:
+        binary_op<uint32_t, Op>(a, b, out, bopt);
+        break;
+      case uint64:
+        binary_op<uint64_t, Op>(a, b, out, bopt);
+        break;
+      case int8:
+        binary_op<int8_t, Op>(a, b, out, bopt);
+        break;
+      case int16:
+        binary_op<int16_t, Op>(a, b, out, bopt);
+        break;
+      case int32:
+        binary_op<int32_t, Op>(a, b, out, bopt);
+        break;
+      case int64:
+        binary_op<int64_t, Op>(a, b, out, bopt);
+        break;
+      default:
+        throw std::runtime_error("[binary_int] Type not supported");
+    }
+  });
+}
+
+} // namespace
+
 void Add::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 2);
  auto& a = inputs[0];
  auto& b = inputs[1];
-  binary_op_cpu(a, b, out, detail::Add(), stream());
+  binary(a, b, out, detail::Add(), stream());
 }

 void DivMod::eval_cpu(
@@ -102,14 +329,14 @@ void Divide::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 2);
  auto& a = inputs[0];
  auto& b = inputs[1];
-  binary_op_cpu(a, b, out, detail::Divide(), stream());
+  binary(a, b, out, detail::Divide(), stream());
 }

 void Remainder::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 2);
  auto& a = inputs[0];
  auto& b = inputs[1];
-  binary_op_cpu(a, b, out, detail::Remainder(), stream());
+  binary(a, b, out, detail::Remainder(), stream());
 }

 void Equal::eval_cpu(const std::vector<array>& inputs, array& out) {
@@ -150,90 +377,89 @@ void Equal::eval_cpu(const std::vector<array>& inputs, array& out) {
      }
    });
  } else {
-    comparison_op_cpu(a, b, out, detail::Equal(), stream());
+    comparison_op(a, b, out, detail::Equal(), stream());
  }
 }

 void Greater::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 2);
-  comparison_op_cpu(inputs[0], inputs[1], out, detail::Greater(), stream());
+  comparison_op(inputs[0], inputs[1], out, detail::Greater(), stream());
 }

 void GreaterEqual::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 2);
-  comparison_op_cpu(
-      inputs[0], inputs[1], out, detail::GreaterEqual(), stream());
+  comparison_op(inputs[0], inputs[1], out, detail::GreaterEqual(), stream());
 }

 void Less::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 2);
-  comparison_op_cpu(inputs[0], inputs[1], out, detail::Less(), stream());
+  comparison_op(inputs[0], inputs[1], out, detail::Less(), stream());
 }

 void LessEqual::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 2);
-  comparison_op_cpu(inputs[0], inputs[1], out, detail::LessEqual(), stream());
+  comparison_op(inputs[0], inputs[1], out, detail::LessEqual(), stream());
 }

 void LogAddExp::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 2);
  auto& a = inputs[0];
  auto& b = inputs[1];
-  binary_float_op_cpu(a, b, out, detail::LogAddExp(), stream());
+  binary_float(a, b, out, detail::LogAddExp(), stream());
 }

 void LogicalAnd::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 2); // LogicalAnd requires two input arrays
  auto& in1 = inputs[0];
  auto& in2 = inputs[1];
-  binary_op_cpu(in1, in2, out, detail::LogicalAnd(), stream());
+  binary(in1, in2, out, detail::LogicalAnd(), stream());
 }

 void LogicalOr::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 2); // LogicalOr requires two input arrays
  auto& in1 = inputs[0];
  auto& in2 = inputs[1];
-  binary_op_cpu(in1, in2, out, detail::LogicalOr(), stream());
+  binary(in1, in2, out, detail::LogicalOr(), stream());
 }

 void Maximum::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 2);
  auto& a = inputs[0];
  auto& b = inputs[1];
-  binary_op_cpu(a, b, out, detail::Maximum(), stream());
+  binary(a, b, out, detail::Maximum(), stream());
 }

 void Minimum::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 2);
  auto& a = inputs[0];
  auto& b = inputs[1];
-  binary_op_cpu(a, b, out, detail::Minimum(), stream());
+  binary(a, b, out, detail::Minimum(), stream());
 }

 void Multiply::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 2);
  auto& a = inputs[0];
  auto& b = inputs[1];
-  binary_op_cpu(a, b, out, detail::Multiply(), stream());
+  binary(a, b, out, detail::Multiply(), stream());
 }

 void NotEqual::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 2);
-  comparison_op_cpu(inputs[0], inputs[1], out, detail::NotEqual(), stream());
+  comparison_op(inputs[0], inputs[1], out, detail::NotEqual(), stream());
 }

 void Power::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 2);
  auto& a = inputs[0];
  auto& b = inputs[1];
-  binary_op_cpu(a, b, out, detail::Power(), stream());
+  binary(a, b, out, detail::Power(), stream());
 }

 void Subtract::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 2);
  auto& a = inputs[0];
  auto& b = inputs[1];
-  binary_op_cpu(a, b, out, detail::Subtract(), stream());
+  binary(a, b, out, detail::Subtract(), stream());
 }

 void BitwiseBinary::eval_cpu(const std::vector<array>& inputs, array& out) {
@@ -242,19 +468,19 @@ void BitwiseBinary::eval_cpu(const std::vector<array>& inputs, array& out) {
  auto& b = inputs[1];
  switch (op_) {
    case BitwiseBinary::And:
-      binary_int_op_cpu(a, b, out, detail::BitwiseAnd(), stream());
+      binary_int(a, b, out, detail::BitwiseAnd(), stream());
      break;
    case BitwiseBinary::Or:
-      binary_int_op_cpu(a, b, out, detail::BitwiseOr(), stream());
+      binary_int(a, b, out, detail::BitwiseOr(), stream());
      break;
    case BitwiseBinary::Xor:
-      binary_int_op_cpu(a, b, out, detail::BitwiseXor(), stream());
+      binary_int(a, b, out, detail::BitwiseXor(), stream());
      break;
    case BitwiseBinary::LeftShift:
-      binary_int_op_cpu(a, b, out, detail::LeftShift(), stream());
+      binary_int(a, b, out, detail::LeftShift(), stream());
      break;
    case BitwiseBinary::RightShift:
-      binary_int_op_cpu(a, b, out, detail::RightShift(), stream());
+      binary_int(a, b, out, detail::RightShift(), stream());
      break;
  }
 }
@@ -263,7 +489,7 @@ void ArcTan2::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 2);
  const auto& a = inputs[0];
  const auto& b = inputs[1];
-  binary_float_op_cpu(a, b, out, detail::ArcTan2(), stream());
+  binary_float(a, b, out, detail::ArcTan2(), stream());
 }

 } // namespace mlx::core
--- a/mlx/backend/cpu/binary.h
+++ b/mlx/backend/cpu/binary.h
@@ -7,7 +7,6 @@
 #include "mlx/backend/common/binary.h"
 #include "mlx/backend/common/utils.h"

-#include "mlx/backend/cpu/encoder.h"
 #include "mlx/backend/cpu/simd/simd.h"

 namespace mlx::core {
@@ -291,227 +290,4 @@ void binary_op(const array& a, const array& b, array& out, BinaryOpType bopt) {
  binary_op<T, T, Op>(a, b, out, bopt);
 }

-template <typename Op>
-void binary_op_cpu(
-    const array& a,
-    const array& b,
-    array& out,
-    Op op,
-    Stream stream) {
-  auto bopt = get_binary_op_type(a, b);
-  set_binary_op_output_data(a, b, out, bopt);
-
-  auto& encoder = cpu::get_command_encoder(stream);
-  encoder.set_input_array(a);
-  encoder.set_input_array(b);
-  encoder.set_output_array(out);
-  encoder.dispatch([a = array::unsafe_weak_copy(a),
-                    b = array::unsafe_weak_copy(b),
-                    out = array::unsafe_weak_copy(out),
-                    bopt]() mutable {
-    switch (out.dtype()) {
-      case bool_:
-        binary_op<bool, Op>(a, b, out, bopt);
-        break;
-      case uint8:
-        binary_op<uint8_t, Op>(a, b, out, bopt);
-        break;
-      case uint16:
-        binary_op<uint16_t, Op>(a, b, out, bopt);
-        break;
-      case uint32:
-        binary_op<uint32_t, Op>(a, b, out, bopt);
-        break;
-      case uint64:
-        binary_op<uint64_t, Op>(a, b, out, bopt);
-        break;
-      case int8:
-        binary_op<int8_t, Op>(a, b, out, bopt);
-        break;
-      case int16:
-        binary_op<int16_t, Op>(a, b, out, bopt);
-        break;
-      case int32:
-        binary_op<int32_t, Op>(a, b, out, bopt);
-        break;
-      case int64:
-        binary_op<int64_t, Op>(a, b, out, bopt);
-        break;
-      case float16:
-        binary_op<float16_t, Op>(a, b, out, bopt);
-        break;
-      case float32:
-        binary_op<float, Op>(a, b, out, bopt);
-        break;
-      case float64:
-        binary_op<double, Op>(a, b, out, bopt);
-        break;
-      case bfloat16:
-        binary_op<bfloat16_t, Op>(a, b, out, bopt);
-        break;
-      case complex64:
-        binary_op<complex64_t, Op>(a, b, out, bopt);
-        break;
-    }
-  });
-}
-
-template <typename Op>
-void comparison_op_cpu(
-    const array& a,
-    const array& b,
-    array& out,
-    Op op,
-    Stream stream) {
-  auto bopt = get_binary_op_type(a, b);
-  set_binary_op_output_data(a, b, out, bopt);
-
-  auto& encoder = cpu::get_command_encoder(stream);
-  encoder.set_input_array(a);
-  encoder.set_input_array(b);
-  encoder.set_output_array(out);
-  encoder.dispatch([a = array::unsafe_weak_copy(a),
-                    b = array::unsafe_weak_copy(b),
-                    out = array::unsafe_weak_copy(out),
-                    bopt]() mutable {
-    switch (a.dtype()) {
-      case bool_:
-        binary_op<bool, bool, Op>(a, b, out, bopt);
-        break;
-      case uint8:
-        binary_op<uint8_t, bool, Op>(a, b, out, bopt);
-        break;
-      case uint16:
-        binary_op<uint16_t, bool, Op>(a, b, out, bopt);
-        break;
-      case uint32:
-        binary_op<uint32_t, bool, Op>(a, b, out, bopt);
-        break;
-      case uint64:
-        binary_op<uint64_t, bool, Op>(a, b, out, bopt);
-        break;
-      case int8:
-        binary_op<int8_t, bool, Op>(a, b, out, bopt);
-        break;
-      case int16:
-        binary_op<int16_t, bool, Op>(a, b, out, bopt);
-        break;
-      case int32:
-        binary_op<int32_t, bool, Op>(a, b, out, bopt);
-        break;
-      case int64:
-        binary_op<int64_t, bool, Op>(a, b, out, bopt);
-        break;
-      case float16:
-        binary_op<float16_t, bool, Op>(a, b, out, bopt);
-        break;
-      case float32:
-        binary_op<float, bool, Op>(a, b, out, bopt);
-        break;
-      case float64:
-        binary_op<double, bool, Op>(a, b, out, bopt);
-        break;
-      case bfloat16:
-        binary_op<bfloat16_t, bool, Op>(a, b, out, bopt);
-        break;
-      case complex64:
-        binary_op<complex64_t, bool, Op>(a, b, out, bopt);
-        break;
-    }
-  });
-}
-
-template <typename Op>
-void binary_float_op_cpu(
-    const array& a,
-    const array& b,
-    array& out,
-    Op op,
-    Stream stream) {
-  auto bopt = get_binary_op_type(a, b);
-  set_binary_op_output_data(a, b, out, bopt);
-
-  auto& encoder = cpu::get_command_encoder(stream);
-  encoder.set_input_array(a);
-  encoder.set_input_array(b);
-  encoder.set_output_array(out);
-  encoder.dispatch([a = array::unsafe_weak_copy(a),
-                    b = array::unsafe_weak_copy(b),
-                    out = array::unsafe_weak_copy(out),
-                    bopt]() mutable {
-    switch (out.dtype()) {
-      case float16:
-        binary_op<float16_t, Op>(a, b, out, bopt);
-        break;
-      case float32:
-        binary_op<float, Op>(a, b, out, bopt);
-        break;
-      case float64:
-        binary_op<double, Op>(a, b, out, bopt);
-        break;
-      case bfloat16:
-        binary_op<bfloat16_t, Op>(a, b, out, bopt);
-        break;
-      case complex64:
-        binary_op<complex64_t, Op>(a, b, out, bopt);
-        break;
-      default:
-        throw std::runtime_error(
-            "[binary_float] Only supports floating point types.");
-    }
-  });
-}
-
-template <typename Op>
-void binary_int_op_cpu(
-    const array& a,
-    const array& b,
-    array& out,
-    Op op,
-    Stream stream) {
-  auto bopt = get_binary_op_type(a, b);
-  set_binary_op_output_data(a, b, out, bopt);
-
-  auto& encoder = cpu::get_command_encoder(stream);
-  encoder.set_input_array(a);
-  encoder.set_input_array(b);
-  encoder.set_output_array(out);
-  encoder.dispatch([a = array::unsafe_weak_copy(a),
-                    b = array::unsafe_weak_copy(b),
-                    out = array::unsafe_weak_copy(out),
-                    bopt]() mutable {
-    switch (out.dtype()) {
-      case bool_:
-        binary_op<bool, Op>(a, b, out, bopt);
-      case uint8:
-        binary_op<uint8_t, Op>(a, b, out, bopt);
-        break;
-      case uint16:
-        binary_op<uint16_t, Op>(a, b, out, bopt);
-        break;
-      case uint32:
-        binary_op<uint32_t, Op>(a, b, out, bopt);
-        break;
-      case uint64:
-        binary_op<uint64_t, Op>(a, b, out, bopt);
-        break;
-      case int8:
-        binary_op<int8_t, Op>(a, b, out, bopt);
-        break;
-      case int16:
-        binary_op<int16_t, Op>(a, b, out, bopt);
-        break;
-      case int32:
-        binary_op<int32_t, Op>(a, b, out, bopt);
-        break;
-      case int64:
-        binary_op<int64_t, Op>(a, b, out, bopt);
-        break;
-      default:
-        throw std::runtime_error("[binary_int] Type not supported");
-        break;
-    }
-  });
-}
-
 } // namespace mlx::core
--- a/mlx/backend/cpu/binary_two.h
+++ b/mlx/backend/cpu/binary_two.h
@@ -99,7 +99,7 @@ void binary_op_dispatch_dims(
  ContiguousIterator a_it(shape, a_strides, ndim - 2);
  ContiguousIterator b_it(shape, b_strides, ndim - 2);
  auto stride = out_strides[ndim - 3];
-  for (size_t elem = 0; elem < a.size(); elem += stride) {
+  for (int64_t elem = 0; elem < std::ssize(a); elem += stride) {
    binary_op_dims<T, U, Op, 2>(
        a_ptr + a_it.loc,
        b_ptr + b_it.loc,
@@ -137,21 +137,21 @@ void binary_op(
  if (bopt == BinaryOpType::ScalarScalar) {
    std::tie(*out_a_ptr, *out_b_ptr) = op(*a_ptr, *b_ptr);
  } else if (bopt == BinaryOpType::ScalarVector) {
-    for (size_t i = 0; i < b.data_size(); ++i) {
+    for (int64_t i = 0; i < b.data_size(); ++i) {
      std::tie(*out_a_ptr, *out_b_ptr) = op(*a_ptr, *b_ptr);
      out_a_ptr++;
      out_b_ptr++;
      b_ptr++;
    }
  } else if (bopt == BinaryOpType::VectorScalar) {
-    for (size_t i = 0; i < a.data_size(); ++i) {
+    for (int64_t i = 0; i < a.data_size(); ++i) {
      std::tie(*out_a_ptr, *out_b_ptr) = op(*a_ptr, *b_ptr);
      out_a_ptr++;
      out_b_ptr++;
      a_ptr++;
    }
  } else { // VectorVector
-    for (size_t i = 0; i < a.size(); ++i) {
+    for (int64_t i = 0; i < a.size(); ++i) {
      std::tie(*out_a_ptr, *out_b_ptr) = op(*a_ptr, *b_ptr);
      out_a_ptr++;
      out_b_ptr++;
--- a/mlx/backend/cpu/cholesky.cpp
+++ b/mlx/backend/cpu/cholesky.cpp
@@ -33,8 +33,8 @@ void cholesky_impl(const array& a, array& factor, bool upper, Stream stream) {
                    N = a.shape(-1),
                    size = a.size()]() mutable {
    char uplo = (upper) ? 'L' : 'U';
-    size_t num_matrices = size / (N * N);
-    for (int i = 0; i < num_matrices; i++) {
+    int64_t num_matrices = size / (N * N);
+    for (int64_t i = 0; i < num_matrices; i++) {
      // Compute Cholesky factorization.
      int info;
      potrf<T>(
--- a/mlx/backend/cpu/compiled.cpp
+++ b/mlx/backend/cpu/compiled.cpp
@@ -49,7 +49,7 @@ static CompilerCache& cache() {
 // GPU compile is always available if the GPU is available and since we are in
 // this file CPU compile is also available.
 namespace detail {
-bool compile_available_for_device(const Device& device) {
+bool compile_available_for_device(const Device& /* device */) {
  return true;
 }

@@ -168,7 +168,7 @@ inline void build_kernel(
  // Add the input arguments
  int cnt = 0;
  int strides_index = 1;
-  for (size_t i = 0; i < inputs.size(); ++i) {
+  for (int i = 0; i < std::ssize(inputs); ++i) {
    // Skip constants from the input list
    if (is_constant(i)) {
      continue;
@@ -238,7 +238,7 @@ inline void build_kernel(
    } else {
      os << x.primitive().name();
      os << "()(";
-      for (int i = 0; i < x.inputs().size() - 1; i++) {
+      for (int i = 0; i < std::ssize(x.inputs()) - 1; i++) {
        os << "tmp_" << namer.get_name(x.inputs()[i]) << ", ";
      }
      os << "tmp_" << namer.get_name(x.inputs().back()) << ");" << std::endl;
--- a/mlx/backend/cpu/conv.cpp
+++ b/mlx/backend/cpu/conv.cpp
@@ -860,7 +860,7 @@ void explicit_gemm_conv_1D_cpu(
    const std::vector<int>& padding_lo,
    const std::vector<int>& padding_hi,
    const std::vector<int>& wt_strides,
-    const std::vector<int>& wt_dilation,
+    const std::vector<int>& /* wt_dilation */,
    Stream stream) {
  const int N = in.shape(0); // Batch size, should be the same as out.shape(0)
  const int iH = in.shape(1); // Input spatial dim
@@ -1003,7 +1003,7 @@ void explicit_gemm_conv_ND_cpu(
    const std::vector<int>& padding_lo,
    const std::vector<int>& padding_hi,
    const std::vector<int>& wt_strides,
-    const std::vector<int>& wt_dilation,
+    const std::vector<int>& /* wt_dilation */,
    const bool flip,
    Stream stream) {
  const int N = in.shape(0); // Batch size, should be the same as out.shape(0)
@@ -1023,7 +1023,7 @@ void explicit_gemm_conv_ND_cpu(
  // Pad input
  Shape padded_shape(in.shape().size());
  padded_shape.front() = N;
-  for (size_t i = 0; i < iDim.size(); i++) {
+  for (int i = 0; i < iDim.size(); i++) {
    padded_shape[i + 1] = iDim[i] + padding_lo[i] + padding_hi[i];
  }
  padded_shape.back() = C;
@@ -1054,20 +1054,20 @@ void explicit_gemm_conv_ND_cpu(
  // Make strided view
  Shape strided_shape(oDim.size() + wDim.size() + 2);
  strided_shape.front() = N;
-  for (size_t i = 0; i < oDim.size(); i++) {
+  for (int i = 0; i < oDim.size(); i++) {
    strided_shape[i + 1] = oDim[i];
  }
-  for (size_t i = 0; i < wDim.size(); i++) {
+  for (int i = 0; i < wDim.size(); i++) {
    strided_shape[i + 1 + oDim.size()] = wDim[i];
  }
  strided_shape.back() = C;

  Strides strided_strides(in.shape().size() * 2 - 2);
  strided_strides[0] = in_padded.strides()[0];
-  for (size_t i = 0; i < wt_strides.size(); i++) {
+  for (int i = 0; i < std::ssize(wt_strides); i++) {
    strided_strides[i + 1] = in_padded.strides()[i + 1] * wt_strides[i];
  }
-  for (size_t i = 1; i < in_padded.strides().size(); i++) {
+  for (int i = 1; i < std::ssize(in_padded.strides()); i++) {
    strided_strides[i + wt_strides.size()] = in_padded.strides()[i];
  }

--- a/mlx/backend/cpu/distributed.cpp
+++ b/mlx/backend/cpu/distributed.cpp
@@ -90,14 +90,10 @@ void Recv::eval_cpu(
    std::vector<array>& outputs) {
  assert(inputs.size() == 0);
  assert(outputs.size() == 1);
+  (void)inputs;

  outputs[0].set_data(allocator::malloc(outputs[0].nbytes()));
  distributed::detail::recv(group(), outputs[0], src_, stream());
 }

-void ReduceScatter::eval_cpu(
-    const std::vector<array>& inputs,
-    std::vector<array>& outputs) {
-  throw std::runtime_error("[ReduceScatter] Not implemented yet.");
-}
 } // namespace mlx::core::distributed
--- a/mlx/backend/cpu/eig.cpp
+++ b/mlx/backend/cpu/eig.cpp
@@ -12,167 +12,6 @@ namespace mlx::core {

 namespace {

-template <typename T>
-complex64_t to_complex(T r, T i) {
-  return {static_cast<float>(r), static_cast<float>(i)};
-}
-
-template <typename T, class Enable = void>
-struct EigWork {};
-
-template <typename T>
-struct EigWork<
-    T,
-    typename std::enable_if<std::is_floating_point<T>::value>::type> {
-  using O = complex64_t;
-
-  char jobl;
-  char jobr;
-  int N;
-  int lwork;
-  int info;
-  std::vector<array::Data> buffers;
-
-  EigWork(char jobl_, char jobr_, int N_, bool compute_eigenvectors)
-      : jobl(jobl_), jobr(jobr_), N(N_), lwork(-1) {
-    T work;
-    int n_vecs_l = compute_eigenvectors ? N_ : 1;
-    int n_vecs_r = 1;
-    geev<T>(
-        &jobl,
-        &jobr,
-        &N,
-        nullptr,
-        &N,
-        nullptr,
-        nullptr,
-        nullptr,
-        &n_vecs_l,
-        nullptr,
-        &n_vecs_r,
-        &work,
-        &lwork,
-        &info);
-    lwork = static_cast<int>(work);
-
-    buffers.emplace_back(allocator::malloc(sizeof(T) * N * 2));
-    if (compute_eigenvectors) {
-      buffers.emplace_back(allocator::malloc(sizeof(T) * N * N * 2));
-    }
-    buffers.emplace_back(allocator::malloc(sizeof(T) * lwork));
-  }
-
-  void run(T* a, O* values, O* vectors) {
-    auto eig_tmp = static_cast<T*>(buffers[0].buffer.raw_ptr());
-    T* vec_tmp = nullptr;
-    if (vectors) {
-      vec_tmp = static_cast<T*>(buffers[1].buffer.raw_ptr());
-    }
-    auto work = static_cast<T*>(buffers.back().buffer.raw_ptr());
-
-    int n_vecs_l = vectors ? N : 1;
-    int n_vecs_r = 1;
-    geev<T>(
-        &jobl,
-        &jobr,
-        &N,
-        a,
-        &N,
-        eig_tmp,
-        eig_tmp + N,
-        vectors ? vec_tmp : nullptr,
-        &n_vecs_l,
-        nullptr,
-        &n_vecs_r,
-        work,
-        &lwork,
-        &info);
-
-    for (int i = 0; i < N; ++i) {
-      values[i] = to_complex(eig_tmp[i], eig_tmp[N + i]);
-    }
-
-    if (vectors) {
-      for (int i = 0; i < N; ++i) {
-        if (values[i].imag() != 0) {
-          for (int j = 0; j < N; ++j) {
-            vectors[i * N + j] =
-                to_complex(vec_tmp[i * N + j], -vec_tmp[(i + 1) * N + j]);
-            vectors[(i + 1) * N + j] =
-                to_complex(vec_tmp[i * N + j], vec_tmp[(i + 1) * N + j]);
-          }
-          i += 1;
-        } else {
-          for (int j = 0; j < N; ++j) {
-            vectors[i * N + j] = to_complex(vec_tmp[i * N + j], T(0.0));
-          }
-        }
-      }
-    }
-  }
-};
-
-template <>
-struct EigWork<std::complex<float>> {
-  using T = std::complex<float>;
-  using R = float;
-  using O = T;
-
-  char jobl;
-  char jobr;
-  int N;
-  int lwork;
-  int lrwork;
-  int info;
-  std::vector<array::Data> buffers;
-
-  EigWork(char jobl_, char jobr_, int N_, bool compute_eigenvectors)
-      : jobl(jobl_), jobr(jobr_), N(N_), lwork(-1), lrwork(2 * N_) {
-    T work;
-    R rwork;
-    int n_vecs_l = compute_eigenvectors ? N_ : 1;
-    int n_vecs_r = 1;
-    geev<T>(
-        &jobl,
-        &jobr,
-        &N,
-        nullptr,
-        &N,
-        nullptr,
-        nullptr,
-        &n_vecs_l,
-        nullptr,
-        &n_vecs_r,
-        &work,
-        &lwork,
-        &rwork,
-        &info);
-    lwork = static_cast<int>(work.real());
-    buffers.emplace_back(allocator::malloc(sizeof(T) * lwork));
-    buffers.emplace_back(allocator::malloc(sizeof(R) * lrwork));
-  }
-
-  void run(T* a, T* values, T* vectors) {
-    int n_vecs_l = vectors ? N : 1;
-    int n_vecs_r = 1;
-    geev<T>(
-        &jobl,
-        &jobr,
-        &N,
-        a,
-        &N,
-        values,
-        vectors,
-        &n_vecs_l,
-        nullptr,
-        &n_vecs_r,
-        static_cast<T*>(buffers[0].buffer.raw_ptr()),
-        &lwork,
-        static_cast<R*>(buffers[1].buffer.raw_ptr()),
-        &info);
-  }
-};
-
 template <typename T>
 void eig_impl(
    array& a,
@@ -180,39 +19,101 @@ void eig_impl(
    array& values,
    bool compute_eigenvectors,
    Stream stream) {
+  using OT = std::complex<T>;
  auto a_ptr = a.data<T>();
-  auto val_ptr = values.data<complex64_t>();
+  auto eig_ptr = values.data<OT>();

  auto& encoder = cpu::get_command_encoder(stream);
  encoder.set_input_array(a);
  encoder.set_output_array(values);
-  complex64_t* vec_ptr = nullptr;
+  OT* vec_ptr = nullptr;
  if (compute_eigenvectors) {
    encoder.set_output_array(vectors);
-    vec_ptr = vectors.data<complex64_t>();
+    vec_ptr = vectors.data<OT>();
  }
  encoder.dispatch([a_ptr,
-                    val_ptr,
                    vec_ptr,
+                    eig_ptr,
                    compute_eigenvectors,
                    N = vectors.shape(-1),
                    size = vectors.size()]() mutable {
+    // Workspace query: with lwork = -1, geev only reports the needed size in work
    char jobr = 'N';
    char jobl = compute_eigenvectors ? 'V' : 'N';
+    int n_vecs_r = 1;
+    int n_vecs_l = compute_eigenvectors ? N : 1;
+    int lwork = -1;
+    int info;
+    {
+      T work;
+      geev<T>(
+          &jobl,
+          &jobr,
+          &N,
+          nullptr,
+          &N,
+          nullptr,
+          nullptr,
+          nullptr,
+          &n_vecs_l,
+          nullptr,
+          &n_vecs_r,
+          &work,
+          &lwork,
+          &info);
+      lwork = static_cast<int>(work);
+    }

-    EigWork<T> work(jobl, jobr, N, compute_eigenvectors);
-
-    for (size_t i = 0; i < size / (N * N); ++i) {
-      work.run(a_ptr, val_ptr, vec_ptr);
-      a_ptr += N * N;
-      val_ptr += N;
+    auto eig_tmp_data = array::Data{allocator::malloc(sizeof(T) * N * 2)};
+    auto vec_tmp_data =
+        array::Data{allocator::malloc(vec_ptr ? sizeof(T) * N * N * 2 : 0)};
+    auto eig_tmp = static_cast<T*>(eig_tmp_data.buffer.raw_ptr());
+    auto vec_tmp = static_cast<T*>(vec_tmp_data.buffer.raw_ptr());
+    auto work_buf = array::Data{allocator::malloc(sizeof(T) * lwork)};
+    for (int64_t i = 0; i < size / (N * N); ++i) {
+      geev<T>(
+          &jobl,
+          &jobr,
+          &N,
+          a_ptr,
+          &N,
+          eig_tmp,
+          eig_tmp + N,
+          vec_tmp,
+          &n_vecs_l,
+          nullptr,
+          &n_vecs_r,
+          static_cast<T*>(work_buf.buffer.raw_ptr()),
+          &lwork,
+          &info);
+      for (int i = 0; i < N; ++i) {
+        eig_ptr[i] = {eig_tmp[i], eig_tmp[N + i]};
+      }
      if (vec_ptr) {
+        for (int i = 0; i < N; ++i) {
+          if (eig_ptr[i].imag() != 0) {
+            // Conjugate pair: rows i, i + 1 of vec_tmp are the real/imag parts
+            for (int j = 0; j < N; ++j) {
+              vec_ptr[i * N + j] = {
+                  vec_tmp[i * N + j], -vec_tmp[(i + 1) * N + j]};
+              vec_ptr[(i + 1) * N + j] = {
+                  vec_tmp[i * N + j], vec_tmp[(i + 1) * N + j]};
+            }
+            i += 1;
+          } else {
+            for (int j = 0; j < N; ++j) {
+              vec_ptr[i * N + j] = {vec_tmp[i * N + j], 0};
+            }
+          }
+        }
        vec_ptr += N * N;
      }
-      if (work.info != 0) {
+      a_ptr += N * N;
+      eig_ptr += N;
+      if (info != 0) {
        std::stringstream msg;
        msg << "[Eig::eval_cpu] Eigenvalue decomposition failed with error code "
-            << work.info;
+            << info;
        throw std::runtime_error(msg.str());
      }
    }
@@ -264,17 +165,8 @@ void Eig::eval_cpu(
    case float32:
      eig_impl<float>(a_copy, vectors, values, compute_eigenvectors_, stream());
      break;
-    case float64:
-      eig_impl<double>(
-          a_copy, vectors, values, compute_eigenvectors_, stream());
-      break;
-    case complex64:
-      eig_impl<std::complex<float>>(
-          a_copy, vectors, values, compute_eigenvectors_, stream());
-      break;
    default:
-      throw std::runtime_error(
-          "[Eig::eval_cpu] only supports float32, float64, or complex64.");
+      throw std::runtime_error("[Eig::eval_cpu] only supports float32.");
  }
 }

--- a/mlx/backend/cpu/eigh.cpp
+++ b/mlx/backend/cpu/eigh.cpp
@@ -165,7 +165,7 @@ void eigh_impl(
    EighWork<T> work(jobz, uplo, N);

    // Work loop
-    for (size_t i = 0; i < size / (N * N); ++i) {
+    for (int64_t i = 0; i < size / (N * N); ++i) {
      work.run(vec_ptr, eig_ptr);
      vec_ptr += N * N;
      eig_ptr += N;
--- a/mlx/backend/cpu/encoder.h
+++ b/mlx/backend/cpu/encoder.h
@@ -20,8 +20,8 @@ struct CommandEncoder {
  CommandEncoder(CommandEncoder&&) = delete;
  CommandEncoder& operator=(CommandEncoder&&) = delete;

-  void set_input_array(const array& a) {}
-  void set_output_array(array& a) {}
+  void set_input_array(const array& /* a */) {}
+  void set_output_array(array& /* a */) {}

  // Hold onto a temporary until any already scheduled tasks which use it as
  // an input are complete.
--- a/mlx/backend/cpu/gemm.h
+++ b/mlx/backend/cpu/gemm.h
@@ -12,12 +12,12 @@ void matmul(
    T* out,
    bool a_transposed,
    bool b_transposed,
-    size_t lda,
-    size_t ldb,
-    size_t ldc,
+    int64_t lda,
+    int64_t ldb,
+    int64_t ldc,
    float alpha,
    float beta,
-    size_t batch_size,
+    int64_t batch_size,
    const Shape& a_shape,
    const Strides& a_strides,
    const Shape& b_shape,
--- a/mlx/backend/cpu/gemms/bnns.cpp
+++ b/mlx/backend/cpu/gemms/bnns.cpp
@@ -34,7 +34,7 @@ void matmul_bnns(
    bool b_transposed,
    size_t lda,
    size_t ldb,
-    size_t ldc,
+    size_t /* ldc */,
    float alpha,
    float beta,
    size_t batch_size,
@@ -52,7 +52,7 @@ void matmul_bnns(
 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
  if (beta != 1.0 && beta != 0.0) {
    // scale the output
-    for (auto i = 0; i < batch_size * M * N; ++i) {
+    for (size_t i = 0; i < batch_size * M * N; ++i) {
      out[i] *= beta;
    }
    beta = 1.0;
@@ -127,7 +127,7 @@ void matmul_bnns(
  auto bnns_filter =
      BNNSFilterCreateLayerBroadcastMatMul(&gemm_params, nullptr);

-  for (int i = 0; i < batch_size; ++i) {
+  for (size_t i = 0; i < batch_size; ++i) {
    BNNSFilterApplyTwoInput(
        bnns_filter,
        reinterpret_cast<const uint8_t*>(
@@ -148,12 +148,12 @@ void matmul<float16_t>(
    float16_t* out,
    bool a_transposed,
    bool b_transposed,
-    size_t lda,
-    size_t ldb,
-    size_t ldc,
+    int64_t lda,
+    int64_t ldb,
+    int64_t ldc,
    float alpha,
    float beta,
-    size_t batch_size,
+    int64_t batch_size,
    const Shape& a_shape,
    const Strides& a_strides,
    const Shape& b_shape,
@@ -183,12 +183,12 @@ void matmul<bfloat16_t>(
    bfloat16_t* out,
    bool a_transposed,
    bool b_transposed,
-    size_t lda,
-    size_t ldb,
-    size_t ldc,
+    int64_t lda,
+    int64_t ldb,
+    int64_t ldc,
    float alpha,
    float beta,
-    size_t batch_size,
+    int64_t batch_size,
    const Shape& a_shape,
    const Strides& a_strides,
    const Shape& b_shape,
--- a/mlx/backend/cpu/gemms/cblas.cpp
+++ b/mlx/backend/cpu/gemms/cblas.cpp
@@ -13,20 +13,20 @@ void matmul<float>(
    float* out,
    bool a_transposed,
    bool b_transposed,
-    size_t lda,
-    size_t ldb,
-    size_t ldc,
+    int64_t lda,
+    int64_t ldb,
+    int64_t ldc,
    float alpha,
    float beta,
-    size_t batch_size,
+    int64_t batch_size,
    const Shape& a_shape,
    const Strides& a_strides,
    const Shape& b_shape,
    const Strides& b_strides) {
  auto ndim = a_shape.size();
-  size_t M = a_shape[ndim - 2];
-  size_t N = b_shape[ndim - 1];
-  size_t K = a_shape[ndim - 1];
+  int64_t M = a_shape[ndim - 2];
+  int64_t N = b_shape[ndim - 1];
+  int64_t K = a_shape[ndim - 1];

  for (int i = 0; i < batch_size; ++i) {
    cblas_sgemm(
@@ -54,20 +54,20 @@ void matmul<double>(
    double* out,
    bool a_transposed,
    bool b_transposed,
-    size_t lda,
-    size_t ldb,
-    size_t ldc,
+    int64_t lda,
+    int64_t ldb,
+    int64_t ldc,
    float alpha,
    float beta,
-    size_t batch_size,
+    int64_t batch_size,
    const Shape& a_shape,
    const Strides& a_strides,
    const Shape& b_shape,
    const Strides& b_strides) {
  auto ndim = a_shape.size();
-  size_t M = a_shape[ndim - 2];
-  size_t N = b_shape[ndim - 1];
-  size_t K = a_shape[ndim - 1];
+  int64_t M = a_shape[ndim - 2];
+  int64_t N = b_shape[ndim - 1];
+  int64_t K = a_shape[ndim - 1];

  for (int i = 0; i < batch_size; ++i) {
    cblas_dgemm(
@@ -95,20 +95,20 @@ void matmul<complex64_t>(
    complex64_t* out,
    bool a_transposed,
    bool b_transposed,
-    size_t lda,
-    size_t ldb,
-    size_t ldc,
+    int64_t lda,
+    int64_t ldb,
+    int64_t ldc,
    float alpha,
    float beta,
-    size_t batch_size,
+    int64_t batch_size,
    const Shape& a_shape,
    const Strides& a_strides,
    const Shape& b_shape,
    const Strides& b_strides) {
  auto ndim = a_shape.size();
-  size_t M = a_shape[ndim - 2];
-  size_t N = b_shape[ndim - 1];
-  size_t K = a_shape[ndim - 1];
+  int64_t M = a_shape[ndim - 2];
+  int64_t N = b_shape[ndim - 1];
+  int64_t K = a_shape[ndim - 1];
  auto calpha = static_cast<complex64_t>(alpha);
  auto cbeta = static_cast<complex64_t>(beta);

--- a/mlx/backend/cpu/hadamard.cpp
+++ b/mlx/backend/cpu/hadamard.cpp
@@ -11,9 +11,9 @@ namespace mlx::core {

 // n = 2^k component
 template <typename T>
-void hadamard_n(T* out, int n, int m, float scale, size_t size) {
+void hadamard_n(T* out, int n, int /* m */, float scale, int64_t size) {
  for (int b = 0; b < size / n; b++) {
-    size_t loc = b * n;
+    int64_t loc = b * n;
    T* data_ptr = out + loc;
    int h = 1;
    int n_over_2 = n / 2;
@@ -37,7 +37,7 @@ void hadamard_n(T* out, int n, int m, float scale, size_t size) {

 // m component
 template <typename T>
-void hadamard_m(T* out, int n, int m, float scale, size_t size) {
+void hadamard_m(T* out, int n, int m, float scale, int64_t size) {
  auto h_matrices = hadamard_matrices();
  auto& matrix = h_matrices[m];
  auto start = 1;
@@ -45,7 +45,7 @@ void hadamard_m(T* out, int n, int m, float scale, size_t size) {
  std::vector<bool> hmat_vec;
  while (end != std::string_view::npos) {
    auto row = matrix.substr(start, end - start);
-    for (int i = 0; i < row.length(); i++) {
+    for (int i = 0; i < std::ssize(row); i++) {
      hmat_vec.push_back(row[i] == '+');
    }
    start = end + 1;
@@ -53,7 +53,7 @@ void hadamard_m(T* out, int n, int m, float scale, size_t size) {
  }

  for (int b = 0; b < size / m / n; b++) {
-    size_t loc = b * n * m;
+    int64_t loc = b * n * m;
    T* data_ptr = out + loc;
    for (int i = 0; i < n; i++) {
      std::vector<float> out(m);
--- a/mlx/backend/cpu/indexing.cpp
+++ b/mlx/backend/cpu/indexing.cpp
@@ -78,7 +78,7 @@ void gather(
    can_copy = true;

    // Ignore leading 1s
-    int i = 0;
+    int64_t i = 0;
    for (; i < slice_sizes.size() && slice_sizes[i] == 1; ++i)
      ;

@@ -91,7 +91,7 @@ void gather(
    can_copy = true;

    // Ignore trailing 1s
-    int i = slice_sizes.size() - 1;
+    int64_t i = slice_sizes.size() - 1;
    for (; i >= 0 && slice_sizes[i] == 1; --i)
      ;

@@ -101,11 +101,11 @@ void gather(
      can_copy = (src.shape(i) == slice_sizes[i]);
    }
  }
-  size_t slice_size = 1;
+  int64_t slice_size = 1;
  for (auto s : slice_sizes) {
    slice_size *= s;
  }
-  size_t ind_size = slice_size == 0 ? 0 : out.size() / slice_size;
+  int64_t ind_size = slice_size == 0 ? 0 : out.size() / slice_size;
  const T* src_ptr = src.data<T>();
  T* dst_ptr = out.data<T>();

@@ -115,10 +115,10 @@ void gather(
    src_it = ContiguousIterator(slice_sizes, src.strides(), src.ndim());
  }

-  size_t out_idx = 0;
-  for (int idx = 0; idx < ind_size; idx++) {
-    size_t src_idx = 0;
-    for (int ii = 0; ii < inds.size(); ++ii) {
+  int64_t out_idx = 0;
+  for (int64_t idx = 0; idx < ind_size; idx++) {
+    int64_t src_idx = 0;
+    for (int ii = 0; ii < std::ssize(inds); ++ii) {
      auto ax = axes[ii];
      auto idx_loc = its[ii].loc;
      its[ii].step();
@@ -134,7 +134,7 @@ void gather(
          src_ptr + src_idx, src_ptr + src_idx + slice_size, dst_ptr + out_idx);
      out_idx += slice_size;
    } else {
-      for (int jj = 0; jj < slice_size; jj++) {
+      for (int64_t jj = 0; jj < slice_size; jj++) {
        dst_ptr[out_idx++] = src_ptr[src_idx + src_it.loc];
        src_it.step();
      }
@@ -403,11 +403,11 @@ void scatter(
    const std::vector<int>& axes) {
  int nind = inds.size();
  auto inds_ndim = updates.ndim() - out.ndim();
-  size_t n_updates = nind ? inds[0].size() : 1;
+  int64_t n_updates = nind ? inds[0].size() : 1;

  Shape update_shape(
      updates.shape().begin() + inds_ndim, updates.shape().end());
-  size_t update_size = 1;
+  int64_t update_size = 1;
  for (auto us : update_shape) {
    update_size *= us;
  }
@@ -418,9 +418,9 @@ void scatter(

  auto out_ptr = out.data<InT>();
  auto upd_ptr = updates.data<InT>();
-  for (int i = 0; i < n_updates; ++i) {
-    size_t out_offset = 0;
-    for (int j = 0; j < inds.size(); ++j) {
+  for (int64_t i = 0; i < n_updates; ++i) {
+    int64_t out_offset = 0;
+    for (int j = 0; j < std::ssize(inds); ++j) {
      auto ax = axes[j];
      auto idx_loc = its[j].loc;
      its[j].step();
@@ -429,7 +429,7 @@ void scatter(
      out_offset += (idx_val * out.strides()[ax]);
    }
    update_it.seek(i * update_size);
-    for (int j = 0; j < update_size; ++j) {
+    for (int64_t j = 0; j < update_size; ++j) {
      OpT{}(upd_ptr[update_it.loc], out_ptr + out_offset + out_it.loc);
      update_it.step();
      out_it.step();
@@ -747,108 +747,4 @@ void ScatterAxis::eval_cpu(const std::vector<array>& inputs, array& out) {
  });
 }

-template <typename T>
-void masked_scatter_impl(const array& mask, const array& src, array& out) {
-  ContiguousIterator mask_it(mask);
-  ContiguousIterator src_it(src);
-  ContiguousIterator out_it(out);
-
-  const bool* mask_ptr = mask.data<bool>();
-  const T* src_ptr = src.data<T>();
-  T* dst_ptr = out.data<T>();
-
-  const size_t batch_count = mask.shape(0);
-  const size_t mask_batch_size = mask.size() / batch_count;
-  const size_t src_batch_size = src.size() / batch_count;
-
-  for (uint b = 0; b < batch_count; ++b) {
-    size_t src_consumed = 0;
-    src_it.seek(b * src_batch_size);
-
-    for (size_t i = 0; i < mask_batch_size; ++i) {
-      if (mask_ptr[mask_it.loc]) {
-        if (src_consumed >= src_batch_size) {
-          throw std::runtime_error(
-              "[MaskedScatter::eval_cpu] Source does not have enough elements for mask.");
-        }
-        dst_ptr[out_it.loc] = src_ptr[src_it.loc];
-        src_it.step();
-        ++src_consumed;
-      }
-      mask_it.step();
-      out_it.step();
-    }
-  }
-}
-
-void MaskedScatter::eval_cpu(const std::vector<array>& inputs, array& out) {
-  assert(inputs.size() == 3);
-
-  auto& dst = inputs[0];
-  auto& mask = inputs[1];
-  auto& src = inputs[2];
-
-  // Copy src into out (copy allocates memory for out)
-  auto ctype =
-      dst.flags().row_contiguous ? CopyType::Vector : CopyType::General;
-  copy_cpu(dst, out, ctype, stream());
-
-  if (mask.size() == 0) {
-    return;
-  }
-
-  auto& encoder = cpu::get_command_encoder(stream());
-  encoder.set_input_array(mask);
-  encoder.set_input_array(src);
-  encoder.set_output_array(out);
-  encoder.dispatch([mask = array::unsafe_weak_copy(mask),
-                    src = array::unsafe_weak_copy(src),
-                    out = array::unsafe_weak_copy(out)]() mutable {
-    switch (out.dtype()) {
-      case bool_:
-        masked_scatter_impl<bool>(mask, src, out);
-        break;
-      case uint8:
-        masked_scatter_impl<uint8_t>(mask, src, out);
-        break;
-      case uint16:
-        masked_scatter_impl<uint16_t>(mask, src, out);
-        break;
-      case uint32:
-        masked_scatter_impl<uint32_t>(mask, src, out);
-        break;
-      case uint64:
-        masked_scatter_impl<uint64_t>(mask, src, out);
-        break;
-      case int8:
-        masked_scatter_impl<int8_t>(mask, src, out);
-        break;
-      case int16:
-        masked_scatter_impl<int16_t>(mask, src, out);
-        break;
-      case int32:
-        masked_scatter_impl<int32_t>(mask, src, out);
-        break;
-      case int64:
-        masked_scatter_impl<int64_t>(mask, src, out);
-        break;
-      case float16:
-        masked_scatter_impl<float16_t>(mask, src, out);
-        break;
-      case float32:
-        masked_scatter_impl<float>(mask, src, out);
-        break;
-      case float64:
-        masked_scatter_impl<double>(mask, src, out);
-        break;
-      case bfloat16:
-        masked_scatter_impl<bfloat16_t>(mask, src, out);
-        break;
-      case complex64:
-        masked_scatter_impl<complex64_t>(mask, src, out);
-        break;
-    }
-  });
-}
-
 } // namespace mlx::core
--- a/mlx/backend/cpu/inverse.cpp
+++ b/mlx/backend/cpu/inverse.cpp
@@ -122,7 +122,7 @@ void inverse_impl(
      stream);

  const int N = a.shape(-1);
-  const size_t num_matrices = a.size() / (N * N);
+  const int64_t num_matrices = a.size() / (N * N);

  auto& encoder = cpu::get_command_encoder(stream);
  encoder.set_output_array(inv);
@@ -130,13 +130,13 @@ void inverse_impl(
  auto inv_ptr = inv.data<T>();
  if (tri) {
    encoder.dispatch([inv_ptr, N, num_matrices, upper]() {
-      for (int i = 0; i < num_matrices; i++) {
+      for (int64_t i = 0; i < num_matrices; i++) {
        tri_inv<T>(inv_ptr + N * N * i, N, upper);
      }
    });
  } else {
    encoder.dispatch([inv_ptr, N, num_matrices]() {
-      for (int i = 0; i < num_matrices; i++) {
+      for (int64_t i = 0; i < num_matrices; i++) {
        general_inv<T>(inv_ptr + N * N * i, N);
      }
    });
--- a/mlx/backend/cpu/lapack.h
+++ b/mlx/backend/cpu/lapack.h
@@ -45,7 +45,9 @@
 INSTANTIATE_LAPACK_REAL(geqrf)
 INSTANTIATE_LAPACK_REAL(orgqr)
 INSTANTIATE_LAPACK_REAL(syevd)
+INSTANTIATE_LAPACK_REAL(geev)
 INSTANTIATE_LAPACK_REAL(potrf)
+INSTANTIATE_LAPACK_REAL(gesdd)
 INSTANTIATE_LAPACK_REAL(getrf)
 INSTANTIATE_LAPACK_REAL(getri)
 INSTANTIATE_LAPACK_REAL(trtri)
@@ -61,20 +63,3 @@ INSTANTIATE_LAPACK_REAL(trtri)
  }

 INSTANTIATE_LAPACK_COMPLEX(heevd)
-
-#define INSTANTIATE_LAPACK_ALL(FUNC)                                \
-  template <typename T, typename... Args>                           \
-  void FUNC(Args... args) {                                         \
-    if constexpr (std::is_same_v<T, float>) {                       \
-      MLX_LAPACK_FUNC(s##FUNC)(std::forward<Args>(args)...);        \
-    } else if constexpr (std::is_same_v<T, double>) {               \
-      MLX_LAPACK_FUNC(d##FUNC)(std::forward<Args>(args)...);        \
-    } else if constexpr (std::is_same_v<T, std::complex<float>>) {  \
-      MLX_LAPACK_FUNC(c##FUNC)(std::forward<Args>(args)...);        \
-    } else if constexpr (std::is_same_v<T, std::complex<double>>) { \
-      MLX_LAPACK_FUNC(z##FUNC)(std::forward<Args>(args)...);        \
-    }                                                               \
-  }
-
-INSTANTIATE_LAPACK_ALL(geev)
-INSTANTIATE_LAPACK_ALL(gesdd)
--- a/mlx/backend/cpu/masked_mm.cpp
+++ b/mlx/backend/cpu/masked_mm.cpp
@@ -25,7 +25,7 @@ inline void mask_matrix(
    const int64_t Y_data_str,
    const int64_t X_mask_str,
    const int64_t Y_mask_str,
-    const size_t mask_offset) {
+    const int64_t mask_offset) {
  int tX = (X + block_size - 1) / block_size;
  int tY = (Y + block_size - 1) / block_size;

@@ -61,13 +61,13 @@ inline void segmented_mm(
    T* out,
    bool a_transposed,
    bool b_transposed,
-    size_t lda,
-    size_t ldb,
+    int64_t lda,
+    int64_t ldb,
    const Shape& a_shape,
    const Strides& a_strides,
    const Shape& b_shape,
    const Strides& b_strides,
-    size_t num_segments,
+    int64_t num_segments,
    const Shape& segments_shape,
    const Strides& segments_strides) {
  int ndim = a_shape.size();
@@ -149,9 +149,9 @@ void BlockMaskedMM::eval_cpu(const std::vector<array>& inputs, array& out) {
  auto [b_transposed, ldb, b, b_copied] =
      check_transpose(b_pre, has_op_mask, inputs.back().dtype() != bool_);

-  size_t M = a.shape(-2);
-  size_t N = b.shape(-1);
-  size_t K = a.shape(-1);
+  int64_t M = a.shape(-2);
+  int64_t N = b.shape(-1);
+  int64_t K = a.shape(-1);

  if (M == 0 || N == 0) {
    return;
@@ -172,8 +172,8 @@ void BlockMaskedMM::eval_cpu(const std::vector<array>& inputs, array& out) {
                       int batch_idx,
                       int X,
                       int Y,
-                       size_t X_data_str,
-                       size_t Y_data_str,
+                       int64_t X_data_str,
+                       int64_t Y_data_str,
                       const Shape& mask_shape,
                       const Strides& mask_strides,
                       bool is_bool) {
@@ -253,7 +253,7 @@ void BlockMaskedMM::eval_cpu(const std::vector<array>& inputs, array& out) {
  auto a_ptr = a.data<float>();
  auto b_ptr = b.data<float>();
  auto out_ptr = out.data<float>();
-  size_t num_matrices = out.size() / (M * size_t(N));
+  int64_t num_matrices = out.size() / (M * int64_t(N));
  auto ldc = out.shape(-1);

  encoder.dispatch([a_ptr,
@@ -394,9 +394,9 @@ void GatherMM::eval_cpu(const std::vector<array>& inputs, array& out) {
  auto [a_transposed, lda, a] = check_transpose(a_pre);
  auto [b_transposed, ldb, b] = check_transpose(b_pre);

-  size_t M = a.shape(-2);
-  size_t N = b.shape(-1);
-  size_t K = a.shape(-1);
+  int64_t M = a.shape(-2);
+  int64_t N = b.shape(-1);
+  int64_t K = a.shape(-1);

  if (M == 0 || N == 0) {
    return;
@@ -413,7 +413,7 @@ void GatherMM::eval_cpu(const std::vector<array>& inputs, array& out) {

  // Get batch dims
  auto batch_size_out = out.size() / (M * N);
-  size_t matrix_stride_out = M * N;
+  int64_t matrix_stride_out = M * N;

  auto get_batch_dims = [](const auto& v) {
    return decltype(v){v.begin(), v.end() - 2};
--- a/mlx/backend/cpu/matmul.cpp
+++ b/mlx/backend/cpu/matmul.cpp
@@ -2,8 +2,6 @@

 #include <cstring>
 #include "mlx/array.h"
-#include "mlx/backend/cpu/binary.h"
-#include "mlx/backend/cpu/binary_ops.h"
 #include "mlx/backend/cpu/copy.h"
 #include "mlx/backend/cpu/encoder.h"
 #include "mlx/backend/cpu/gemm.h"
@@ -137,29 +135,15 @@ void AddMM::eval_cpu(const std::vector<array>& inputs, array& out) {
    return;
  }

-  // Handle empty matrix case (K=0)
-  if (inputs[0].shape(-1) == 0) {
-    auto& c = inputs[2];
-    if (beta_ == 1.0f) {
-      CopyType ctype = c.data_size() == 1
-          ? CopyType::Scalar
-          : (c.flags().row_contiguous ? CopyType::Vector : CopyType::General);
-      copy_cpu(c, out, ctype, stream());
-    } else {
-      array beta_scalar = array(beta_, c.dtype());
-      auto& encoder = cpu::get_command_encoder(stream());
-      binary_float_op_cpu(c, beta_scalar, out, detail::Multiply(), stream());
-      encoder.add_temporary(std::move(beta_scalar));
-    }
-    return;
-  }
-
  // Fill output with C
  auto& c = inputs[2];
  CopyType ctype = c.data_size() == 1
      ? CopyType::Scalar
      : (c.flags().row_contiguous ? CopyType::Vector : CopyType::General);
  copy_cpu(c, out, ctype, stream());
+  if (inputs[0].shape(-1) == 0) {
+    return;
+  }
  matmul_general(inputs[0], inputs[1], out, stream(), alpha_, beta_);
 }

--- a/mlx/backend/cpu/primitives.cpp
+++ b/mlx/backend/cpu/primitives.cpp
@@ -48,7 +48,7 @@ static std::pair<array, bool> compute_dynamic_offset(
  auto compute_offset =
      [strides, axes, offset = offset.data<int64_t>()](const auto* indices) {
        int64_t offset_ = 0;
-        for (int i = 0; i < axes.size(); ++i) {
+        for (int i = 0; i < std::ssize(axes); ++i) {
          offset_ += indices[i] * strides[axes[i]];
        }
        offset[0] = offset_;
@@ -124,6 +124,7 @@ void Transpose::eval_cpu(const std::vector<array>& inputs, array& out) {

 void Arange::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 0);
+  (void)inputs;
  out.set_data(allocator::malloc(out.nbytes()));
  switch (out.dtype()) {
    case bool_:
@@ -193,9 +194,9 @@ void Concatenate::eval_cpu(const std::vector<array>& inputs, array& out) {
  flags.row_contiguous = false;
  flags.col_contiguous = false;
  flags.contiguous = false;
-  for (int i = 0; i < inputs.size(); i++) {
+  for (int i = 0; i < std::ssize(inputs); i++) {
    array out_slice(inputs[i].shape(), out.dtype(), nullptr, {});
-    size_t data_offset = strides[axis_] * sizes[i];
+    int64_t data_offset = strides[axis_] * sizes[i];
    out_slice.copy_shared_buffer(
        out, strides, flags, out_slice.size(), data_offset);
    copy_cpu_inplace(inputs[i], out_slice, CopyType::GeneralGeneral, stream());
@@ -205,7 +206,7 @@ void Concatenate::eval_cpu(const std::vector<array>& inputs, array& out) {
 void Contiguous::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 1);
  auto& in = inputs[0];
-  constexpr size_t extra_bytes = 16384;
+  constexpr int64_t extra_bytes = 16384;
  if (in.buffer_size() <= out.nbytes() + extra_bytes &&
      (in.flags().row_contiguous ||
       (allow_col_major_ && in.flags().col_contiguous))) {
@@ -254,8 +255,8 @@ void Pad::eval_cpu(const std::vector<array>& inputs, array& out) {
  copy_cpu(val, out, CopyType::Scalar, stream());

  // Find offset for start of input values
-  size_t data_offset = 0;
-  for (int i = 0; i < axes_.size(); i++) {
+  int64_t data_offset = 0;
+  for (int i = 0; i < std::ssize(axes_); i++) {
    auto ax = axes_[i] < 0 ? out.ndim() + axes_[i] : axes_[i];
    data_offset += out.strides()[ax] * low_pad_size_[i];
  }
@@ -274,10 +275,10 @@ void RandomBits::eval_cpu(const std::vector<array>& inputs, array& out) {
  // keys has shape (N1, ..., NK, 2)
  // out has shape (N1, ..., NK, M1, M2, ...)
  auto& keys = inputs[0];
-  size_t num_keys = keys.size() / 2;
+  int64_t num_keys = keys.size() / 2;

-  size_t elems_per_key = out.size() / num_keys;
-  size_t bytes_per_key = out.itemsize() * elems_per_key;
+  int64_t elems_per_key = out.size() / num_keys;
+  int64_t bytes_per_key = out.itemsize() * elems_per_key;
  out.set_data(allocator::malloc(out.nbytes()));

  auto kptr = inputs[0].data<uint32_t>();
@@ -291,19 +292,8 @@ void RandomBits::eval_cpu(const std::vector<array>& inputs, array& out) {
                    num_keys,
                    kshape = keys.shape(),
                    kstrides = keys.strides()]() mutable {
-    auto copy_remaining = [&](char* cptr, size_t loc, uint32_t v) {
-      if (4 * loc + 4 <= bytes_per_key) {
-        reinterpret_cast<uint32_t*>(cptr)[loc] = v;
-      } else {
-        std::copy(
-            reinterpret_cast<char*>(&v),
-            reinterpret_cast<char*>(&v) + bytes_per_key - 4 * loc,
-            cptr + 4 * loc);
-      }
-    };
-
-    size_t out_skip = (bytes_per_key + 4 - 1) / 4;
-    auto half_size = out_skip / 2;
+    int64_t out_skip = (bytes_per_key + 4 - 1) / 4;
+    uintptr_t half_size = out_skip / 2;
    bool even = out_skip % 2 == 0;
    for (int i = 0; i < num_keys; ++i, cptr += bytes_per_key) {
      auto ptr = reinterpret_cast<uint32_t*>(cptr);
@@ -321,12 +311,18 @@ void RandomBits::eval_cpu(const std::vector<array>& inputs, array& out) {
      if (count.first < half_size) {
        auto rb = random::threefry2x32_hash(key, count);
        ptr[count.first++] = rb.first;
-        copy_remaining(cptr, count.second, rb.second);
+        if (bytes_per_key % 4 > 0) {
+          std::copy(
+              reinterpret_cast<char*>(&rb.second),
+              reinterpret_cast<char*>(&rb.second) + bytes_per_key % 4,
+              cptr + 4 * count.second);
+        } else {
+          ptr[count.second] = rb.second;
+        }
      }
      if (!even) {
        count.second = 0;
-        copy_remaining(
-            cptr, half_size, random::threefry2x32_hash(key, count).first);
+        ptr[half_size] = random::threefry2x32_hash(key, count).first;
      }
    }
  });
@@ -338,7 +334,7 @@ void Reshape::eval_cpu(const std::vector<array>& inputs, array& out) {

 void DynamicSlice::eval_cpu(const std::vector<array>& inputs, array& out) {
  if (out.size() == 0) {
-    out.set_data(allocator::malloc(0));
+    out.set_data(nullptr);
    return;
  }
  auto& in = inputs[0];
@@ -366,7 +362,7 @@ void DynamicSliceUpdate::eval_cpu(
    const std::vector<array>& inputs,
    array& out) {
  if (out.size() == 0) {
-    out.set_data(allocator::malloc(0));
+    out.set_data(nullptr);
    return;
  }

@@ -401,7 +397,7 @@ void DynamicSliceUpdate::eval_cpu(
 void SliceUpdate::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 2);
  if (out.size() == 0) {
-    out.set_data(allocator::malloc(0));
+    out.set_data(nullptr);
    return;
  }

--- a/mlx/backend/cpu/qrf.cpp
+++ b/mlx/backend/cpu/qrf.cpp
@@ -13,7 +13,7 @@ void qrf_impl(const array& a, array& q, array& r, Stream stream) {
  const int M = a.shape(-2);
  const int N = a.shape(-1);
  const int lda = M;
-  size_t num_matrices = a.size() / (M * N);
+  int64_t num_matrices = a.size() / (M * N);

  // Copy A to inplace input and make it col-contiguous
  array in(a.shape(), a.dtype(), nullptr, {});
@@ -54,7 +54,7 @@ void qrf_impl(const array& a, array& q, array& r, Stream stream) {
    auto work = allocator::malloc(sizeof(T) * lwork);

    // Loop over matrices
-    for (int i = 0; i < num_matrices; ++i) {
+    for (int64_t i = 0; i < num_matrices; ++i) {
      // Solve
      geqrf<T>(
          &M,
@@ -68,7 +68,7 @@ void qrf_impl(const array& a, array& q, array& r, Stream stream) {
    }
    allocator::free(work);

-    for (int i = 0; i < num_matrices; ++i) {
+    for (int64_t i = 0; i < num_matrices; ++i) {
      /// num_reflectors x N
      for (int j = 0; j < num_reflectors; ++j) {
        for (int k = 0; k < j; ++k) {
@@ -97,7 +97,7 @@ void qrf_impl(const array& a, array& q, array& r, Stream stream) {
    work = allocator::malloc(sizeof(T) * lwork);

    // Loop over matrices
-    for (int i = 0; i < num_matrices; ++i) {
+    for (int64_t i = 0; i < num_matrices; ++i) {
      // Compute Q
      orgqr<T>(
          &M,
@@ -111,7 +111,7 @@ void qrf_impl(const array& a, array& q, array& r, Stream stream) {
          &info);
    }

-    for (int i = 0; i < num_matrices; ++i) {
+    for (int64_t i = 0; i < num_matrices; ++i) {
      // M x num_reflectors
      for (int j = 0; j < M; ++j) {
        for (int k = 0; k < num_reflectors; ++k) {
--- a/mlx/backend/cpu/quantized.cpp
+++ b/mlx/backend/cpu/quantized.cpp
@@ -1,11 +1,8 @@
 // Copyright © 2023 Apple Inc.

-#include "mlx/backend/common/unary.h"
 #include "mlx/backend/cpu/copy.h"
 #include "mlx/backend/cpu/encoder.h"
 #include "mlx/backend/cpu/simd/simd.h"
-#include "mlx/backend/cpu/unary.h"
-#include "mlx/backend/cpu/unary_ops.h"
 #include "mlx/fast_primitives.h"
 #include "mlx/primitives.h"
 #include "mlx/utils.h"
@@ -1105,44 +1102,4 @@ void fast::Quantize::eval_cpu(
  });
 }

-void fast::ConvertFP8::eval_cpu(
-    const std::vector<array>& inputs,
-    std::vector<array>& outputs) {
-  auto& in = inputs[0];
-  auto& out = outputs[0];
-  set_unary_output_data(in, out);
-  auto& encoder = cpu::get_command_encoder(stream());
-  encoder.set_input_array(in);
-  encoder.set_output_array(out);
-  encoder.dispatch([in = array::unsafe_weak_copy(in),
-                    out = array::unsafe_weak_copy(out),
-                    to_fp8 = to_fp8_]() mutable {
-    if (to_fp8) {
-      switch (in.dtype()) {
-        case float16:
-          unary_op<float16_t, uint8_t>(in, out, detail::ToFP8());
-          break;
-        case bfloat16:
-          unary_op<bfloat16_t, uint8_t>(in, out, detail::ToFP8());
-          break;
-        default:
-          unary_op<float, uint8_t>(in, out, detail::ToFP8());
-          break;
-      }
-    } else {
-      switch (out.dtype()) {
-        case float16:
-          unary_op<uint8_t, float16_t>(in, out, detail::FromFP8());
-          break;
-        case bfloat16:
-          unary_op<uint8_t, bfloat16_t>(in, out, detail::FromFP8());
-          break;
-        default:
-          unary_op<uint8_t, float>(in, out, detail::FromFP8());
-          break;
-      }
-    }
-  });
-}
-
 } // namespace mlx::core
--- a/mlx/backend/cpu/simd/accelerate_simd.h
+++ b/mlx/backend/cpu/simd/accelerate_simd.h
@@ -1,6 +1,5 @@
 #pragma once

-#include <arm_neon.h>
 #include <simd/math.h>
 #include <simd/vector.h>

@@ -201,15 +200,6 @@ SIMD_DEFAULT_COMPARISONS(<=)
 SIMD_DEFAULT_COMPARISONS(==)
 SIMD_DEFAULT_COMPARISONS(!=)

-template <typename T, int N>
-Simd<T, N> clz(Simd<T, N> x) {
-  auto a = *(uint32x4_t*)(&x);
-  auto b = *((uint32x4_t*)(&x) + 1);
-  a = vclzq_u32(a);
-  b = vclzq_u32(b);
-  return asd::make_uint8(a, b);
-}
-
 template <typename T, int N>
 Simd<T, N> atan2(Simd<T, N> a, Simd<T, N> b) {
  return asd::atan2(a.value, b.value);
@@ -217,20 +207,14 @@ Simd<T, N> atan2(Simd<T, N> a, Simd<T, N> b) {

 template <typename T, int N>
 Simd<T, N> maximum(Simd<T, N> a, Simd<T, N> b) {
-  auto out = Simd<T, N>(asd::max(a.value, b.value));
-  if constexpr (!std::is_integral_v<T>) {
-    out = select(isnan(b), b, select(isnan(a), a, out));
-  }
-  return out;
+  // TODO add isnan
+  return asd::max(a.value, b.value);
 }

 template <typename T, int N>
 Simd<T, N> minimum(Simd<T, N> a, Simd<T, N> b) {
-  auto out = Simd<T, N>(asd::min(a.value, b.value));
-  if constexpr (!std::is_integral_v<T>) {
-    out = select(isnan(b), b, select(isnan(a), a, out));
-  }
-  return out;
+  // TODO add isnan
+  return asd::min(a.value, b.value);
 }

 template <typename T, int N>
@@ -269,12 +253,12 @@ Simd<T, N> pow(Simd<T, N> base, Simd<T, N> exp) {
  } else {
    Simd<T, N> res = 1;
    // Raising an integer to a negative power is undefined
-    if (any(exp < 0)) {
+    if (any(exp < static_cast<T>(0))) {
      return 0;
    }
-    while (any(exp > 0)) {
+    while (any(exp > static_cast<T>(0))) {
      res = select((exp & 1) != 0, res * base, res);
-      base = select(exp > 0, base * base, base);
+      base = select(exp > static_cast<T>(0), base * base, base);
      exp = exp >> 1;
    }
    return res;
--- a/mlx/backend/cpu/simd/base_simd.h
+++ b/mlx/backend/cpu/simd/base_simd.h
@@ -171,11 +171,6 @@ DEFAULT_BINARY(&)
 DEFAULT_BINARY(&&)
 DEFAULT_BINARY(||)

-template <typename T>
-Simd<T, 1> clz(Simd<T, 1> x_) {
-  return __builtin_clz(x_.value);
-}
-
 template <typename T>
 Simd<T, 1> remainder(Simd<T, 1> a_, Simd<T, 1> b_) {
  T a = a_.value;
--- a/mlx/backend/cpu/simd/math.h
+++ b/mlx/backend/cpu/simd/math.h
@@ -79,7 +79,8 @@ Simd<T, N> sincos(Simd<T, N> in) {

  // Get the polynom selection mask. There is one polynom for 0 <= x <= Pi/4
  // and another one for Pi/4<x<=Pi/2. Both branches will be computed.
-  auto poly_mask = (emm2 & 2) != 0;
+  auto poly_mask =
+      (emm2 & static_cast<uint32_t>(2)) != static_cast<uint32_t>(0);

  // The magic pass: "Extended precision modular arithmetic"
  // x = ((x - y * DP1) - y * DP2) - y * DP3
@@ -87,8 +88,8 @@ Simd<T, N> sincos(Simd<T, N> in) {
  x = fma(y, Simd<float, N>(-2.4187564849853515625e-4f), x);
  x = fma(y, Simd<float, N>(-3.77489497744594108e-8f), x);

-  sign_mask_sin = sign_mask_sin ^ ((emm2 & 4) != 0);
-  auto sign_mask_cos = ((emm2 - 2) & 4) != 0;
+  sign_mask_sin = sign_mask_sin ^ ((emm2 & 4) != static_cast<uint32_t>(0));
+  auto sign_mask_cos = ((emm2 - 2) & 4) != static_cast<uint32_t>(0);

  // Evaluate the first polynom  (0 <= x <= Pi/4) in y1,
  // and the second polynom      (Pi/4 <= x <= 0) in y2
--- a/mlx/backend/cpu/simd/type.h
+++ b/mlx/backend/cpu/simd/type.h
@@ -3,9 +3,5 @@
 #include "mlx/backend/cpu/simd/base_simd.h"

 #ifdef MLX_USE_ACCELERATE
-#if defined(__x86_64__)
-// the accelerate_simd implementation require neon -- use base implementation
-#else
 #include "mlx/backend/cpu/simd/accelerate_simd.h"
 #endif
-#endif
--- a/mlx/backend/cpu/sort.cpp
+++ b/mlx/backend/cpu/sort.cpp
@@ -120,8 +120,8 @@ template <typename T>
 void sort(array& out, int axis) {
  // Get axis, shape and stride info
  axis = axis < 0 ? axis + out.ndim() : axis;
-  size_t in_size = out.size();
-  size_t n_rows = in_size / out.shape(axis);
+  int64_t in_size = out.size();
+  int64_t n_rows = in_size / out.shape(axis);

  auto remaining_shape = out.shape();
  remaining_shape.erase(remaining_shape.begin() + axis);
@@ -136,7 +136,7 @@ void sort(array& out, int axis) {
  ContiguousIterator src_it(
      remaining_shape, remaining_strides, remaining_shape.size());
  auto out_ptr = out.data<T>();
-  for (int i = 0; i < n_rows; i++) {
+  for (int64_t i = 0; i < n_rows; i++) {
    T* data_ptr = out_ptr + src_it.loc;

    StridedIterator st(data_ptr, axis_stride, 0);
@@ -151,7 +151,7 @@ template <typename T, typename IdxT = uint32_t>
 void argsort(const array& in, array& out, int axis) {
  // Get axis, shape and stride info
  axis = axis < 0 ? axis + in.ndim() : axis;
-  size_t n_rows = in.size() / in.shape(axis);
+  int64_t n_rows = in.size() / in.shape(axis);

  auto in_remaining_shape = in.shape();
  in_remaining_shape.erase(in_remaining_shape.begin() + axis);
@@ -176,7 +176,7 @@ void argsort(const array& in, array& out, int axis) {
      out_remaining_shape, out_remaining_strides, out_remaining_shape.size());
  auto in_ptr = in.data<T>();
  auto out_ptr = out.data<IdxT>();
-  for (int i = 0; i < n_rows; i++) {
+  for (int64_t i = 0; i < n_rows; i++) {
    const T* data_ptr = in_ptr + in_it.loc;
    IdxT* idx_ptr = out_ptr + out_it.loc;

@@ -214,8 +214,8 @@ template <typename T>
 void partition(array& out, int axis, int kth) {
  // Get axis, shape and stride info
  axis = axis < 0 ? axis + out.ndim() : axis;
-  size_t in_size = out.size();
-  size_t n_rows = in_size / out.shape(axis);
+  int64_t in_size = out.size();
+  int64_t n_rows = in_size / out.shape(axis);

  auto remaining_shape = out.shape();
  remaining_shape.erase(remaining_shape.begin() + axis);
@@ -232,7 +232,7 @@ void partition(array& out, int axis, int kth) {
  ContiguousIterator src_it(
      remaining_shape, remaining_strides, remaining_shape.size());
  auto out_ptr = out.data<T>();
-  for (int i = 0; i < n_rows; i++) {
+  for (int64_t i = 0; i < n_rows; i++) {
    T* data_ptr = out_ptr + src_it.loc;
    src_it.step();

@@ -248,7 +248,7 @@ template <typename T, typename IdxT = uint32_t>
 void argpartition(const array& in, array& out, int axis, int kth) {
  // Get axis, shape and stride info
  axis = axis < 0 ? axis + in.ndim() : axis;
-  size_t n_rows = in.size() / in.shape(axis);
+  int64_t n_rows = in.size() / in.shape(axis);

  auto in_remaining_shape = in.shape();
  in_remaining_shape.erase(in_remaining_shape.begin() + axis);
@@ -277,7 +277,7 @@ void argpartition(const array& in, array& out, int axis, int kth) {
  auto in_ptr = in.data<T>();
  auto out_ptr = out.data<IdxT>();

-  for (int i = 0; i < n_rows; i++) {
+  for (int64_t i = 0; i < n_rows; i++) {
    const T* data_ptr = in_ptr + in_it.loc;
    IdxT* idx_ptr = out_ptr + out_it.loc;
    in_it.step();
--- a/mlx/backend/cpu/svd.cpp
+++ b/mlx/backend/cpu/svd.cpp
@@ -8,183 +8,6 @@

 namespace mlx::core {

-template <typename T, class Enable = void>
-struct SVDWork {};
-
-template <typename T>
-struct SVDWork<
-    T,
-    typename std::enable_if<std::is_floating_point<T>::value>::type> {
-  using R = T;
-
-  int N;
-  int M;
-  int K;
-  int lda;
-  int ldu;
-  int ldvt;
-  char jobz;
-  std::vector<array::Data> buffers;
-  int lwork;
-
-  SVDWork(int N, int M, int K, char jobz)
-      : N(N), M(M), K(K), lda(N), ldu(N), ldvt(M), jobz(jobz) {
-    T workspace_dimension = 0;
-
-    // Will contain the indices of eigenvectors that failed to converge (not
-    // used here but required by lapack).
-    buffers.emplace_back(allocator::malloc(sizeof(int) * 8 * K));
-
-    int lwork_query = -1;
-    int info;
-
-    // Compute workspace size.
-    gesdd<T>(
-        /* jobz = */ &jobz,
-        // M and N are swapped since lapack expects column-major.
-        /* m = */ &N,
-        /* n = */ &M,
-        /* a = */ nullptr,
-        /* lda = */ &lda,
-        /* s = */ nullptr,
-        /* u = */ nullptr,
-        /* ldu = */ &ldu,
-        /* vt = */ nullptr,
-        /* ldvt = */ &ldvt,
-        /* work = */ &workspace_dimension,
-        /* lwork = */ &lwork_query,
-        /* iwork = */ static_cast<int*>(buffers[0].buffer.raw_ptr()),
-        /* info = */ &info);
-
-    if (info != 0) {
-      std::stringstream ss;
-      ss << "[SVD::eval_cpu] workspace calculation failed with code " << info;
-      throw std::runtime_error(ss.str());
-    }
-
-    lwork = workspace_dimension;
-    buffers.emplace_back(allocator::malloc(sizeof(T) * lwork));
-  }
-
-  void run(T* a, R* s, T* u, T* vt) {
-    int info;
-    gesdd<T>(
-        /* jobz = */ &jobz,
-        // M and N are swapped since lapack expects column-major.
-        /* m = */ &N,
-        /* n = */ &M,
-        /* a = */ a,
-        /* lda = */ &lda,
-        /* s = */ s,
-        // According to the identity above, lapack will write Vᵀᵀ as U.
-        /* u = */ u,
-        /* ldu = */ &ldu,
-        // According to the identity above, lapack will write Uᵀ as Vᵀ.
-        /* vt = */ vt,
-        /* ldvt = */ &ldvt,
-        /* work = */ static_cast<T*>(buffers[1].buffer.raw_ptr()),
-        /* lwork = */ &lwork,
-        /* iwork = */ static_cast<int*>(buffers[0].buffer.raw_ptr()),
-        /* info = */ &info);
-
-    if (info != 0) {
-      std::stringstream ss;
-      ss << "svd_impl: sgesvdx_ failed with code " << info;
-      throw std::runtime_error(ss.str());
-    }
-  }
-};
-
-template <>
-struct SVDWork<std::complex<float>> {
-  using T = std::complex<float>;
-  using R = float;
-
-  int N;
-  int M;
-  int K;
-  int lda;
-  int ldu;
-  int ldvt;
-  char jobz;
-  std::vector<array::Data> buffers;
-  int lwork;
-
-  SVDWork(int N, int M, int K, char jobz)
-      : N(N), M(M), K(K), lda(N), ldu(N), ldvt(M), jobz(jobz) {
-    T workspace_dimension = 0;
-
-    // Will contain the indices of eigenvectors that failed to converge (not
-    // used here but required by lapack).
-    buffers.emplace_back(allocator::malloc(sizeof(int) * 8 * K));
-
-    const int lrwork =
-        jobz == 'A' ? std::max(1, 5 * K * K + 5 * K) : std::max(1, 7 * K);
-    buffers.emplace_back(allocator::malloc(sizeof(float) * lrwork));
-
-    int lwork_query = -1;
-    int work_query = -1;
-    int info;
-
-    // Compute workspace size.
-    gesdd<T>(
-        /* jobz = */ &jobz,
-        // M and N are swapped since lapack expects column-major.
-        /* m = */ &N,
-        /* n = */ &M,
-        /* a = */ nullptr,
-        /* lda = */ &lda,
-        /* s = */ nullptr,
-        /* u = */ nullptr,
-        /* ldu = */ &ldu,
-        /* vt = */ nullptr,
-        /* ldvt = */ &ldvt,
-        /* work = */ &workspace_dimension,
-        /* lwork = */ &lwork_query,
-        /* rwork = */ static_cast<float*>(buffers[1].buffer.raw_ptr()),
-        /* iwork = */ static_cast<int*>(buffers[0].buffer.raw_ptr()),
-        /* info = */ &info);
-
-    if (info != 0) {
-      std::stringstream ss;
-      ss << "[SVD::eval_cpu] workspace calculation failed with code " << info;
-      throw std::runtime_error(ss.str());
-    }
-
-    lwork = workspace_dimension.real();
-    buffers.emplace_back(allocator::malloc(sizeof(T) * lwork));
-  }
-
-  void run(T* a, R* s, T* u, T* vt) {
-    int info;
-    gesdd<T>(
-        /* jobz = */ &jobz,
-        // M and N are swapped since lapack expects column-major.
-        /* m = */ &N,
-        /* n = */ &M,
-        /* a = */ a,
-        /* lda = */ &lda,
-        /* s = */ s,
-        // According to the identity above, lapack will write Vᵀᵀ as U.
-        /* u = */ u,
-        /* ldu = */ &ldu,
-        // According to the identity above, lapack will write Uᵀ as Vᵀ.
-        /* vt = */ vt,
-        /* ldvt = */ &ldvt,
-        /* work = */ static_cast<T*>(buffers[2].buffer.raw_ptr()),
-        /* lwork = */ &lwork,
-        /* rwork = */ static_cast<float*>(buffers[1].buffer.raw_ptr()),
-        /* iwork = */ static_cast<int*>(buffers[0].buffer.raw_ptr()),
-        /* info = */ &info);
-
-    if (info != 0) {
-      std::stringstream ss;
-      ss << "svd_impl: sgesvdx_ failed with code " << info;
-      throw std::runtime_error(ss.str());
-    }
-  }
-};
-
 template <typename T>
 void svd_impl(
    const array& a,
@@ -204,9 +27,7 @@ void svd_impl(
  const int N = a.shape(-1);
  const int K = std::min(M, N);

-  using R = typename SVDWork<T>::R;
-
-  size_t num_matrices = a.size() / (M * N);
+  int64_t num_matrices = a.size() / (M * N);

  // lapack clobbers the input, so we have to make a copy.
  array in(a.shape(), a.dtype(), nullptr, {});
@@ -221,7 +42,7 @@ void svd_impl(
  encoder.set_input_array(a);
  auto in_ptr = in.data<T>();
  T* u_ptr;
-  R* s_ptr;
+  T* s_ptr;
  T* vt_ptr;

  if (compute_uv) {
@@ -237,7 +58,7 @@ void svd_impl(
    encoder.set_output_array(s);
    encoder.set_output_array(vt);

-    s_ptr = s.data<R>();
+    s_ptr = s.data<T>();
    u_ptr = u.data<T>();
    vt_ptr = vt.data<T>();
  } else {
@@ -247,26 +68,96 @@ void svd_impl(

    encoder.set_output_array(s);

-    s_ptr = s.data<R>();
+    s_ptr = s.data<T>();
    u_ptr = nullptr;
    vt_ptr = nullptr;
  }

  encoder.dispatch([in_ptr, u_ptr, s_ptr, vt_ptr, M, N, K, num_matrices]() {
-    auto jobz = (u_ptr) ? 'A' : 'N';
-    SVDWork<T> svd_work(N, M, K, jobz);
+    // A of shape M x N. The leading dimension is N since lapack receives Aᵀ.
+    const int lda = N;
+    // U of shape M x M. (N x N in lapack).
+    const int ldu = N;
+    // Vᵀ of shape N x N. (M x M in lapack).
+    const int ldvt = M;
+
+    auto jobz = (u_ptr) ? "A" : "N";
+
+    T workspace_dimension = 0;
+
+    // Integer workspace required by lapack's gesdd; it must hold
+    // 8 * min(M, N) = 8 * K ints.
+    auto iwork = array::Data{allocator::malloc(sizeof(int) * 8 * K)};
+
+    static const int lwork_query = -1;
+
+    int info;
+
+    // Compute workspace size.
+    gesdd<T>(
+        /* jobz = */ jobz,
+        // M and N are swapped since lapack expects column-major.
+        /* m = */ &N,
+        /* n = */ &M,
+        /* a = */ nullptr,
+        /* lda = */ &lda,
+        /* s = */ nullptr,
+        /* u = */ nullptr,
+        /* ldu = */ &ldu,
+        /* vt = */ nullptr,
+        /* ldvt = */ &ldvt,
+        /* work = */ &workspace_dimension,
+        /* lwork = */ &lwork_query,
+        /* iwork = */ static_cast<int*>(iwork.buffer.raw_ptr()),
+        /* info = */ &info);
+
+    if (info != 0) {
+      std::stringstream ss;
+      ss << "[SVD::eval_cpu] workspace calculation failed with code " << info;
+      throw std::runtime_error(ss.str());
+    }
+
+    const int lwork = workspace_dimension;
+    auto scratch = array::Data{allocator::malloc(sizeof(T) * lwork)};
+
    // Loop over matrices.
-    for (int i = 0; i < num_matrices; i++) {
-      svd_work.run(
-          in_ptr + M * N * i,
-          s_ptr + K * i,
-          vt_ptr ? vt_ptr + N * N * i : nullptr,
-          u_ptr ? u_ptr + M * M * i : nullptr);
+    for (int64_t i = 0; i < num_matrices; i++) {
+      gesdd<T>(
+          /* jobz = */ jobz,
+          // M and N are swapped since lapack expects column-major.
+          /* m = */ &N,
+          /* n = */ &M,
+          /* a = */ in_ptr + M * N * i,
+          /* lda = */ &lda,
+          /* s = */ s_ptr + K * i,
+          // According to the identity above, lapack will write Vᵀᵀ as U.
+          /* u = */ vt_ptr ? vt_ptr + N * N * i : nullptr,
+          /* ldu = */ &ldu,
+          // According to the identity above, lapack will write Uᵀ as Vᵀ.
+          /* vt = */ u_ptr ? u_ptr + M * M * i : nullptr,
+          /* ldvt = */ &ldvt,
+          /* work = */ static_cast<T*>(scratch.buffer.raw_ptr()),
+          /* lwork = */ &lwork,
+          /* iwork = */ static_cast<int*>(iwork.buffer.raw_ptr()),
+          /* info = */ &info);
+
+      if (info != 0) {
+        std::stringstream ss;
+        ss << "svd_impl: sgesvdx_ failed with code " << info;
+        throw std::runtime_error(ss.str());
+      }
    }
  });
  encoder.add_temporary(in);
 }

+template <typename T>
+void compute_svd(
+    const array& /* a */,
+    bool /* compute_uv */,
+    std::vector<array>& /* outputs */,
+    Stream /* stream */) {}
+
 void SVD::eval_cpu(
    const std::vector<array>& inputs,
    std::vector<array>& outputs) {
@@ -277,12 +168,9 @@ void SVD::eval_cpu(
    case float64:
      svd_impl<double>(inputs[0], outputs, compute_uv_, stream());
      break;
-    case complex64:
-      svd_impl<std::complex<float>>(inputs[0], outputs, compute_uv_, stream());
-      break;
    default:
      throw std::runtime_error(
-          "[SVD::eval_cpu] only supports float32, float64, or complex64.");
+          "[SVD::eval_cpu] only supports float32 or float64.");
  }
 }

--- a/mlx/backend/cpu/ternary.h
+++ b/mlx/backend/cpu/ternary.h
@@ -136,7 +136,7 @@ void ternary_op(
  if (topt == TernaryOpType::ScalarScalarScalar) {
    *out_ptr = op(*a_ptr, *b_ptr, *c_ptr);
  } else if (topt == TernaryOpType::VectorVectorVector) {
-    for (size_t i = 0; i < out.size(); ++i) {
+    for (int64_t i = 0; i < out.size(); ++i) {
      *out_ptr = op(*a_ptr, *b_ptr, *c_ptr);
      a_ptr++;
      b_ptr++;
--- a/mlx/backend/cpu/unary.h
+++ b/mlx/backend/cpu/unary.h
@@ -10,8 +10,8 @@
 namespace mlx::core {

 template <typename T, typename U = T, typename Op>
-void unary_op(const T* a, U* out, size_t shape, size_t stride) {
-  for (size_t i = 0; i < shape; i += 1) {
+void unary_op(const T* a, U* out, int64_t shape, int64_t stride) {
+  for (int64_t i = 0; i < shape; i += 1) {
    out[i] = Op{}(*a);
    a += stride;
  }
@@ -24,9 +24,9 @@ void unary_op(const array& a, array& out, Op) {
  auto ndim = a.ndim();
  if (a.flags().contiguous) {
    auto size = a.data_size();
-    constexpr int N = std::min(simd::max_size<T>, simd::max_size<U>);
+    constexpr int N = simd::max_size<T>;
    while (size >= N) {
-      simd::store(dst, simd::Simd<U, N>(Op{}(simd::load<T, N>(src))));
+      simd::store(dst, Op{}(simd::load<T, N>(src)));
      size -= N;
      src += N;
      dst += N;
@@ -38,14 +38,14 @@ void unary_op(const array& a, array& out, Op) {
      src++;
    }
  } else {
-    size_t shape = ndim > 0 ? a.shape().back() : 1;
-    size_t stride = ndim > 0 ? a.strides().back() : 1;
+    int64_t shape = ndim > 0 ? a.shape().back() : 1;
+    int64_t stride = ndim > 0 ? a.strides().back() : 1;
    if (ndim <= 1) {
      unary_op<T, U, Op>(src, dst, shape, stride);
      return;
    }
    auto it = ContiguousIterator(a.shape(), a.strides(), ndim - 1);
-    for (size_t elem = 0; elem < a.size(); elem += shape) {
+    for (int64_t elem = 0; elem < a.size(); elem += shape) {
      unary_op<T, U, Op>(src + it.loc, dst + elem, shape, stride);
      it.step();
    }
--- a/mlx/backend/cpu/unary_ops.h
+++ b/mlx/backend/cpu/unary_ops.h
@@ -108,73 +108,4 @@ struct Square {
  SINGLE()
 };

-template <int N>
-Simd<float, N> fp32_from_bits(Simd<uint32_t, N> x) {
-  return *(Simd<float, N>*)(&x);
-}
-template <int N>
-Simd<uint32_t, N> fp32_to_bits(Simd<float, N> x) {
-  return *(Simd<uint32_t, N>*)(&x);
-}
-
-struct ToFP8 {
-  template <typename T, int N>
-  Simd<uint8_t, N> operator()(Simd<T, N> f) {
-    uint32_t fp8_max = 543 << 21;
-    auto denorm_mask = Simd<uint32_t, N>(141 << 23);
-    Simd<uint32_t, N> f_bits;
-    Simd<float, N> f32 = f;
-    f_bits = fp32_to_bits(f32);
-    Simd<uint8_t, N> result = 0u;
-    auto sign = f_bits & 0x80000000;
-    f_bits = f_bits ^ sign;
-
-    auto f_bits_low =
-        fp32_to_bits(fp32_from_bits(f_bits) + fp32_from_bits(denorm_mask));
-    auto result_low = Simd<uint8_t, N>(f_bits_low - denorm_mask);
-
-    auto mant_odd = Simd<uint8_t, N>((f_bits >> 20) & 1);
-    auto f_bits_high = f_bits + (((uint32_t)(7 - 127) << 23) + 0x7FFFF);
-    f_bits_high = f_bits_high + Simd<uint32_t, N>(mant_odd);
-
-    auto result_high = Simd<uint8_t, N>(f_bits_high >> 20);
-    result = select(f_bits < (121 << 23), result_low, result_high);
-
-    auto result_sat = Simd<uint8_t, N>(0x7E);
-    result = select(f_bits >= fp8_max, result_sat, result);
-    return result | Simd<uint8_t, N>(sign >> 24);
-  }
-
-  template <typename T>
-  uint8_t operator()(T x) {
-    return (*this)(Simd<T, 1>(x)).value;
-  }
-};
-
-struct FromFP8 {
-  template <int N>
-  Simd<float, N> operator()(Simd<uint8_t, N> x) {
-    auto w = Simd<uint32_t, N>(x) << 24;
-    auto sign = w & 0x80000000;
-    auto nonsign = w & 0x7FFFFFFF;
-
-    auto renorm_shift = clz(nonsign);
-    renorm_shift = simd::select(
-        renorm_shift > Simd<uint32_t, N>{4},
-        renorm_shift - Simd<uint32_t, N>{4},
-        Simd<uint32_t, N>{0});
-
-    Simd<int32_t, N> inf_nan_mask =
-        (Simd<int32_t, N>(nonsign + 0x01000000) >> 8) & 0x7F800000;
-    auto zero_mask = Simd<int32_t, N>(nonsign - 1) >> 31;
-    auto result = sign |
-        ((((nonsign << renorm_shift >> 4) + ((0x78 - renorm_shift) << 23)) |
-          inf_nan_mask) &
-         ~zero_mask);
-    return fp32_from_bits(result);
-  }
-  float operator()(uint8_t x) {
-    return (*this)(Simd<uint8_t, 1>(x)).value;
-  }
-};
 } // namespace mlx::core::detail
--- a/mlx/backend/cuda/CMakeLists.txt
+++ b/mlx/backend/cuda/CMakeLists.txt
@@ -32,7 +32,6 @@ target_sources(
          ${CMAKE_CURRENT_SOURCE_DIR}/indexing.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/kernel_utils.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/matmul.cpp
-          ${CMAKE_CURRENT_SOURCE_DIR}/load.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/layer_norm.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/logsumexp.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/primitives.cpp
@@ -44,7 +43,6 @@ target_sources(
          ${CMAKE_CURRENT_SOURCE_DIR}/reduce/row_reduce.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/rms_norm.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/rope.cu
-          ${CMAKE_CURRENT_SOURCE_DIR}/scaled_dot_product_attention.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/scaled_dot_product_attention.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/scan.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/slicing.cpp
@@ -53,19 +51,12 @@ target_sources(
          ${CMAKE_CURRENT_SOURCE_DIR}/ternary.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/utils.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/quantized/affine_quantize.cu
-          ${CMAKE_CURRENT_SOURCE_DIR}/quantized/fp_quantize.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/quantized/quantized.cpp
-          ${CMAKE_CURRENT_SOURCE_DIR}/quantized/convert_fp8.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/worker.cpp)

 add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/binary)
 add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/unary)

-# fp4 is not available on < 12.8
-if(CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 12.8.0)
-  target_include_directories(mlx PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/quantized/)
-endif()
-
 if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.9.0)
  target_sources(
    mlx PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/gemms/cublas_gemm_batched_12_9.cu)
@@ -123,21 +114,10 @@ if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8.0)
    mlx PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:--compress-mode=size>")
 endif()

-# Use native CUDA arch by default.
+# Compute capability >= 7.0 is required for synchronization between CPU/GPU with
+# managed memory.
 if(NOT DEFINED MLX_CUDA_ARCHITECTURES)
-  execute_process(
-    COMMAND __nvcc_device_query
-    OUTPUT_VARIABLE MLX_CUDA_ARCHITECTURES
-    OUTPUT_STRIP_TRAILING_WHITESPACE)
-  set(UPGRADABLE_ARCHITECTURES "90;100;121")
-  if(MLX_CUDA_ARCHITECTURES STREQUAL "")
-    message(
-      FATAL_ERROR
-        "Can not get native CUDA arch, must set MLX_CUDA_ARCHITECTURES")
-  elseif(MLX_CUDA_ARCHITECTURES IN_LIST UPGRADABLE_ARCHITECTURES)
-    # Use arch-specific compute capability whenever possible.
-    set(MLX_CUDA_ARCHITECTURES "${MLX_CUDA_ARCHITECTURES}a")
-  endif()
+  set(MLX_CUDA_ARCHITECTURES "native")
 endif()
 message(STATUS "CUDA architectures: ${MLX_CUDA_ARCHITECTURES}")
 set_target_properties(mlx PROPERTIES CUDA_ARCHITECTURES
@@ -149,7 +129,6 @@ FetchContent_Declare(
  URL "https://github.com/NVIDIA/cccl/releases/download/v2.8.1/cccl-v2.8.1.zip")
 FetchContent_MakeAvailable(cccl)
 target_include_directories(mlx BEFORE PRIVATE "${cccl_SOURCE_DIR}/include")
-set_target_properties(mlx PROPERTIES CCCL_DIR "${cccl_SOURCE_DIR}/include")

 # Use fixed version of NVTX.
 FetchContent_Declare(
@@ -175,7 +154,7 @@ target_link_libraries(mlx PRIVATE CUDA::nvrtc CUDA::cuda_driver)
 FetchContent_Declare(
  cudnn
  GIT_REPOSITORY https://github.com/NVIDIA/cudnn-frontend.git
-  GIT_TAG v1.16.0
+  GIT_TAG v1.14.0
  GIT_SHALLOW TRUE
  EXCLUDE_FROM_ALL)
 set(CUDNN_FRONTEND_SKIP_JSON_LIB ON)
@@ -191,6 +170,11 @@ target_link_libraries(mlx PRIVATE CUDNN::cudnn_all)
 # Suppress nvcc warnings on MLX headers.
 target_compile_options(mlx PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-Xcudafe
                                   --diag_suppress=997>)
+# Suppress warnings: note: parameter passing for argument of type
+# ‘std::pair<float, float>’ when C++17 is enabled changed to match C++14 in GCC
+# 10.1
+target_compile_options(mlx PRIVATE -Wno-psabi)
+
 # Install CCCL headers for JIT.
 install(DIRECTORY ${cccl_SOURCE_DIR}/include/cuda
        DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/cccl)
--- a/mlx/backend/cuda/allocator.cpp
+++ b/mlx/backend/cuda/allocator.cpp
@@ -1,7 +1,6 @@
 // Copyright © 2025 Apple Inc.

 #include "mlx/backend/cuda/allocator.h"
-#include "mlx/backend/cuda/device.h"
 #include "mlx/backend/cuda/utils.h"
 #include "mlx/utils.h"

@@ -20,19 +19,6 @@ constexpr int page_size = 16384;
 // Any allocations smaller than this will try to use the small pool
 constexpr int small_block_size = 8;

-#if CUDART_VERSION >= 13000
-inline cudaMemLocation cuda_mem_loc(int i) {
-  cudaMemLocation loc;
-  loc.type = cudaMemLocationTypeDevice;
-  loc.id = i;
-  return loc;
-}
-#else
-inline int cuda_mem_loc(int i) {
-  return i;
-}
-#endif // CUDART_VERSION >= 13000
-
 // The small pool size in bytes. This should be a multiple of the host page
 // size and small_block_size.
 constexpr int small_pool_size = 4 * page_size;
@@ -48,7 +34,13 @@ SmallSizePool::SmallSizePool() {
  int device_count = 0;
  CHECK_CUDA_ERROR(cudaGetDeviceCount(&device_count));
  for (int i = 0; i < device_count; ++i) {
-    auto loc = cuda_mem_loc(i);
+#if CUDART_VERSION >= 13000
+    cudaMemLocation loc;
+    loc.type = cudaMemLocationTypeDevice;
+    loc.id = i;
+#else
+    int loc = i;
+#endif // CUDART_VERSION >= 13000
    CHECK_CUDA_ERROR(
        cudaMemAdvise(data_, small_pool_size, cudaMemAdviseSetAccessedBy, loc));
  }
@@ -75,7 +67,6 @@ CudaBuffer* SmallSizePool::malloc() {
  next_free_ = next_free_->next;
  b->buf.data = static_cast<char*>(data_) + i * small_block_size;
  b->buf.size = small_block_size;
-  b->buf.device = -1;
  return &b->buf;
 }

@@ -97,47 +88,16 @@ CudaAllocator::CudaAllocator()
          page_size,
          [](CudaBuffer* buf) { return buf->size; },
          [this](CudaBuffer* buf) { cuda_free(buf); }) {
-  size_t free;
-  CHECK_CUDA_ERROR(cudaMemGetInfo(&free, &total_memory_));
-  memory_limit_ = total_memory_ * 0.95;
-  free_limit_ = total_memory_ - memory_limit_;
+  // TODO: Set memory limit for multi-device.
+  size_t free, total;
+  CHECK_CUDA_ERROR(cudaMemGetInfo(&free, &total));
+  memory_limit_ = total * 0.95;
  max_pool_size_ = memory_limit_;
-
-  int device_count = 0;
-  CHECK_CUDA_ERROR(cudaGetDeviceCount(&device_count));
-  int curr;
-  CHECK_CUDA_ERROR(cudaGetDevice(&curr));
-  for (int i = 0; i < device_count; ++i) {
-    CHECK_CUDA_ERROR(cudaSetDevice(i));
-    cudaStream_t s;
-    CHECK_CUDA_ERROR(cudaStreamCreateWithFlags(&s, cudaStreamNonBlocking));
-    free_streams_.push_back(s);
-
-    cudaMemPool_t mem_pool;
-    CHECK_CUDA_ERROR(cudaDeviceGetDefaultMemPool(&mem_pool, i));
-    mem_pools_.push_back(mem_pool);
-  }
-  CHECK_CUDA_ERROR(cudaSetDevice(curr));
 }

-void copy_to_managed(CudaBuffer& buf) {
-  // TODO maybe make this async on a i/o stream to avoid synchronizing the
-  // device on malloc/and free
-  void* new_data;
-  CHECK_CUDA_ERROR(cudaMallocManaged(&new_data, buf.size));
-  buf.device = -1;
-  CHECK_CUDA_ERROR(cudaMemcpy(new_data, buf.data, buf.size, cudaMemcpyDefault));
-  CHECK_CUDA_ERROR(cudaFree(buf.data));
-  buf.data = new_data;
-}
-
-Buffer
-CudaAllocator::malloc_async(size_t size, int device, cudaStream_t stream) {
-  if (size == 0) {
-    return Buffer{new CudaBuffer{nullptr, 0, -1}};
-  }
-
+Buffer CudaAllocator::malloc(size_t size) {
  // Find available buffer from cache.
+  auto orig_size = size;
  std::unique_lock lock(mutex_);
  if (size <= small_block_size) {
    size = 8;
@@ -147,10 +107,6 @@ CudaAllocator::malloc_async(size_t size, int device, cudaStream_t stream) {
    size = page_size * ((size + page_size - 1) / page_size);
  }

-  if (size <= small_block_size || stream == nullptr) {
-    device = -1;
-  }
-
  CudaBuffer* buf = buffer_cache_.reuse_from_cache(size);
  if (!buf) {
    // If we have a lot of memory pressure try to reclaim memory from the cache.
@@ -166,63 +122,30 @@ CudaAllocator::malloc_async(size_t size, int device, cudaStream_t stream) {
    }
    lock.unlock();
    if (!buf) {
-      void* data = nullptr;
-      if (device == -1) {
-        CHECK_CUDA_ERROR(cudaMallocManaged(&data, size));
-      } else {
-        CHECK_CUDA_ERROR(cudaMallocAsync(&data, size, stream));
+      buf = new CudaBuffer{nullptr, size};
+      cudaError_t err = cudaMallocManaged(&buf->data, size);
+      if (err != cudaSuccess && err != cudaErrorMemoryAllocation) {
+        throw std::runtime_error(fmt::format(
+            "cudaMallocManaged failed: {}.", cudaGetErrorString(err)));
      }
-      if (!data) {
-        std::ostringstream msg;
-        msg << "[malloc] Unable to allocate " << size << " bytes.";
-        throw std::runtime_error(msg.str());
-      }
-      buf = new CudaBuffer{data, size, device};
    }
    lock.lock();
-
-    // If any cuda memory pool has too much reserved memory, clear some
-    // memory from the cache. This prevents graph / kernel execution failing
-    // from OOM
-    if (get_cache_memory() > 0) {
-      for (auto p : mem_pools_) {
-        size_t used = 0;
-        CHECK_CUDA_ERROR(cudaMemPoolGetAttribute(
-            p, cudaMemPoolAttrReservedMemCurrent, &used));
-        if (used > (total_memory_ - free_limit_)) {
-          buffer_cache_.release_cached_buffers(free_limit_);
-          break;
-        }
-      }
-    }
  }
-  active_memory_ += buf->size;
+  active_memory_ += size;
  peak_memory_ = std::max(active_memory_, peak_memory_);

  // Maintain the cache below the requested limit.
  if (get_cache_memory() > max_pool_size_) {
    buffer_cache_.release_cached_buffers(get_cache_memory() - max_pool_size_);
  }
-  // Copy to managed here if the buffer is not on the right device
-  if (buf->device >= 0 && buf->device != device) {
-    copy_to_managed(*buf);
-  }
  return Buffer{buf};
 }

-Buffer CudaAllocator::malloc(size_t size) {
-  return malloc_async(size, -1, nullptr);
-}
-
 void CudaAllocator::free(Buffer buffer) {
  auto* buf = static_cast<CudaBuffer*>(buffer.ptr());
  if (!buf) {
    return;
  }
-  if (buf->size == 0) {
-    delete buf;
-    return;
-  }

  std::unique_lock lock(mutex_);
  active_memory_ -= buf->size;
@@ -246,11 +169,7 @@ void CudaAllocator::cuda_free(CudaBuffer* buf) {
  if (scalar_pool_.in_pool(buf)) {
    scalar_pool_.free(buf);
  } else {
-    if (buf->device >= 0) {
-      CHECK_CUDA_ERROR(cudaFreeAsync(buf->data, free_streams_[buf->device]));
-    } else {
-      CHECK_CUDA_ERROR(cudaFree(buf->data));
-    }
+    cudaFree(buf->data);
    delete buf;
  }
 }
@@ -301,17 +220,6 @@ CudaAllocator& allocator() {
  return *allocator_;
 }

-Buffer malloc_async(size_t size, CommandEncoder& encoder) {
-  auto buffer = allocator().malloc_async(
-      size, encoder.device().cuda_device(), encoder.stream());
-  if (size && !buffer.ptr()) {
-    std::ostringstream msg;
-    msg << "[malloc_async] Unable to allocate " << size << " bytes.";
-    throw std::runtime_error(msg.str());
-  }
-  return buffer;
-}
-
 } // namespace cu

 namespace allocator {
@@ -324,11 +232,7 @@ void* Buffer::raw_ptr() {
  if (!ptr_) {
    return nullptr;
  }
-  auto& cbuf = *static_cast<cu::CudaBuffer*>(ptr_);
-  if (cbuf.device != -1) {
-    copy_to_managed(cbuf);
-  }
-  return cbuf.data;
+  return static_cast<cu::CudaBuffer*>(ptr_)->data;
 }

 } // namespace allocator
--- a/mlx/backend/cuda/allocator.h
+++ b/mlx/backend/cuda/allocator.h
@@ -4,24 +4,19 @@

 #include "mlx/allocator.h"
 #include "mlx/backend/common/buffer_cache.h"
-#include "mlx/backend/cuda/cuda_utils.h"

-#include <cuda_runtime.h>
 #include <mutex>
 #include <set>
 #include <utility>

 namespace mlx::core::cu {

-class CommandEncoder;
-
 using allocator::Buffer;

 // Stores cuda-managed unified memory.
 struct CudaBuffer {
  void* data;
  size_t size;
-  int device; // -1 for managed
 };

 class SmallSizePool {
@@ -50,7 +45,6 @@ class SmallSizePool {
 class CudaAllocator : public allocator::Allocator {
 public:
  Buffer malloc(size_t size) override;
-  Buffer malloc_async(size_t size, int device, cudaStream_t stream);
  void free(Buffer buffer) override;
  size_t size(Buffer buffer) const override;

@@ -71,19 +65,13 @@ class CudaAllocator : public allocator::Allocator {

  std::mutex mutex_;
  size_t memory_limit_;
-  size_t free_limit_;
-  size_t total_memory_;
  size_t max_pool_size_;
  BufferCache<CudaBuffer> buffer_cache_;
  size_t active_memory_{0};
  size_t peak_memory_{0};
-  std::vector<cudaStream_t> free_streams_;
-  std::vector<cudaMemPool_t> mem_pools_;
  SmallSizePool scalar_pool_;
 };

 CudaAllocator& allocator();

-Buffer malloc_async(size_t size, CommandEncoder& encoder);
-
 } // namespace mlx::core::cu
--- a/mlx/backend/cuda/arange.cu
+++ b/mlx/backend/cuda/arange.cu
@@ -41,8 +41,9 @@ void Arange::eval_gpu(const std::vector<array>& inputs, array& out) {
  if (out.size() == 0) {
    return;
  }
+  out.set_data(allocator::malloc(out.nbytes()));
+
  auto& encoder = cu::get_command_encoder(stream());
-  out.set_data(cu::malloc_async(out.nbytes(), encoder));
  encoder.set_output_array(out);

  dispatch_int_float_types(out.dtype(), "Arange", [&](auto type_tag) {
@@ -57,7 +58,7 @@ void Arange::eval_gpu(const std::vector<array>& inputs, array& out) {
          num_blocks,
          block_dims,
          0,
-          gpu_ptr<OutType>(out),
+          out.data<OutType>(),
          out.data_size(),
          static_cast<CTYPE>(start_),
          static_cast<CTYPE>(start_ + step_) - static_cast<CTYPE>(start_));
--- a/mlx/backend/cuda/arg_reduce.cu
+++ b/mlx/backend/cuda/arg_reduce.cu
@@ -140,10 +140,8 @@ void ArgReduce::eval_gpu(const std::vector<array>& inputs, array& out) {
  nvtx3::scoped_range r("ArgReduce::eval_gpu");
  assert(inputs.size() == 1);
  auto& in = inputs[0];
-
+  out.set_data(allocator::malloc(out.nbytes()));
  auto& s = stream();
-  auto& encoder = cu::get_command_encoder(s);
-  out.set_data(cu::malloc_async(out.nbytes(), encoder));

  // Prepare the shapes, strides and axis arguments.
  Shape shape = remove_index(in.shape(), axis_);
@@ -156,6 +154,7 @@ void ArgReduce::eval_gpu(const std::vector<array>& inputs, array& out) {
  int32_t ndim = shape.size();

  // ArgReduce.
+  auto& encoder = cu::get_command_encoder(s);
  encoder.set_input_array(in);
  encoder.set_output_array(out);
  dispatch_real_types(in.dtype(), "ArgReduce", [&](auto type_tag) {
@@ -173,8 +172,8 @@ void ArgReduce::eval_gpu(const std::vector<array>& inputs, array& out) {
          num_blocks,
          block_dim(),
          0,
-          gpu_ptr<T>(in),
-          gpu_ptr<uint32_t>(out),
+          in.data<T>(),
+          out.data<uint32_t>(),
          out.size(),
          const_param(shape),
          const_param(in_strides),
--- a/mlx/backend/cuda/binary/binary.cuh
+++ b/mlx/backend/cuda/binary/binary.cuh
@@ -292,9 +292,9 @@ void binary_op_gpu_inplace(
                        {num_blocks_x, num_blocks_y},
                        block_dims,
                        0,
-                        gpu_ptr<InType>(a),
-                        gpu_ptr<InType>(b),
-                        gpu_ptr<OutType>(out),
+                        a.data<InType>(),
+                        b.data<InType>(),
+                        out.data<OutType>(),
                        rest,
                        const_param<dims_constant()>(shape),
                        const_param<dims_constant()>(a_strides),
@@ -310,9 +310,9 @@ void binary_op_gpu_inplace(
                      {num_blocks_x, num_blocks_y},
                      block_dims,
                      0,
-                      gpu_ptr<InType>(a),
-                      gpu_ptr<InType>(b),
-                      gpu_ptr<OutType>(out),
+                      a.data<InType>(),
+                      b.data<InType>(),
+                      out.data<OutType>(),
                      rest,
                      const_param(shape),
                      const_param(a_strides),
@@ -339,9 +339,9 @@ void binary_op_gpu_inplace(
                num_blocks,
                block_dims,
                0,
-                gpu_ptr<InType>(a),
-                gpu_ptr<InType>(b),
-                gpu_ptr<OutType>(out),
+                a.data<InType>(),
+                b.data<InType>(),
+                out.data<OutType>(),
                out.data_size());
          });
        }
@@ -365,10 +365,7 @@ void binary_op_gpu(
  auto& a = inputs[0];
  auto& b = inputs[1];
  auto bopt = get_binary_op_type(a, b);
-  auto& encoder = cu::get_command_encoder(s);
-
-  set_binary_op_output_data(
-      a, b, out, bopt, [&](auto n) { return cu::malloc_async(n, encoder); });
+  set_binary_op_output_data(a, b, out, bopt);
  binary_op_gpu_inplace<Op>(inputs, out, op, s);
 }

--- a/mlx/backend/cuda/binary_two.cu
+++ b/mlx/backend/cuda/binary_two.cu
@@ -245,16 +245,14 @@ void binary_two_op_gpu_inplace(
  auto& out_a = outputs[0];
  auto& out_b = outputs[1];
  auto bopt = get_binary_op_type(a, b);
-  auto& encoder = cu::get_command_encoder(s);
-  set_binary_op_output_data(
-      a, b, out_a, bopt, [&](auto n) { return cu::malloc_async(n, encoder); });
-  set_binary_op_output_data(
-      a, b, out_b, bopt, [&](auto n) { return cu::malloc_async(n, encoder); });
+  set_binary_op_output_data(a, b, out_a, bopt);
+  set_binary_op_output_data(a, b, out_b, bopt);

  if (out_a.size() == 0) {
    return;
  }

+  auto& encoder = cu::get_command_encoder(s);
  encoder.set_input_array(a);
  encoder.set_input_array(b);
  encoder.set_output_array(out_a);
@@ -315,10 +313,10 @@ void binary_two_op_gpu_inplace(
                        {num_blocks_x, num_blocks_y},
                        block_dims,
                        0,
-                        gpu_ptr<InType>(a),
-                        gpu_ptr<InType>(b),
-                        gpu_ptr<OutType>(out_a),
-                        gpu_ptr<OutType>(out_b),
+                        a.data<InType>(),
+                        b.data<InType>(),
+                        out_a.data<OutType>(),
+                        out_b.data<OutType>(),
                        rest,
                        const_param<dims_constant()>(shape),
                        const_param<dims_constant()>(a_strides),
@@ -334,10 +332,10 @@ void binary_two_op_gpu_inplace(
                      {num_blocks_x, num_blocks_y},
                      block_dims,
                      0,
-                      gpu_ptr<InType>(a),
-                      gpu_ptr<InType>(b),
-                      gpu_ptr<OutType>(out_a),
-                      gpu_ptr<OutType>(out_b),
+                      a.data<InType>(),
+                      b.data<InType>(),
+                      out_a.data<OutType>(),
+                      out_b.data<OutType>(),
                      rest,
                      const_param(shape),
                      const_param(a_strides),
@@ -368,10 +366,10 @@ void binary_two_op_gpu_inplace(
                num_blocks,
                block_dims,
                0,
-                gpu_ptr<InType>(a),
-                gpu_ptr<InType>(b),
-                gpu_ptr<OutType>(out_a),
-                gpu_ptr<OutType>(out_b),
+                a.data<InType>(),
+                b.data<InType>(),
+                out_a.data<OutType>(),
+                out_b.data<OutType>(),
                out_a.data_size());
          });
        }
--- a/mlx/backend/cuda/compiled.cpp
+++ b/mlx/backend/cuda/compiled.cpp
@@ -293,13 +293,8 @@ void Compiled::eval_gpu(
    }
  }

-  auto& encoder = cu::get_command_encoder(s);
-
  // Put outputs.
-  compiled_allocate_outputs(
-      inputs, outputs, is_constant_, contiguous, [&](auto n) {
-        return cu::malloc_async(n, encoder);
-      });
+  compiled_allocate_outputs(inputs, outputs, is_constant_, contiguous);
  for (auto& x : outputs) {
    args.append(x);
  }
@@ -329,6 +324,7 @@ void Compiled::eval_gpu(
    kernel_name += fmt::format(
        "_strided<{}, {}, {}>", shape.size(), index_type, work_per_thread);
  }
+  auto& encoder = cu::get_command_encoder(s);
  for (const auto& in : inputs) {
    encoder.set_input_array(in);
  }
--- a/mlx/backend/cuda/conv.cpp
+++ b/mlx/backend/cuda/conv.cpp
@@ -15,16 +15,19 @@ namespace mlx::core {

 namespace {

-enum ConvBackendType {
-  CONV_FALLBACK,
-  CONV_FORWARD,
-  CONV_BACKWARD_INPUT,
-  CONV_BACKWARD_WEIGHT,
-};
+// Aliases for the cuDNN backend descriptor types, for better readability.
+#define CONV_FORWARD CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR
+#define CONV_BACKWARD_INPUT \
+  CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR
+#define CONV_BACKWARD_WEIGHT \
+  CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_FILTER_DESCRIPTOR
+
+// Custom placeholder representing the fallback kernel.
+#define CONV_FALLBACK static_cast<cudnnBackendDescriptorType_t>(-1)

 struct ConvCacheKey {
  int device_id;
-  fe::DataType_t cudnn_dtype;
+  cudnnDataType_t cudnn_dtype;
  std::array<int, MAX_NDIM> input_shape;
  std::array<int, MAX_NDIM> weight_shape;
  std::array<int, MAX_NDIM> stride;
@@ -41,13 +44,15 @@ struct ConvCacheKey {
 auto& conv_cache() {
  static LRUBytesKeyCache<
      ConvCacheKey,
-      std::pair<ConvBackendType, std::optional<DnnGraph>>>
+      std::pair<
+          cudnnBackendDescriptorType_t,
+          std::optional<cudnn_frontend::ExecutionPlan>>>
      cache("MLX_CUDA_CONV_CACHE_SIZE", /* default_capacity */ 128);
  return cache;
 }

-auto get_conv_settings(
-    ConvBackendType backend_type,
+auto get_conv_op_settings(
+    cudnnBackendDescriptorType_t backend_type,
    array& x,
    array& w,
    array& y,
@@ -63,8 +68,8 @@ auto get_conv_settings(
    for (int i = 0; i < padding_lo.size(); ++i) {
      int wt_size = 1 + kernel_dilation[i] * (w.shape(1 + i) - 1);
      padding_lo[i] = wt_size - padding_lo[i] - 1;
-      int in_size = 1 + kernel_strides[i] * (y.shape(1 + i) - 1);
-      int out_size = 1 + input_dilation[i] * (x.shape(1 + i) - 1);
+      int in_size = 1 + kernel_strides[i] * (x.shape(1 + i) - 1);
+      int out_size = 1 + input_dilation[i] * (y.shape(1 + i) - 1);
      padding_hi[i] = out_size - in_size + padding_hi[i];
    }
    return std::make_tuple(
@@ -90,57 +95,49 @@ auto get_conv_settings(
  }
 }

-std::optional<DnnGraph> build_conv_graph(
+std::optional<cudnn_frontend::OperationGraph> build_conv_op_graph(
    cu::CommandEncoder& encoder,
-    ConvBackendType backend_type,
+    cudnnBackendDescriptorType_t backend_type,
    Dtype dtype,
    array& x,
    array& w,
    array& y,
-    const std::vector<int64_t>& stride,
-    const std::vector<int64_t>& padding_lo,
-    const std::vector<int64_t>& padding_hi,
-    const std::vector<int64_t>& dilation) {
-  auto compute_dtype =
-      (dtype == float16 || dtype == bfloat16) ? float32 : dtype;
-  DnnGraph graph(encoder.device().cudnn_handle(), dtype, compute_dtype);
-  auto x_ = graph.tensor_nchw("X", 'x', x);
-  auto w_ = graph.tensor_nchw("W", 'w', w);
+    const SmallVector<int64_t>& stride,
+    const SmallVector<int64_t>& padding_lo,
+    const SmallVector<int64_t>& padding_hi,
+    const SmallVector<int64_t>& dilation) {
+  try {
+    auto compute_dtype = (dtype == float16 || dtype == bfloat16)
+        ? CUDNN_DATA_FLOAT
+        : dtype_to_cudnn_type(dtype);
+    auto conv_desc = cudnn_frontend::ConvDescBuilder()
+                         .setDataType(compute_dtype)
+                         .setMathMode(CUDNN_CROSS_CORRELATION)
+                         .setNDims(stride.size())
+                         .setStrides(stride.size(), stride.data())
+                         .setPrePadding(padding_lo.size(), padding_lo.data())
+                         .setPostPadding(padding_hi.size(), padding_hi.data())
+                         .setDilation(dilation.size(), dilation.data())
+                         .build();

-  auto set_options = [&](auto& options) {
-    options.set_compute_data_type(dtype_to_cudnn_type(compute_dtype))
-        .set_convolution_mode(fe::ConvolutionMode_t::CROSS_CORRELATION)
-        .set_stride(stride)
-        .set_pre_padding(padding_lo)
-        .set_post_padding(padding_hi)
-        .set_dilation(dilation);
-  };
+    auto op = cudnn_frontend::OperationBuilder(backend_type)
+                  .setxDesc(build_cudnn_tensor_nchw('x', x))
+                  .setwDesc(build_cudnn_tensor_nchw('w', w))
+                  .setyDesc(build_cudnn_tensor_nchw('y', y))
+                  .setcDesc(conv_desc)
+                  .build();

-  std::shared_ptr<fe::graph::Tensor_attributes> y_;
-  if (backend_type == CONV_FORWARD) {
-    auto options = fe::graph::Conv_fprop_attributes();
-    set_options(options);
-    y_ = graph.conv_fprop(x_, w_, options);
-  } else if (backend_type == CONV_BACKWARD_INPUT) {
-    auto options = fe::graph::Conv_dgrad_attributes();
-    set_options(options);
-    y_ = graph.conv_dgrad(x_, w_, options);
-  } else if (backend_type == CONV_BACKWARD_WEIGHT) {
-    auto options = fe::graph::Conv_wgrad_attributes();
-    set_options(options);
-    y_ = graph.conv_wgrad(w_, x_, options);
-  }
-  graph.tensor_nchw(y_, 'y', y)->set_output(true);
-
-  if (graph.prepare().is_bad()) {
+    std::array<cudnn_frontend::Operation const*, 1> ops = {&op};
+    return cudnn_frontend::OperationGraphBuilder()
+        .setHandle(encoder.device().cudnn_handle())
+        .setOperationGraph(ops.size(), ops.data())
+        .build();
+  } catch (cudnn_frontend::cudnnException& error) {
+    if (error.getCudnnStatus() != CUDNN_STATUS_BAD_PARAM) {
+      throw;
+    }
    return std::nullopt;
  }
-  graph.deselect_numeric_notes({fe::NumericalNote_t::DOWN_CONVERT_INPUTS});
-  if (dtype == float32 && !env::enable_tf32()) {
-    graph.deselect_numeric_notes({fe::NumericalNote_t::TENSOR_CORE});
-  }
-  CHECK_CUDNN_FE_ERROR(graph.build());
-  return graph;
 }

 // Transpose from (C_out, H, W, C_in / groups) to (C_in, H, W, C_out / groups).
@@ -184,7 +181,7 @@ array group_transpose(
 // eval_gpu, with cost of possible redundant copies.
 std::tuple<array, array, array> prepare_args(
    cu::CommandEncoder& encoder,
-    ConvBackendType backend_type,
+    cudnnBackendDescriptorType_t backend_type,
    array in,
    array wt,
    array out,
@@ -224,11 +221,27 @@ std::tuple<array, array, array> prepare_args(
  return {std::move(in), std::move(wt), std::move(out)};
 }

+// Map the in/wt/out arrays onto the x/w/y slots for the given backend type.
+inline std::tuple<array&, array&, array&> dispatch_args(
+    cudnnBackendDescriptorType_t backend_type,
+    array& in,
+    array& wt,
+    array& out) {
+  if (backend_type == CONV_BACKWARD_INPUT) {
+    return {out, wt, in};
+  }
+  if (backend_type == CONV_BACKWARD_WEIGHT) {
+    return {in, out, wt};
+  }
+  // Every other backend type (e.g. forward) keeps the in/wt/out order.
+  return {in, wt, out};
+}
+
 // Register inputs and outputs before actually running conv op. Can only be
 // called once per eval_gpu.
 void register_args(
    cu::CommandEncoder& encoder,
-    ConvBackendType backend_type,
+    cudnnBackendDescriptorType_t backend_type,
    array& in,
    array& wt,
    array& intermediate_out,
@@ -257,19 +270,19 @@ void Convolution::eval_gpu(const std::vector<array>& inputs, array& out_) {
  if (out_.size() == 0) {
    return;
  }
-  auto& s = stream();
-  auto& encoder = cu::get_command_encoder(s);

  assert(inputs.size() == 2);
  array in = inputs[0];
  array wt = inputs[1];
  array out = out_;
-  out.set_data(cu::malloc_async(out.nbytes(), encoder));
+  out.set_data(allocator::malloc(out.nbytes()));
  Dtype dtype = out.dtype();

+  auto& s = stream();
+  auto& encoder = cu::get_command_encoder(s);
+
  // Search cache.
-  BytesKey<ConvCacheKey> cache_key;
-  cache_key.pod = {
+  ConvCacheKey cache_key{
      encoder.device().cuda_device(),
      dtype_to_cudnn_type(dtype),
      vector_key(in.shape()),
@@ -284,19 +297,16 @@ void Convolution::eval_gpu(const std::vector<array>& inputs, array& out_) {
      get_alignment(wt),
      get_alignment(out)};
  if (auto it = conv_cache().find(cache_key); it != conv_cache().end()) {
-    auto& [backend_type, graph] = it->second;
-    if (graph) {
-      // Run cached graph.
+    auto& [backend_type, plan] = it->second;
+    if (plan) {
+      // Run cached plan.
      std::tie(in, wt, out) =
          prepare_args(encoder, backend_type, in, wt, out, groups_, s);
      register_args(encoder, backend_type, in, wt, out, out_);
-      CHECK_CUDNN_FE_ERROR(graph->encode_capturing(
-          encoder,
-          {
-              {'x', gpu_ptr<void>(in)},
-              {'w', gpu_ptr<void>(wt)},
-              {'y', gpu_ptr<void>(out)},
-          }));
+      auto [x, w, y] = dispatch_args(backend_type, in, wt, out);
+      if (!encode_cudnn_plan(encoder, *plan, {'x', 'w', 'y'}, x, w, y)) {
+        throw std::runtime_error("[conv] Cached plan failed to execute.");
+      }
    } else {
      // Run fallback kernel.
      gemm_conv(
@@ -317,7 +327,7 @@ void Convolution::eval_gpu(const std::vector<array>& inputs, array& out_) {

  // There is no reliable way to deduce the proper cuDNN backend for the
  // convolution, so we make a best guess and then try.
-  SmallVector<ConvBackendType, 2> try_backends;
+  SmallVector<cudnnBackendDescriptorType_t, 2> try_backends;
  if (flip_) {
    // When weight is flipped, we assume it is backward input convolution.
    try_backends.push_back(CONV_BACKWARD_INPUT);
@@ -335,12 +345,13 @@ void Convolution::eval_gpu(const std::vector<array>& inputs, array& out_) {
  }

  // Try to build op graph.
-  ConvBackendType backend_type;
-  std::optional<DnnGraph> graph;
+  cudnnBackendDescriptorType_t backend_type;
+  std::optional<cudnn_frontend::OperationGraph> op_graph;
  for (auto try_backend : try_backends) {
-    auto [x, w, y] =
+    auto [in_copy, wt_copy, out_copy] =
        prepare_args(encoder, try_backend, in, wt, out, groups_, s);
-    auto [stride, padding_lo, padding_hi, dilation] = get_conv_settings(
+    auto [x, w, y] = dispatch_args(try_backend, in_copy, wt_copy, out_copy);
+    auto [stride, padding_lo, padding_hi, dilation] = get_conv_op_settings(
        try_backend,
        x,
        w,
@@ -350,7 +361,7 @@ void Convolution::eval_gpu(const std::vector<array>& inputs, array& out_) {
        padding_hi_,
        kernel_dilation_,
        input_dilation_);
-    graph = build_conv_graph(
+    op_graph = build_conv_op_graph(
        encoder,
        try_backend,
        dtype,
@@ -361,27 +372,30 @@ void Convolution::eval_gpu(const std::vector<array>& inputs, array& out_) {
        padding_lo,
        padding_hi,
        dilation);
-    if (graph) {
+    if (op_graph) {
      backend_type = try_backend;
-      in = std::move(x);
-      wt = std::move(w);
-      out = std::move(y);
+      in = std::move(in_copy);
+      wt = std::move(wt_copy);
+      out = std::move(out_copy);
      break;
    }
  }

-  if (graph) {
-    register_args(encoder, backend_type, in, wt, out, out_);
-    CHECK_CUDNN_FE_ERROR(graph->encode_capturing(
-        encoder,
-        {
-            {'x', gpu_ptr<void>(in)},
-            {'w', gpu_ptr<void>(wt)},
-            {'y', gpu_ptr<void>(out)},
-        }));
-    conv_cache().emplace(
-        cache_key, std::make_pair(backend_type, std::move(*graph)));
-    return;
+  if (op_graph) {
+    // Find a plan for the graph and execute it.
+    auto plan = find_cudnn_plan_from_op_graph(
+        encoder.device().cudnn_handle(), backend_type, dtype, *op_graph);
+    if (plan) {
+      // Setup inputs and outputs.
+      register_args(encoder, backend_type, in, wt, out, out_);
+
+      auto [x, w, y] = dispatch_args(backend_type, in, wt, out);
+      if (encode_cudnn_plan(encoder, *plan, {'x', 'w', 'y'}, x, w, y)) {
+        conv_cache().emplace(
+            cache_key, std::make_pair(backend_type, std::move(*plan)));
+        return;
+      }
+    }
  }

  // Use fallback kernel for settings not supported by cuDNN.
--- a/mlx/backend/cuda/conv/gemm_conv.cu
+++ b/mlx/backend/cuda/conv/gemm_conv.cu
@@ -86,7 +86,7 @@ array unfold_inputs_nd(
    int mat_N,
    ConvParams<NDIM>& params) {
  array unfolded({mat_M, mat_K}, in.dtype(), nullptr, {});
-  unfolded.set_data(cu::malloc_async(unfolded.nbytes(), encoder));
+  unfolded.set_data(allocator::malloc(unfolded.nbytes()));
  encoder.add_temporary(unfolded);

  int filter_size = params.C;
@@ -118,8 +118,8 @@ array unfold_inputs_nd(
        num_blocks,
        block_dims,
        0,
-        gpu_ptr<DataType>(in),
-        gpu_ptr<DataType>(unfolded),
+        in.data<DataType>(),
+        unfolded.data<DataType>(),
        filter_size,
        out_pixels,
        params);
--- a/mlx/backend/cuda/conv/gemm_grouped_conv.cu
+++ b/mlx/backend/cuda/conv/gemm_grouped_conv.cu
@@ -89,7 +89,7 @@ array grouped_unfold_transpose_inputs_nd(
    int mat_N,
    ConvParams<NDIM>& params) {
  array unfolded({mat_M, mat_K * params.groups}, in.dtype(), nullptr, {});
-  unfolded.set_data(cu::malloc_async(unfolded.nbytes(), encoder));
+  unfolded.set_data(allocator::malloc(unfolded.nbytes()));
  encoder.add_temporary(unfolded);

  int filter_size = params.C;
@@ -121,8 +121,8 @@ array grouped_unfold_transpose_inputs_nd(
        num_blocks,
        block_dims,
        0,
-        gpu_ptr<DataType>(in),
-        gpu_ptr<DataType>(unfolded),
+        in.data<DataType>(),
+        unfolded.data<DataType>(),
        filter_size,
        out_pixels,
        params);
--- a/mlx/backend/cuda/copy.cu
+++ b/mlx/backend/cuda/copy.cu
@@ -5,21 +5,6 @@

 namespace mlx::core {

-void copy_gpu(const array& in, array& out, CopyType ctype, const Stream& s) {
-  auto& encoder = cu::get_command_encoder(s);
-  bool donated = set_copy_output_data(
-      in, out, ctype, [&](auto n) { return cu::malloc_async(n, encoder); });
-  if (donated && in.dtype() == out.dtype()) {
-    // If the output has the same type as the input then there is nothing to
-    // copy, just use the buffer.
-    return;
-  }
-  if (ctype == CopyType::GeneralGeneral) {
-    ctype = CopyType::General;
-  }
-  copy_gpu_inplace(in, out, ctype, s);
-}
-
 void copy_gpu_inplace(
    const array& in,
    array& out,
@@ -102,31 +87,11 @@ void fill_gpu(const array& in, array& out, const Stream& s) {
  if (out.size() == 0) {
    return;
  }
+  out.set_data(allocator::malloc(out.nbytes()));
  auto& encoder = cu::get_command_encoder(s);
-  out.set_data(cu::malloc_async(out.nbytes(), encoder));
  encoder.set_input_array(in);
  encoder.set_output_array(out);
  copy_contiguous(encoder, CopyType::Scalar, in, out, 0, 0);
 }

-void reshape_gpu(const array& in, array& out, Stream s) {
-  auto [copy_necessary, out_strides] = prepare_reshape(in, out);
-  if (copy_necessary) {
-    auto& encoder = cu::get_command_encoder(s);
-    out.set_data(cu::malloc_async(out.nbytes(), encoder));
-    copy_gpu_inplace(
-        in,
-        out,
-        in.shape(),
-        in.strides(),
-        make_contiguous_strides(in.shape()),
-        0,
-        0,
-        CopyType::General,
-        s);
-  } else {
-    shared_buffer_reshape(in, out_strides, out);
-  }
-}
-
 } // namespace mlx::core
--- a/mlx/backend/cuda/copy/copy_contiguous.cu
+++ b/mlx/backend/cuda/copy/copy_contiguous.cu
@@ -77,8 +77,8 @@ void copy_contiguous(
            num_blocks,
            block_dims,
            0,
-            gpu_ptr<InType>(in) + in_offset,
-            gpu_ptr<OutType>(out) + out_offset,
+            in.data<InType>() + in_offset,
+            out.data<OutType>() + out_offset,
            out.data_size());
      });
    });
--- a/mlx/backend/cuda/copy/copy_general.cu
+++ b/mlx/backend/cuda/copy/copy_general.cu
@@ -106,8 +106,8 @@ void copy_general(
            using InType = cuda_type_t<MLX_GET_TYPE(in_type_tag)>;
            using OutType = cuda_type_t<MLX_GET_TYPE(out_type_tag)>;
            using IdxT = std::conditional_t<large(), int64_t, int32_t>;
-            const InType* in_ptr = gpu_ptr<InType>(in) + offset_in;
-            OutType* out_ptr = gpu_ptr<OutType>(out) + offset_out;
+            const InType* in_ptr = in.data<InType>() + offset_in;
+            OutType* out_ptr = out.data<OutType>() + offset_out;
            int ndim = shape.size();
            size_t data_size = 1;
            for (auto& s : shape)
--- a/mlx/backend/cuda/copy/copy_general_dynamic.cu
+++ b/mlx/backend/cuda/copy/copy_general_dynamic.cu
@@ -69,8 +69,8 @@ void copy_general_dynamic(
            using InType = cuda_type_t<MLX_GET_TYPE(in_type_tag)>;
            using OutType = cuda_type_t<MLX_GET_TYPE(out_type_tag)>;
            using IdxT = std::conditional_t<large(), int64_t, int32_t>;
-            const InType* in_ptr = gpu_ptr<InType>(in) + offset_in;
-            OutType* out_ptr = gpu_ptr<OutType>(out) + offset_out;
+            const InType* in_ptr = in.data<InType>() + offset_in;
+            OutType* out_ptr = out.data<OutType>() + offset_out;
            int ndim = shape.size();
            if (ndim <= 3) {
              dispatch_1_2_3(ndim, [&](auto dims_constant) {
@@ -90,8 +90,8 @@ void copy_general_dynamic(
                    const_param<dims_constant()>(shape),
                    const_param<dims_constant()>(strides_in),
                    const_param<dims_constant()>(strides_out),
-                    gpu_ptr<int64_t>(dynamic_offset_in),
-                    gpu_ptr<int64_t>(dynamic_offset_out));
+                    dynamic_offset_in.data<int64_t>(),
+                    dynamic_offset_out.data<int64_t>());
              });
            } else { // ndim >= 4
              auto [num_blocks, block_dims] = get_launch_args(out, large());
@@ -107,8 +107,8 @@ void copy_general_dynamic(
                  const_param(strides_in),
                  const_param(strides_out),
                  ndim,
-                  gpu_ptr<int64_t>(dynamic_offset_in),
-                  gpu_ptr<int64_t>(dynamic_offset_out));
+                  dynamic_offset_in.data<int64_t>(),
+                  dynamic_offset_out.data<int64_t>());
            }
          });
    });
--- a/mlx/backend/cuda/copy/copy_general_input.cu
+++ b/mlx/backend/cuda/copy/copy_general_input.cu
@@ -92,17 +92,14 @@ void copy_general_input(
            using InType = cuda_type_t<MLX_GET_TYPE(in_type_tag)>;
            using OutType = cuda_type_t<MLX_GET_TYPE(out_type_tag)>;
            using IdxT = std::conditional_t<large(), int64_t, int32_t>;
-            const InType* in_ptr = gpu_ptr<InType>(in) + offset_in;
-            OutType* out_ptr = gpu_ptr<OutType>(out) + offset_out;
+            const InType* in_ptr = in.data<InType>() + offset_in;
+            OutType* out_ptr = out.data<OutType>() + offset_out;
            int ndim = shape.size();
-
-            int work_per_thread = 8;
+            int work_per_thread = 1;
            auto dim0 = ndim > 0 ? shape.back() : 1;
            auto rest = out.size() / dim0;
-            if (dim0 >= 4 && dim0 < 8) {
+            if (dim0 >= 4) {
              work_per_thread = 4;
-            } else if (dim0 < 4) {
-              work_per_thread = 1;
            }
            dim0 = (dim0 + work_per_thread - 1) / work_per_thread;
            auto block_dims = get_block_dims(dim0, rest, 1);
@@ -113,10 +110,7 @@ void copy_general_input(
              dispatch_1_2_3(ndim, [&](auto dims_constant) {
                auto kernel =
                    cu::copy_g_nd<InType, OutType, IdxT, dims_constant(), 1>;
-                if (work_per_thread == 8) {
-                  kernel =
-                      cu::copy_g_nd<InType, OutType, IdxT, dims_constant(), 8>;
-                } else if (work_per_thread == 4) {
+                if (work_per_thread == 4) {
                  kernel =
                      cu::copy_g_nd<InType, OutType, IdxT, dims_constant(), 4>;
                }
@@ -133,9 +127,7 @@ void copy_general_input(
              });
            } else { // ndim >= 4
              auto kernel = cu::copy_g<InType, OutType, IdxT, 1>;
-              if (work_per_thread == 8) {
-                kernel = cu::copy_g<InType, OutType, IdxT, 8>;
-              } else if (work_per_thread == 4) {
+              if (work_per_thread == 4) {
                kernel = cu::copy_g<InType, OutType, IdxT, 4>;
              }
              encoder.add_kernel_node(
--- a/mlx/backend/cuda/cuda_utils.h
+++ b/mlx/backend/cuda/cuda_utils.h
@@ -1,89 +0,0 @@
-// Copyright © 2025 Apple Inc.
-
-#pragma once
-
-#include <cublasLt.h>
-#include <cuda.h>
-#include <cuda_runtime.h>
-#include <cudnn.h>
-
-namespace mlx::core {
-
-// Throw exception if the cuda API does not succeed.
-void check_cublas_error(const char* name, cublasStatus_t err);
-void check_cuda_error(const char* name, cudaError_t err);
-void check_cuda_error(const char* name, CUresult err);
-void check_cudnn_error(const char* name, cudnnStatus_t err);
-
-// The macro version that prints the command that failed.
-#define CHECK_CUBLAS_ERROR(cmd) check_cublas_error(#cmd, (cmd))
-#define CHECK_CUDA_ERROR(cmd) check_cuda_error(#cmd, (cmd))
-#define CHECK_CUDNN_ERROR(cmd) check_cudnn_error(#cmd, (cmd))
-
-// Base class for RAII managed CUDA resources.
-template <typename Handle, cudaError_t (*Destroy)(Handle)>
-class CudaHandle {
- public:
-  CudaHandle(Handle handle = nullptr) : handle_(handle) {}
-
-  CudaHandle(CudaHandle&& other) : handle_(other.handle_) {
-    assert(this != &other);
-    other.handle_ = nullptr;
-  }
-
-  ~CudaHandle() {
-    // Skip if there was an error to avoid throwing in the destructors
-    if (cudaPeekAtLastError() != cudaSuccess) {
-      return;
-    }
-    reset();
-  }
-
-  CudaHandle(const CudaHandle&) = delete;
-  CudaHandle& operator=(const CudaHandle&) = delete;
-
-  CudaHandle& operator=(CudaHandle&& other) {
-    assert(this != &other);
-    reset();
-    std::swap(handle_, other.handle_);
-    return *this;
-  }
-
-  void reset() {
-    if (handle_ != nullptr) {
-      CHECK_CUDA_ERROR(Destroy(handle_));
-      handle_ = nullptr;
-    }
-  }
-
-  operator Handle() const {
-    return handle_;
-  }
-
- protected:
-  Handle handle_;
-};
-
-namespace cu {
-class Device;
-}; // namespace cu
-
-// Wrappers of CUDA resources.
-class CudaGraph : public CudaHandle<cudaGraph_t, cudaGraphDestroy> {
- public:
-  using CudaHandle::CudaHandle;
-  explicit CudaGraph(cu::Device& device);
-  void end_capture(cudaStream_t stream);
-};
-
-class CudaGraphExec : public CudaHandle<cudaGraphExec_t, cudaGraphExecDestroy> {
- public:
-  void instantiate(cudaGraph_t graph);
-};
-
-class CudaStream : public CudaHandle<cudaStream_t, cudaStreamDestroy> {
- public:
-  explicit CudaStream(cu::Device& device);
-};
-
-} // namespace mlx::core
--- a/mlx/backend/cuda/cudnn_utils.cpp
+++ b/mlx/backend/cuda/cudnn_utils.cpp
@@ -7,26 +7,32 @@ namespace mlx::core {

 namespace {

-#define RETURN_IF_ERROR(cmd)          \
-  if (auto ret = cmd; ret.is_bad()) { \
-    return ret;                       \
-  }
+// Build a cuDNN tensor descriptor for |x| using explicit |shape| / |strides|.
+template <typename Vec>
+inline cudnn_frontend::Tensor build_cudnn_tensor(
+    int64_t id,
+    const array& x,
+    const Vec& shape,
+    const Vec& strides) {
+  return cudnn_frontend::TensorBuilder()
+      .setDim(shape.size(), shape.data())
+      .setStrides(strides.size(), strides.data())
+      .setId(id)
+      .setAlignment(get_alignment(x)) // Alignment of |x|'s data pointer.
+      .setDataType(dtype_to_cudnn_type(x.dtype()))
+      .build();
+}

 // In MLX a singleton dim (shape[dim] == 1) can have any stride, but in cuDNN
 // whether a tensor is contiguous is determined with:
 // shape[dim] == shape[dim + 1] * strides[dim + 1]
 // So a contiguous array with singleton dims in MLX may be mistakenly treated
 // as strided in cuDNN, and we work around it by normalizing the strides.
-std::vector<int64_t> normalized_strides(const array& x) {
-  std::vector<int64_t> strides(x.strides().begin(), x.strides().end());
-  if (std::all_of(
-          strides.begin(), strides.end(), [](int64_t s) { return s == 0; })) {
-    strides.back() = 1;
-    return strides;
-  }
+Strides normalized_strides(const array& x) {
  if (!x.flags().row_contiguous || x.ndim() < 2) {
-    return strides;
+    return x.strides();
  }
+  Strides strides = x.strides();
  for (int i = x.ndim() - 2; i >= 0; --i) {
    if (x.shape(i) == 1) {
      strides[i] = x.shape(i + 1) * strides[i + 1];
@@ -36,9 +42,7 @@ std::vector<int64_t> normalized_strides(const array& x) {
 }

 // Return the shape and strides after transposing from NHWC to NCHW.
-inline auto nhwc_to_nchw(const array& x) {
-  auto shape = convert_vector<int64_t>(x.shape());
-  auto strides = normalized_strides(x);
+auto nhwc_to_nchw(SmallVector<int64_t> shape, SmallVector<int64_t> strides) {
  assert(shape.size() >= 3);
  shape.insert(shape.begin() + 1, shape.back());
  shape.erase(shape.end() - 1);
@@ -47,95 +51,225 @@ inline auto nhwc_to_nchw(const array& x) {
  return std::make_tuple(std::move(shape), std::move(strides));
 }

+inline auto nhwc_to_nchw(const array& x) {
+  auto dims = convert_vector<int64_t>(x.shape()); // Shape as int64_t.
+  return nhwc_to_nchw(std::move(dims), normalized_strides(x));
+}
+
+// Return candidate engine configs for |op_graph| after numerical-note filtering.
+cudnn_frontend::EngineConfigList get_cudnn_engine_configs(
+    cudnnBackendDescriptorType_t backend_type,
+    Dtype dtype,
+    cudnn_frontend::OperationGraph& op_graph,
+    bool use_fallback = true) {
+  SmallVector<cudnn_frontend::GeneratorSource, 2> sources;
+  sources.push_back([](auto& op_graph) { // Source 1: heuristics in mode A.
+    auto heuristics = cudnn_frontend::EngineHeuristicsBuilder()
+                          .setOperationGraph(op_graph)
+                          .setHeurMode(CUDNN_HEUR_MODE_A)
+                          .build();
+    return heuristics.getEngineConfig(heuristics.getEngineConfigCount());
+  });
+  if (use_fallback) { // Source 2 (optional): fallback list for |backend_type|.
+    sources.push_back([&backend_type](auto& op_graph) {
+      auto fallback = cudnn_frontend::EngineFallbackListBuilder()
+                          .setOperationGraph(op_graph)
+                          .setOperation(backend_type)
+                          .build();
+      return fallback.getFallbackList();
+    });
+  }
+
+  auto configs =
+      cudnn_frontend::EngineConfigGenerator(sources.size(), sources.data())
+          .generate_engine_config(op_graph);
+
+  cudnn_frontend::EngineConfigList filtered_configs;
+  cudnn_frontend::filter(configs, filtered_configs, [dtype](auto c) {
+    if (cudnn_frontend::hasNumericalNote<
+            CUDNN_NUMERICAL_NOTE_DOWN_CONVERT_INPUTS>(c)) {
+      return true; // NOTE(review): presumably "true" excludes |c| -- confirm.
+    }
+    if (cudnn_frontend::hasNumericalNote<CUDNN_NUMERICAL_NOTE_TENSOR_CORE>(c) &&
+        dtype == float32 && !env::enable_tf32()) {
+      return true; // float32 with TF32 disabled: flag tensor-core engines.
+    }
+    return false;
+  });
+  return filtered_configs;
+}
+
+// Try each config in |engine_configs| in order and return the first execution
+// plan that builds for |op_graph|, or nullopt when none of them is supported.
+std::optional<cudnn_frontend::ExecutionPlan>
+find_cudnn_plan_from_engine_configs(
+    cudnnHandle_t handle,
+    const cudnn_frontend::EngineConfigList& engine_configs,
+    const cudnn_frontend::OperationGraph& op_graph) {
+  auto op_graph_tag = op_graph.getTag();
+  for (const auto& config : engine_configs) {
+    try {
+      return cudnn_frontend::ExecutionPlanBuilder()
+          .setHandle(handle)
+          .setEngineConfig(config, op_graph_tag)
+          .build();
+    } catch (cudnn_frontend::cudnnException& error) {
+      if (error.getCudnnStatus() != CUDNN_STATUS_NOT_SUPPORTED) {
+        throw; // Only "not supported" falls through to the next config.
+      }
+    }
+  }
+  return std::nullopt;
+}
+
+// Allocate the plan's workspace, build the variant pack and call |execute|.
+template <typename F>
+bool prepare_cudnn_plan(
+    cu::CommandEncoder& encoder,
+    cudnn_frontend::ExecutionPlan& plan,
+    int num_args,
+    const int64_t* uids,
+    void** data_ptrs,
+    F&& execute) {
+  int64_t workspace_size = plan.getWorkspaceSize(); // Keep the full 64 bits.
+  array workspace(
+      workspace_size > 0 ? allocator::malloc(workspace_size)
+                         : allocator::Buffer(nullptr),
+      {static_cast<int>(workspace_size)},
+      uint8);
+
+  auto args = cudnn_frontend::VariantPackBuilder()
+                  .setWorkspacePointer(workspace.data<void>())
+                  .setDataPointers(num_args, data_ptrs)
+                  .setUids(num_args, uids)
+                  .build();
+
+  auto handle = encoder.device().cudnn_handle();
+  cudnnSetStream(handle, encoder.stream());
+
+  if (!execute(handle, plan.get_raw_desc(), args.get_raw_desc())) {
+    return false; // Failed: |workspace| is not registered with |encoder|.
+  }
+
+  encoder.add_temporary(workspace); // Only a successful run registers it.
+  return true;
+}
+
 } // namespace

-fe::error_t DnnGraph::prepare() {
-  RETURN_IF_ERROR(validate());
-  try {
-    RETURN_IF_ERROR(build_operation_graph(handle_));
-  } catch (cudnn_frontend::cudnnException& error) {
-    // cuDNN bug: they did not catch all exceptions in the API.
-    return {fe::error_code_t::CUDNN_BACKEND_API_FAILED, error.what()};
-  }
-  RETURN_IF_ERROR(create_execution_plans({fe::HeurMode_t::A}));
-  return {};
+cudnn_frontend::Tensor build_cudnn_tensor(int64_t id, const array& x) {
+  return build_cudnn_tensor( // Forwards to the shape/strides overload.
+      id, x, convert_vector<int64_t>(x.shape()), normalized_strides(x));
 }

-fe::error_t DnnGraph::build() {
-  RETURN_IF_ERROR(check_support(handle_));
-  RETURN_IF_ERROR(build_plans(handle_));
-  return {};
-}
-
-fe::error_t DnnGraph::encode_graph(
-    cu::CommandEncoder& encoder,
-    std::unordered_map<int64_t, void*> variant_pack) {
-  cudnnSetStream(handle_, encoder.stream());
-  CudaGraph cuda_graph(encoder.device());
-  RETURN_IF_ERROR(populate_cuda_graph(
-      handle_, variant_pack, prepare_workspace(encoder), cuda_graph));
-  encoder.add_graph_node(cuda_graph);
-  return {};
-}
-
-fe::error_t DnnGraph::encode_capturing(
-    cu::CommandEncoder& encoder,
-    std::unordered_map<int64_t, void*> variant_pack) {
-  auto* workspace_ptr = prepare_workspace(encoder);
-  auto capture = encoder.capture_context();
-  cudnnSetStream(handle_, encoder.stream());
-  auto ret = execute(handle_, variant_pack, workspace_ptr);
-  if (ret.is_bad()) {
-    capture.discard = true;
-  }
-  return ret;
-}
-
-void* DnnGraph::prepare_workspace(cu::CommandEncoder& encoder) {
-  int64_t workspace_size = 0;
-  CHECK_CUDNN_FE_ERROR(get_workspace_size(workspace_size));
-  if (workspace_size > 0) {
-    array workspace(
-        cu::malloc_async(workspace_size, encoder),
-        {static_cast<int>(workspace_size)},
-        uint8);
-    encoder.add_temporary(workspace);
-    return gpu_ptr<void>(workspace);
-  }
-  return nullptr;
-}
-
-void DnnGraph::set_tensor_attrs(
-    std::shared_ptr<fe::graph::Tensor_attributes>& tensor,
-    int64_t uid,
-    const array& x,
-    const std::vector<int64_t>& shape,
-    const std::vector<int64_t>& strides) {
-  tensor->set_uid(uid)
-      .set_alignment(get_alignment(x))
-      .set_data_type(dtype_to_cudnn_type(x.dtype()))
-      .set_dim(shape)
-      .set_stride(strides);
-}
-
-void DnnGraph::set_tensor_attrs(
-    std::shared_ptr<fe::graph::Tensor_attributes>& tensor,
-    int64_t uid,
-    const array& x) {
-  set_tensor_attrs(
-      tensor,
-      uid,
-      x,
-      convert_vector<int64_t>(x.shape()),
-      normalized_strides(x));
-}
-
-void DnnGraph::set_tensor_attrs_nchw(
-    std::shared_ptr<fe::graph::Tensor_attributes>& tensor,
-    int64_t uid,
-    const array& x) {
+cudnn_frontend::Tensor build_cudnn_tensor_nchw(int64_t id, const array& x) {
  auto [shape, strides] = nhwc_to_nchw(x);
-  set_tensor_attrs(tensor, uid, x, shape, strides);
+  return build_cudnn_tensor(id, x, shape, strides);
 }

+cudnn_frontend::Tensor build_cudnn_tensor_4d_nchw(int64_t id, const array& x) {
+  if (x.ndim() == 0) { // Scalar: every dim and every stride is 1.
+    SmallVector<int64_t, 4> scalar_dims = {1, 1, 1, 1};
+    return build_cudnn_tensor(id, x, scalar_dims, scalar_dims);
+  }
+  if (x.ndim() == 1) { // Vector: its only axis becomes dim 1 of {1, C, 1, 1}.
+    int64_t s = x.shape(0);
+    SmallVector<int64_t, 4> shape = {1, x.shape(0), 1, 1};
+    SmallVector<int64_t, 4> strides = {s, 1, s, s}; // NOTE(review): unit stride assumed; x.strides(0) unused -- confirm.
+    return build_cudnn_tensor(id, x, shape, strides);
+  }
+  if (x.ndim() == 2) { // Matrix: axes map to dims 0 and 1, the rest are 1.
+    int64_t s =
+        x.flags().row_contiguous ? x.shape(1) * x.strides(1) : x.strides(0);
+    SmallVector<int64_t, 4> shape = {x.shape(0), x.shape(1), 1, 1};
+    SmallVector<int64_t, 4> strides = {s, x.strides(1), s, s};
+    return build_cudnn_tensor(id, x, shape, strides);
+  }
+  if (x.ndim() == 3 || x.ndim() == 4) { // Defer to the NHWC -> NCHW builder.
+    return build_cudnn_tensor_nchw(id, x);
+  }
+  throw std::runtime_error(
+      fmt::format("Unsupported array with {} dims.", x.ndim()));
+}
+
+cudnn_frontend::Tensor build_cudnn_scalar_4d(int64_t id, Dtype dtype) {
+  SmallVector<int64_t, 4> ones = {1, 1, 1, 1}; // Serves as dims and strides.
+  return cudnn_frontend::TensorBuilder()
+      .setDim(ones.size(), ones.data())
+      .setStrides(ones.size(), ones.data())
+      .setId(id)
+      .setAlignment(16)
+      .setDataType(dtype_to_cudnn_type(dtype))
+      .setByValue(true)
+      .build();
+}
+
+std::optional<cudnn_frontend::ExecutionPlan> find_cudnn_plan_from_op_graph(
+    cudnnHandle_t handle,
+    cudnnBackendDescriptorType_t backend_type,
+    Dtype dtype,
+    cudnn_frontend::OperationGraph& op_graph) {
+  auto configs = get_cudnn_engine_configs(backend_type, dtype, op_graph);
+  if (!configs.empty()) { // Only attempt plan building with candidates.
+    return find_cudnn_plan_from_engine_configs(handle, configs, op_graph);
+  }
+  return std::nullopt;
+}
+
+bool encode_cudnn_plan_with_capturing(
+    cu::CommandEncoder& encoder,
+    cudnn_frontend::ExecutionPlan& plan,
+    int num_args,
+    const int64_t* uids,
+    void** data_ptrs) {
+  return prepare_cudnn_plan(
+      encoder,
+      plan,
+      num_args,
+      uids,
+      data_ptrs,
+      [&](auto handle, auto plan, auto args) { // Raw descs; shadows outer |plan|.
+        auto capture = encoder.capture_context(); // NOTE(review): presumably scopes the stream capture -- confirm.
+        if (cudnnBackendExecute(handle, plan, args) != CUDNN_STATUS_SUCCESS) {
+          // Discard the captured graph on failure.
+          capture.discard = true;
+          return false;
+        }
+        return true;
+      });
+}
+
+#if CUDNN_VERSION >= 90500
+bool encode_cudnn_plan_with_graph_api(
+    cu::CommandEncoder& encoder,
+    cudnn_frontend::ExecutionPlan& plan,
+    CudaGraph& graph,
+    int num_args,
+    const int64_t* uids,
+    void** data_ptrs) {
+  return prepare_cudnn_plan(
+      encoder,
+      plan,
+      num_args,
+      uids,
+      data_ptrs,
+      [&](auto handle, auto plan, auto args) { // Raw descs; shadows outer |plan|.
+        if (!graph) { // Empty |graph|: create it, then populate from the plan.
+          graph = CudaGraph(encoder.device());
+          if (cudnnBackendPopulateCudaGraph(handle, plan, args, graph) !=
+              CUDNN_STATUS_SUCCESS) {
+            return false; // NOTE(review): |graph| stays non-empty, so a retry takes the update path -- confirm.
+          }
+        } else { // Existing |graph|: update it in place.
+          if (cudnnBackendUpdateCudaGraph(handle, plan, args, graph) !=
+              CUDNN_STATUS_SUCCESS) {
+            return false;
+          }
+        }
+        encoder.add_graph_node(graph); // Reached only when the call succeeded.
+        return true;
+      });
+}
+#endif
+
 } // namespace mlx::core
--- a/mlx/backend/cuda/cudnn_utils.h
+++ b/mlx/backend/cuda/cudnn_utils.h
@@ -2,34 +2,28 @@

 #pragma once

+#include "mlx/array.h"
 #include "mlx/backend/cuda/device/config.h"
 #include "mlx/backend/cuda/utils.h"
 #include "mlx/dtype_utils.h"

 #include <cudnn_frontend.h>
+#include <cudnn_frontend_find_plan.h>
 #include <fmt/format.h>

+#include <algorithm>
+#include <array>
+
 namespace mlx::core {

 namespace cu {
 class CommandEncoder;
 }

-namespace fe = cudnn_frontend;
-
-#define CHECK_CUDNN_FE_ERROR(cmd)                                    \
-  do {                                                               \
-    auto error = cmd;                                                \
-    if (!error.is_good()) {                                          \
-      throw std::runtime_error(                                      \
-          fmt::format("{} failed: {}.", #cmd, error.get_message())); \
-    }                                                                \
-  } while (0)
-
 // Return pointer alignment of |x|'s data.
 inline uint8_t get_alignment(const array& x) {
  uint8_t alignment = 1;
-  uintptr_t address = reinterpret_cast<uintptr_t>(gpu_ptr<void>(x));
+  uintptr_t address = reinterpret_cast<uintptr_t>(x.data<void>());
  for (; alignment < 32; alignment *= 2) {
    if (address % (alignment * 2)) {
      return alignment;
@@ -40,31 +34,8 @@ inline uint8_t get_alignment(const array& x) {

 // Convert the type of elements in |vec| to |T|.
 template <typename T, typename Vec>
-inline std::vector<T> convert_vector(const Vec& vec) {
-  return std::vector<T>(vec.begin(), vec.end());
-}
-
-// Map dtype to cudnn data type.
-inline fe::DataType_t dtype_to_cudnn_type(Dtype dtype) {
-  switch (dtype) {
-    case int8:
-      return fe::DataType_t::INT8;
-    case int32:
-      return fe::DataType_t::INT32;
-    case uint8:
-      return fe::DataType_t::UINT8;
-    case float16:
-      return fe::DataType_t::HALF;
-    case bfloat16:
-      return fe::DataType_t::BFLOAT16;
-    case float32:
-      return fe::DataType_t::FLOAT;
-    case float64:
-      return fe::DataType_t::DOUBLE;
-    default:
-      throw std::runtime_error(fmt::format(
-          "Unsupported dtype in cuDNN: {}.", dtype_to_string(dtype)));
-  }
+inline SmallVector<T> convert_vector(const Vec& vec) {
+  return SmallVector<T>(vec.begin(), vec.end());
 }

 // Return an array that can be used as map key for |vec| with size <= MAX_NDIM.
@@ -72,100 +43,122 @@ inline fe::DataType_t dtype_to_cudnn_type(Dtype dtype) {
 // There are 2 differences from the const_param util from kernel_utils.cuh:
 // 1. The rest of array is filled with 0.
 // 2. This util can be used in .cpp files.
-template <int NDIM = MAX_NDIM, typename T, template <typename U> class Vec>
-inline std::array<T, NDIM> vector_key(const Vec<T>& vec) {
-  if (vec.size() > NDIM) {
+template <typename T, template <typename U> class Vec>
+inline std::array<T, MAX_NDIM> vector_key(const Vec<T>& vec) {
+  if (vec.size() > MAX_NDIM) {
    throw std::runtime_error(
-        fmt::format("ndim can not be larger than {}.", NDIM));
+        fmt::format("ndim can not be larger than {}.", MAX_NDIM));
  }
-  std::array<T, NDIM> result = {};
+  std::array<T, MAX_NDIM> result = {};
  std::copy_n(vec.begin(), vec.size(), result.begin());
  return result;
 }

-// Extends cuDNN graph with helpers.
-class DnnGraph : public fe::graph::Graph {
- public:
-  DnnGraph(cudnnHandle_t handle, Dtype io_dtype, Dtype compute_dtype = float32)
-      : handle_(handle) {
-    set_io_data_type(dtype_to_cudnn_type(io_dtype));
-    set_intermediate_data_type(dtype_to_cudnn_type(compute_dtype));
-    set_compute_data_type(dtype_to_cudnn_type(compute_dtype));
+// Used by get_data_ptrs: arrays yield their data, scalars their own address.
+inline void* get_data_ptr(const array& arr) {
+  return const_cast<void*>(arr.data<void>()); // Constness is cast away.
+}
+
+template <typename T, typename = std::enable_if_t<std::is_scalar_v<T>>>
+inline void* get_data_ptr(T& scalar) {
+  return &scalar; // Points at the caller's variable, which must outlive use.
+}
+
+// Return the data pointers of |args| as a std::array, in argument order.
+template <typename... Args>
+inline std::array<void*, sizeof...(Args)> get_data_ptrs(Args&... args) {
+  return {get_data_ptr(args)...};
+}
+
+// Map an MLX dtype to the cuDNN data type; throws for unsupported dtypes.
+inline cudnnDataType_t dtype_to_cudnn_type(Dtype dtype) {
+  switch (dtype) {
+    case int8:
+      return CUDNN_DATA_INT8;
+    case int32:
+      return CUDNN_DATA_INT32;
+    case uint8:
+      return CUDNN_DATA_UINT8;
+    case float16:
+      return CUDNN_DATA_HALF;
+    case bfloat16:
+      return CUDNN_DATA_BFLOAT16;
+    case float32:
+      return CUDNN_DATA_FLOAT;
+    case float64:
+      return CUDNN_DATA_DOUBLE;
+    default:
+      throw std::runtime_error(fmt::format(
+          "Unsupported dtype in cuDNN: {}.", dtype_to_string(dtype)));
  }
+}

-  // Create a cuDNN tensor description from MLX array |x|.
-  auto& tensor(
-      std::shared_ptr<fe::graph::Tensor_attributes>& attrs,
-      int64_t uid,
-      const array& x) {
-    set_tensor_attrs(attrs, uid, x);
-    return attrs;
-  }
-  auto tensor(const char* name, int64_t uid, const array& x) {
-    auto attrs = Graph::tensor(fe::graph::Tensor_attributes().set_name(name));
-    tensor(attrs, uid, x);
-    return attrs;
-  }
+// Create a tensor descriptor from |x|.
+cudnn_frontend::Tensor build_cudnn_tensor(int64_t id, const array& x);

-  // Create a cuDNN tensor description from MLX array |x|, and transpose it from
-  // NHWC layout to NCHW.
-  auto& tensor_nchw(
-      std::shared_ptr<fe::graph::Tensor_attributes>& attrs,
-      int64_t uid,
-      const array& x) {
-    set_tensor_attrs_nchw(attrs, uid, x);
-    return attrs;
-  }
-  auto tensor_nchw(const char* name, int64_t uid, const array& x) {
-    auto attrs = Graph::tensor(fe::graph::Tensor_attributes().set_name(name));
-    tensor_nchw(attrs, uid, x);
-    return attrs;
-  }
+// Create a tensor descriptor from |x|, and transpose from NHWC to NCHW.
+cudnn_frontend::Tensor build_cudnn_tensor_nchw(int64_t id, const array& x);

-  // Create a cuDNN tensor for scalar.
-  auto scalar(const char* name, int64_t uid, Dtype dtype) {
-    return Graph::tensor(fe::graph::Tensor_attributes()
-                             .set_name(name)
-                             .set_uid(uid)
-                             .set_dim({1, 1, 1, 1})
-                             .set_stride({1, 1, 1, 1})
-                             .set_is_pass_by_value(true)
-                             .set_data_type(dtype_to_cudnn_type(dtype)));
-  }
+// Create a tensor descriptor from |x|, make sure it is 4D, and transpose it
+// from NHWC to NCHW.
+cudnn_frontend::Tensor build_cudnn_tensor_4d_nchw(int64_t id, const array& x);

-  // Call this before setting notes.
-  fe::error_t prepare();
-  // Call this after setting notes.
-  fe::error_t build();
+// Create a 4D scalar tensor descriptor, which is passed by value.
+cudnn_frontend::Tensor build_cudnn_scalar_4d(int64_t id, Dtype dtype);

-  // Add cuDNN graph to CUDA graph, using native CUDA graph API.
-  fe::error_t encode_graph(
-      cu::CommandEncoder& encoder,
-      std::unordered_map<int64_t, void*> variant_pack);
-  // Add cuDNN graph to CUDA graph, using stream capture.
-  fe::error_t encode_capturing(
-      cu::CommandEncoder& encoder,
-      std::unordered_map<int64_t, void*> variant_pack);
+// Find a working plan for |op_graph|.
+std::optional<cudnn_frontend::ExecutionPlan> find_cudnn_plan_from_op_graph(
+    cudnnHandle_t handle,
+    cudnnBackendDescriptorType_t backend_type,
+    Dtype dtype,
+    cudnn_frontend::OperationGraph& op_graph);

- private:
-  void* prepare_workspace(cu::CommandEncoder& encoder);
+// Encode the plan to command buffer by capturing.
+bool encode_cudnn_plan_with_capturing(
+    cu::CommandEncoder& encoder,
+    cudnn_frontend::ExecutionPlan& plan,
+    int num_args,
+    const int64_t* uids,
+    void** data_ptrs);

-  void set_tensor_attrs(
-      std::shared_ptr<fe::graph::Tensor_attributes>& tensor,
-      int64_t uid,
-      const array& x,
-      const std::vector<int64_t>& shape,
-      const std::vector<int64_t>& strides);
-  void set_tensor_attrs(
-      std::shared_ptr<fe::graph::Tensor_attributes>& tensor,
-      int64_t uid,
-      const array& x);
-  void set_tensor_attrs_nchw(
-      std::shared_ptr<fe::graph::Tensor_attributes>& tensor,
-      int64_t uid,
-      const array& x);
+#if CUDNN_VERSION >= 90500
+// Encode the plan to command buffer by using native graph api of cudnn. If the
+// |graph| is empty it will be populated, otherwise it will be updated.
+bool encode_cudnn_plan_with_graph_api(
+    cu::CommandEncoder& encoder,
+    cudnn_frontend::ExecutionPlan& plan,
+    CudaGraph& graph,
+    int num_args,
+    const int64_t* uids,
+    void** data_ptrs);
+#endif

-  cudnnHandle_t handle_;
-};
+// Variadic helper for encode_cudnn_plan(..., {'x', 'y', 'z'}, x, y, z) calls.
+template <typename... Args>
+bool encode_cudnn_plan(
+    cu::CommandEncoder& encoder,
+    cudnn_frontend::ExecutionPlan& plan,
+    std::initializer_list<int64_t> uids,
+    Args&... args) {
+  assert(uids.size() == sizeof...(args)); // Exactly one uid per argument.
+  auto data_ptrs = get_data_ptrs(args...); // Same order as |uids|.
+  return encode_cudnn_plan_with_capturing(
+      encoder, plan, uids.size(), uids.begin(), data_ptrs.data());
+}
+
+#if CUDNN_VERSION >= 90500
+template <typename... Args>
+bool encode_cudnn_plan(
+    cu::CommandEncoder& encoder,
+    cudnn_frontend::ExecutionPlan& plan,
+    CudaGraph& graph, // Selects the graph-API overload instead of capturing.
+    std::initializer_list<int64_t> uids,
+    Args&... args) {
+  assert(uids.size() == sizeof...(args)); // Exactly one uid per argument.
+  auto data_ptrs = get_data_ptrs(args...); // Same order as |uids|.
+  return encode_cudnn_plan_with_graph_api(
+      encoder, plan, graph, uids.size(), uids.begin(), data_ptrs.data());
+}
+#endif

 } // namespace mlx::core
--- a/mlx/backend/cuda/custom_kernel.cpp
+++ b/mlx/backend/cuda/custom_kernel.cpp
@@ -57,7 +57,7 @@ std::string build_kernel(
    const std::vector<std::string>& output_names,
    const std::vector<Dtype>& output_dtypes,
    const std::vector<std::pair<std::string, TemplateArg>>& template_args,
-    const std::vector<std::tuple<bool, bool, bool>>& shape_infos) {
+    const std::vector<CustomKernelShapeInfo>& shape_infos) {
  std::string kernel_source;
  kernel_source.reserve(header.size() + source.size() + 8192);
  kernel_source += default_header;
@@ -81,17 +81,17 @@ std::string build_kernel(
    kernel_source += ",\n";
    // Add input shape, strides and ndim if present in the source
    if (arr.ndim() > 0) {
-      if (std::get<0>(shape_infos[i])) {
+      if (shape_infos[i].shape) {
        kernel_source += "    const __grid_constant__ Shape ";
        kernel_source += name;
        kernel_source += "_shape,\n";
      }
-      if (std::get<1>(shape_infos[i])) {
+      if (shape_infos[i].strides) {
        kernel_source += "    const __grid_constant__ Strides ";
        kernel_source += name;
        kernel_source += "_strides,\n";
      }
-      if (std::get<2>(shape_infos[i])) {
+      if (shape_infos[i].ndim) {
        kernel_source += "    const __grid_constant__ int ";
        kernel_source += name;
        kernel_source += "_ndim,\n";
@@ -154,12 +154,12 @@ CustomKernelFunction cuda_kernel(
        "[custom_kernel] Must specify at least one output.");
  }

-  std::vector<std::tuple<bool, bool, bool>> shape_infos;
+  std::vector<CustomKernelShapeInfo> shape_infos;
  for (auto& n : input_names) {
-    std::tuple<bool, bool, bool> shape_info;
-    std::get<0>(shape_info) = source.find(n + "_shape") != std::string::npos;
-    std::get<1>(shape_info) = source.find(n + "_strides") != std::string::npos;
-    std::get<2>(shape_info) = source.find(n + "_ndim") != std::string::npos;
+    CustomKernelShapeInfo shape_info;
+    shape_info.shape = source.find(n + "_shape") != std::string::npos;
+    shape_info.strides = source.find(n + "_strides") != std::string::npos;
+    shape_info.ndim = source.find(n + "_ndim") != std::string::npos;
    shape_infos.push_back(shape_info);
  }

@@ -254,8 +254,8 @@ std::vector<array> precompiled_cuda_kernel(
    std::optional<float> init_value,
    bool ensure_row_contiguous,
    StreamOrDevice s) {
-  std::vector<std::tuple<bool, bool, bool>> shape_infos(
-      inputs.size(), {false, false, false});
+  std::vector<CustomKernelShapeInfo> shape_infos(
+      inputs.size(), CustomKernelShapeInfo{false, false, false});
  return array::make_arrays(
      output_shapes,
      output_dtypes,
@@ -279,7 +279,6 @@ void CustomKernel::eval_gpu(
    std::vector<array>& outputs) {
  nvtx3::scoped_range r("CustomKernel::eval_gpu");
  auto& s = stream();
-  auto& encoder = cu::get_command_encoder(s);

  std::vector<array> copies;

@@ -289,7 +288,7 @@ void CustomKernel::eval_gpu(
      copies.emplace_back(init_value_.value(), out.dtype());
      fill_gpu(copies.back(), out, s);
    } else {
-      out.set_data(cu::malloc_async(out.nbytes(), encoder));
+      out.set_data(allocator::malloc(out.nbytes()));
    }
  }

@@ -327,13 +326,13 @@ void CustomKernel::eval_gpu(
    const array& in = checked_inputs[i];
    auto& shape_info = shape_infos_[i];
    args.append(in);
-    if (std::get<0>(shape_info)) {
+    if (shape_info.shape) {
      args.append_ndim(in.shape());
    }
-    if (std::get<1>(shape_info)) {
+    if (shape_info.strides) {
      args.append_ndim(in.strides());
    }
-    if (std::get<2>(shape_info)) {
+    if (shape_info.ndim) {
      args.append<int32_t>(in.ndim());
    }
  }
@@ -357,6 +356,7 @@ void CustomKernel::eval_gpu(
  dim3 grid((gx + tx - 1) / tx, (gy + ty - 1) / ty, (gz + tz - 1) / tz);

  // Call the kernel
+  auto& encoder = cu::get_command_encoder(s);
  for (const auto& in : checked_inputs) {
    encoder.set_input_array(in);
  }
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Ronan Collobert	24828b1b2f	CMakeLists.txt update	2025-10-31 16:55:04 -07:00
Ronan Collobert	9f649b5658	WIP (python)	2025-10-31 16:24:51 -07:00
Ronan Collobert	18aa921388	WIP	2025-10-31 16:24:35 -07:00
Ronan Collobert	8d13a0bc6b	WIP (metal)	2025-10-31 16:24:21 -07:00
Ronan Collobert	ac75c87fd7	WIP (cpu)	2025-10-31 16:24:09 -07:00
Ronan Collobert	7107802e09	WIP (examples)	2025-10-31 16:23:51 -07:00
Ronan Collobert	c5913131cf	WIP (distributed)	2025-10-31 13:32:56 -07:00
Ronan Collobert	19ab7911f6	WIP (cuda)	2025-10-31 13:32:43 -07:00
Ronan Collobert	4a1b1796b7	WIP (io)	2025-10-31 13:20:47 -07:00
Ronan Collobert	b48d298205	WIP (distributed)	2025-10-31 13:20:09 -07:00
Ronan Collobert	8277e71ea9	WIP (gpu)	2025-10-31 13:19:54 -07:00
Ronan Collobert	b0d985416a	fix arg_reduce	2025-10-31 13:13:15 -07:00
Ronan Collobert	8d10f3ec75	WIP (metal)	2025-10-31 11:47:03 -07:00
Ronan Collobert	6343622c67	fix small vector indexing checks	2025-10-31 11:46:36 -07:00
Ronan Collobert	979abf462b	WIP (metal)	2025-10-31 09:43:29 -07:00
Ronan Collobert	981d2fdaf0	WIP (cpu)	2025-10-31 09:40:50 -07:00
Ronan Collobert	5a306d3495	WIP (common)	2025-10-31 09:40:13 -07:00
Ronan Collobert	5baa361779	WIP (tests)	2025-10-31 09:39:38 -07:00
Ronan Collobert	1bac0db7e3	WIP	2025-10-30 16:25:36 -07:00
Ronan Collobert	a1212b4e44	WIP (distributed)	2025-10-30 16:25:11 -07:00
Ronan Collobert	45a8b226af	WIP (cpu)	2025-10-30 16:24:51 -07:00
Ronan Collobert	76ef1e98f3	WIP (common)	2025-10-30 16:18:59 -07:00
Ronan Collobert	63d91557e0	fix FFT (PocketFFT requires size_t for axis)	2025-10-29 17:05:48 -07:00
Ronan Collobert	310e501e6a	WIP (cpu)	2025-10-29 16:52:25 -07:00
Ronan Collobert	cacc3ab7fd	WIP (common)	2025-10-29 16:51:42 -07:00
Ronan Collobert	53525cba23	WIP	2025-10-29 16:51:05 -07:00
Ronan Collobert	3d67b717a0	the cpu simd case	2025-10-29 16:43:18 -07:00
Ronan Collobert	953b2f5be2	WIP	2025-10-29 16:11:32 -07:00
Ronan Collobert	26f7155537	SmallVector: keep sizes small (int)	2025-10-29 16:06:10 -07:00
Ronan Collobert	66fcb9fe94	array: use int or int64_t instead of size_t	2025-10-29 16:04:04 -07:00