Add stricter condition to matrix sdpa

Fix cudnn routing
Update routing
2025-09-06 00:20:45 +08:00 · 2025-08-06 19:51:14 -07:00 · 2025-08-06 15:05:58 -07:00 · 2025-08-06 15:01:15 -07:00 · 2025-08-06 13:57:40 -07:00 · 2025-08-06 09:56:39 -07:00
466 changed files with 39523 additions and 11098 deletions
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -7,15 +7,9 @@ parameters:
  nightly_build:
    type: boolean
    default: false
-  weekly_build:
-    type: boolean
-    default: false
  test_release:
    type: boolean
    default: false
-  linux_release:
-    type: boolean
-    default: false

 jobs:
  build_documentation:
@@ -24,8 +18,8 @@ jobs:
        type: boolean
        default: false
    macos:
-      xcode: "15.2.0"
-    resource_class: macos.m1.medium.gen1
+      xcode: "16.2.0"
+    resource_class: m2pro.medium
    steps:
      - checkout
      - run:
@@ -38,7 +32,7 @@ jobs:
            pip install --upgrade pip
            pip install --upgrade cmake
            pip install -r docs/requirements.txt
-            CMAKE_BUILD_PARALLEL_LEVEL=`sysctl -n hw.ncpu` pip install . -v
+            pip install . -v
      - when:
          condition:
            not: << parameters.upload-docs >>
@@ -70,9 +64,9 @@ jobs:
                 git push -f origin gh-pages

  linux_build_and_test:
-    docker:
-      - image: cimg/python:3.9
-
+    machine:
+      image: ubuntu-2204:current
+      resource_class: large
    steps:
      - checkout
      - run:
@@ -84,34 +78,36 @@ jobs:
      - run:
          name: Install dependencies
          command: |
-            pip install --upgrade cmake
-            pip install nanobind==2.4.0
-            pip install numpy
+            export DEBIAN_FRONTEND=noninteractive
+            export NEEDRESTART_MODE=a
            sudo apt-get update
-            sudo apt-get install libblas-dev liblapack-dev liblapacke-dev
+            sudo apt-get install -y libblas-dev liblapack-dev liblapacke-dev
+            sudo apt-get install -y openmpi-bin openmpi-common libopenmpi-dev
+            curl -LsSf https://astral.sh/uv/install.sh | sh
      - run:
          name: Install Python package
          command: |
-            CMAKE_ARGS="-DMLX_BUILD_METAL=OFF" \
-              CMAKE_BUILD_PARALLEL_LEVEL=`nproc` \
-              python3 setup.py build_ext --inplace
-            CMAKE_ARGS="-DMLX_BUILD_METAL=OFF" \
-              CMAKE_BUILD_PARALLEL_LEVEL=`nproc` \
-              python3 setup.py develop
+            uv venv
+            uv pip install cmake
+            uv pip install -e ".[dev]" -v
      - run:
          name: Generate package stubs
          command: |
-            echo "stubs"
-            pip install typing_extensions
-            python setup.py generate_stubs 
+            uv pip install typing_extensions
+            uv run --no-project setup.py generate_stubs
      - run:
          name: Run Python tests
          command: |
-            python3 -m unittest discover python/tests -v
+            source .venv/bin/activate
+            python -m unittest discover python/tests -v
+            mpirun --bind-to none -host localhost:8 -np 8 python python/tests/mpi_test_distributed.py
+            mlx.launch --verbose -n 8 python/tests/ring_test_distributed.py -v 2> >(tee -a stderr.log >&2)
+            if grep -q "\[WARN\]" stderr.log; then echo "Distributed ring test failed"; exit 1; fi
      - run:
          name: Build CPP only
          command: |
-            mkdir -p build && cd build 
+            source .venv/bin/activate
+            mkdir -p build && cd build
            cmake .. -DMLX_BUILD_METAL=OFF -DCMAKE_BUILD_TYPE=DEBUG
            make -j `nproc`
      - run:
@@ -122,58 +118,63 @@ jobs:
    parameters:
      xcode_version:
        type: string
-        default: "15.2.0"
+        default: "16.2.0"
+      macosx_deployment_target:
+        type: string
+        default: ""
    macos:
      xcode: << parameters.xcode_version >>
-    resource_class: macos.m1.medium.gen1
+    environment:
+      MACOSX_DEPLOYMENT_TARGET: << parameters.macosx_deployment_target >>
+    resource_class: m2pro.medium
    steps:
      - checkout
      - run:
          name: Install dependencies
          command: |
-            brew install python@3.9
-            brew install openmpi
-            python3.9 -m venv env
-            source env/bin/activate
-            pip install --upgrade pip
-            pip install --upgrade cmake
-            pip install nanobind==2.4.0
-            pip install numpy
-            pip install torch
-            pip install tensorflow
-            pip install unittest-xml-reporting
+            HOMEBREW_NO_AUTO_UPDATE=1 HOMEBREW_NO_INSTALL_CLEANUP=1 \
+              brew install openmpi uv
      - run:
          name: Install Python package
          command: |
-            source env/bin/activate
-            DEBUG=1 CMAKE_BUILD_PARALLEL_LEVEL=`sysctl -n hw.ncpu` pip install -e . -v
+            uv venv --python 3.9
+            uv pip install \
+              nanobind==2.4.0 \
+              cmake \
+              numpy \
+              torch \
+              tensorflow \
+              unittest-xml-reporting
+            DEBUG=1 CMAKE_ARGS="-DCMAKE_COMPILE_WARNING_AS_ERROR=ON" \
+              uv pip install -e . -v
      - run:
          name: Generate package stubs
          command: |
-            source env/bin/activate
-            pip install typing_extensions
-            python setup.py generate_stubs 
+            uv pip install typing_extensions
+            uv run --no-project setup.py generate_stubs
      - run:
          name: Run Python tests
          command: |
-            source env/bin/activate
+            source .venv/bin/activate
            LOW_MEMORY=1 DEVICE=cpu python -m xmlrunner discover -v python/tests -o test-results/cpu
            LOW_MEMORY=1 DEVICE=gpu METAL_DEVICE_WRAPPER_TYPE=1 METAL_DEBUG_ERROR_MODE=0 python -m xmlrunner discover -v python/tests -o test-results/gpu
            mpirun --bind-to none -host localhost:8 -np 8 -x DYLD_LIBRARY_PATH=/opt/homebrew/lib/ python python/tests/mpi_test_distributed.py
-            mlx.launch --verbose -n 8 python/tests/ring_test_distributed.py
+            mlx.launch --verbose -n 8 python/tests/ring_test_distributed.py -v 2> >(tee -a stderr.log >&2)
+            if grep -q "\[WARN\]" stderr.log; then echo "Distributed ring test failed"; exit 1; fi
      - run:
          name: Build example extension
          command: |
-            source env/bin/activate
+            source .venv/bin/activate
            cd examples/extensions
-            pip install -r requirements.txt
-            python setup.py build_ext -j8
+            uv pip install -r requirements.txt
+            uv run --no-project setup.py build_ext --inplace
+            uv run --no-project python test.py
      - store_test_results:
          path: test-results
      - run:
          name: Build CPP only
          command: |
-            source env/bin/activate
+            source .venv/bin/activate
            mkdir -p build && cd build && cmake .. && make -j `sysctl -n hw.ncpu`
      - run:
          name: Run CPP tests
@@ -182,7 +183,7 @@ jobs:
      - run:
          name: Build small binary
          command: |
-            source env/bin/activate
+            source .venv/bin/activate
            cd build/
            cmake .. -DCMAKE_BUILD_TYPE=MinSizeRel \
              -DBUILD_SHARED_LIBS=ON \
@@ -194,13 +195,60 @@ jobs:
      - run:
          name: Run Python tests with JIT
          command: |
-            source env/bin/activate
-            CMAKE_BUILD_PARALLEL_LEVEL=`sysctl -n hw.ncpu` \
-              CMAKE_ARGS="-DMLX_METAL_JIT=ON" \
-              pip install -e . -v
+            CMAKE_ARGS="-DMLX_METAL_JIT=ON" \
+              uv pip install -e .
            LOW_MEMORY=1 DEVICE=gpu METAL_DEVICE_WRAPPER_TYPE=1 \
              METAL_DEBUG_ERROR_MODE=0 \
-              python -m xmlrunner discover -v python/tests -o test-results/gpu_jit
+              uv run --no-project python -m xmlrunner discover \
+                -v python/tests \
+                -o test-results/gpu_jit
+
+  cuda_build_and_test:
+    parameters:
+      image_date:
+        type: string
+        default: "2023.11.1"
+    machine:
+      image: "linux-cuda-12:<< parameters.image_date >>"
+      resource_class: gpu.nvidia.small.gen2
+    steps:
+      - checkout
+      - restore_cache:
+          keys:
+            - cuda-<< parameters.image_date >>-{{ arch }}-
+      - run:
+          name: Install dependencies
+          command: |
+            sudo apt-get update
+            sudo apt-get install -y libcudnn9-dev-cuda-12
+            sudo apt-get install -y libblas-dev liblapack-dev liblapacke-dev
+            curl -sL https://github.com/ccache/ccache/releases/download/v4.11.3/ccache-4.11.3-linux-x86_64.tar.xz | tar xJf -
+            sudo mv ccache-4.11.3-linux-x86_64/ccache /usr/bin/ccache
+            rm -rf ccache-4.11.3-linux-x86_64
+            curl -LsSf https://astral.sh/uv/install.sh | sh
+      - run:
+          name: Install Python package
+          command: |
+            uv venv
+            CMAKE_ARGS="-DMLX_BUILD_CUDA=ON -DCMAKE_CUDA_COMPILER=`which nvcc`" \
+              uv pip install -e ".[dev]" -v
+      - run:
+          name: Run Python tests
+          command: |
+            source .venv/bin/activate
+            LOW_MEMORY=1 DEVICE=cpu python -m unittest discover python/tests -v
+            LOW_MEMORY=1 DEVICE=gpu python -m tests discover python/tests -v
+      - run:
+          name: CCache report
+          command: |
+            ccache --show-stats
+            ccache --zero-stats
+            ccache --max-size 400MB
+            ccache --cleanup
+      - save_cache:
+          key: cuda-<< parameters.image_date >>-{{ arch }}-{{ epoch }}
+          paths:
+            - /home/circleci/.cache/ccache

  build_release:
    parameters:
@@ -209,13 +257,18 @@ jobs:
        default: "3.9"
      xcode_version:
        type: string
-        default: "15.2.0"
+        default: "16.2.0"
      build_env:
        type: string
        default: ""
+      macosx_deployment_target:
+        type: string
+        default: ""
    macos:
      xcode: << parameters.xcode_version >>
-    resource_class: macos.m1.medium.gen1
+    resource_class: m2pro.medium
+    environment:
+      MACOSX_DEPLOYMENT_TARGET: << parameters.macosx_deployment_target >>
    steps:
      - checkout
      - run:
@@ -236,22 +289,30 @@ jobs:
          name: Install Python package
          command: |
            source env/bin/activate
-            DEV_RELEASE=1 \
-              CMAKE_BUILD_PARALLEL_LEVEL=`sysctl -n hw.ncpu` \
+            env -u MACOSX_DEPLOYMENT_TARGET DEV_RELEASE=1 \
              pip install . -v
      - run:
          name: Generate package stubs
          command: |
            source env/bin/activate
            pip install typing_extensions
-            python setup.py generate_stubs 
+            python setup.py generate_stubs
      - run:
          name: Build Python package
          command: |
            source env/bin/activate
-            << parameters.build_env >> \
-              CMAKE_BUILD_PARALLEL_LEVEL=`sysctl -n hw.ncpu` \
-              python -m build -w
+            python setup.py clean --all
+            << parameters.build_env >> MLX_BUILD_STAGE=1 python -m build -w
+      - when:
+          condition:
+            equal: ["3.9", << parameters.python_version >>]
+          steps:
+            - run:
+                name: Build common package
+                command: |
+                  source env/bin/activate
+                  python setup.py clean --all
+                  << parameters.build_env >> MLX_BUILD_STAGE=2 python -m build -w
      - when:
          condition: << parameters.build_env >>
          steps:
@@ -268,52 +329,100 @@ jobs:
      python_version:
        type: string
        default: "3.9"
-      extra_env:
+      build_env:
        type: string
-        default: "DEV_RELEASE=1"
-    docker:
-      - image: ubuntu:20.04
+        default: ""
+    machine:
+      image: ubuntu-2204:current
+      resource_class: large
    steps:
      - checkout
      - run:
          name: Build wheel
          command: |
            PYTHON=python<< parameters.python_version >>
-            apt-get update
-            apt-get upgrade -y
-            DEBIAN_FRONTEND=noninteractive TZ=Etc/UTC apt-get -y install tzdata
-            apt-get install -y apt-utils
-            apt-get install -y software-properties-common
-            add-apt-repository -y ppa:deadsnakes/ppa
-            apt-get install -y $PYTHON $PYTHON-dev $PYTHON-full
-            apt-get install -y libblas-dev liblapack-dev liblapacke-dev
-            apt-get install -y build-essential git
+            export DEBIAN_FRONTEND=noninteractive
+            export NEEDRESTART_MODE=a
+            sudo apt-get update
+            TZ=Etc/UTC sudo apt-get -y install tzdata
+            sudo add-apt-repository -y ppa:deadsnakes/ppa
+            sudo apt-get install -y $PYTHON $PYTHON-dev $PYTHON-full
+            sudo apt-get install -y libblas-dev liblapack-dev liblapacke-dev
            $PYTHON -m venv env
            source env/bin/activate
            pip install --upgrade pip
            pip install --upgrade cmake
-            pip install nanobind==2.4.0
-            pip install --upgrade setuptools
-            pip install numpy
            pip install auditwheel
            pip install patchelf
            pip install build
            pip install twine
-            << parameters.extra_env >> \
-              CMAKE_BUILD_PARALLEL_LEVEL=`nproc` \
-              pip install . -v
+            << parameters.build_env >> pip install ".[dev]" -v
            pip install typing_extensions
-            python setup.py generate_stubs 
-            << parameters.extra_env >> \
-              CMAKE_BUILD_PARALLEL_LEVEL=`nproc` \
-              python -m build --wheel
-            auditwheel show dist/*
-            auditwheel repair dist/* --plat manylinux_2_31_x86_64
+            python setup.py generate_stubs
+            python setup.py clean --all
+            MLX_BUILD_STAGE=1 << parameters.build_env >> python -m build -w
+            bash python/scripts/repair_linux.sh
+      - when:
+          condition:
+            equal: ["3.9", << parameters.python_version >>]
+          steps:
+            - run:
+                name: Build common package
+                command: |
+                  source env/bin/activate
+                  python setup.py clean --all
+                  << parameters.build_env >> MLX_BUILD_STAGE=2 \
+                    python -m build -w
+                  auditwheel repair dist/mlx_cpu*.whl --plat manylinux_2_35_x86_64
+      - when:
+          condition: << parameters.build_env >>
+          steps:
+            - run:
+                name: Upload packages
+                command: |
+                  source env/bin/activate
+                  twine upload wheelhouse/*.whl
+      - store_artifacts:
+          path: wheelhouse/
+
+  build_cuda_release:
+    parameters:
+      build_env:
+        type: string
+        default: ""
+    machine:
+      image: ubuntu-2204:current
+      resource_class: large
+    steps:
+      - checkout
      - run:
-          name: Upload package
+          name: Build wheel
          command: |
-            source env/bin/activate
-            twine upload wheelhouse/*
+            export DEBIAN_FRONTEND=noninteractive
+            export NEEDRESTART_MODE=a
+            wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/x86_64/cuda-keyring_1.1-1_all.deb
+            sudo dpkg -i cuda-keyring_1.1-1_all.deb
+            sudo apt-get update
+            sudo apt-get install -y cuda-toolkit-12-9 libcudnn9-dev-cuda-12
+            sudo apt-get install -y libblas-dev liblapack-dev liblapacke-dev
+            sudo apt-get install -y zip
+            pip install auditwheel
+            pip install patchelf
+            pip install build
+            pip install twine
+            export PATH=/usr/local/cuda/bin${PATH:+:${PATH}}
+            export LD_LIBRARY_PATH=/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
+            << parameters.build_env >> MLX_BUILD_STAGE=2 \
+              CMAKE_ARGS="-DMLX_BUILD_CUDA=ON -DCMAKE_CUDA_COMPILER=`which nvcc`" \
+              python -m build -w
+            bash python/scripts/repair_cuda.sh
+      - when:
+          condition: << parameters.build_env >>
+          steps:
+            - run:
+                name: Upload package
+                command: |
+                  twine upload wheelhouse/*.whl
      - store_artifacts:
          path: wheelhouse/

@@ -325,21 +434,23 @@ workflows:
            pattern: "^(?!pull/)[-\\w]+$"
            value: << pipeline.git.branch >>
        - not: << pipeline.parameters.nightly_build >>
-        - not: << pipeline.parameters.weekly_build >>
        - not: << pipeline.parameters.test_release >>
    jobs:
      - mac_build_and_test:
          matrix:
            parameters:
-              xcode_version: ["15.0.0", "15.2.0", "16.0.0"]
+              macosx_deployment_target: ["13.5", "14.0"]
      - linux_build_and_test
+      - cuda_build_and_test:
+          matrix:
+            parameters:
+              image_date: ["2023.11.1", "2025.05.1"]
      - build_documentation 

  build_pypi_release:
    when:
      and:
        - not: << pipeline.parameters.nightly_build >>
-        - not: << pipeline.parameters.weekly_build >>
        - not: << pipeline.parameters.test_release >>
    jobs:
      - build_release:
@@ -351,8 +462,70 @@ workflows:
          matrix:
            parameters:
              python_version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
-              xcode_version: ["15.0.0", "15.2.0"]
+              macosx_deployment_target: ["13.5", "14.0", "15.0"]
              build_env: ["PYPI_RELEASE=1"]
+              xcode_version: ["16.2.0", "15.0.0"]
+            exclude:
+              - macosx_deployment_target: "13.5"
+                xcode_version: "16.2.0"
+                python_version: "3.9"
+                build_env: "PYPI_RELEASE=1"
+              - macosx_deployment_target: "13.5"
+                xcode_version: "16.2.0"
+                python_version: "3.10"
+                build_env: "PYPI_RELEASE=1"
+              - macosx_deployment_target: "13.5"
+                xcode_version: "16.2.0"
+                python_version: "3.11"
+                build_env: "PYPI_RELEASE=1"
+              - macosx_deployment_target: "13.5"
+                xcode_version: "16.2.0"
+                python_version: "3.12"
+                build_env: "PYPI_RELEASE=1"
+              - macosx_deployment_target: "13.5"
+                xcode_version: "16.2.0"
+                python_version: "3.13"
+                build_env: "PYPI_RELEASE=1"
+              - macosx_deployment_target: "14.0"
+                xcode_version: "15.0.0"
+                python_version: "3.9"
+                build_env: "PYPI_RELEASE=1"
+              - macosx_deployment_target: "14.0"
+                xcode_version: "15.0.0"
+                python_version: "3.10"
+                build_env: "PYPI_RELEASE=1"
+              - macosx_deployment_target: "14.0"
+                xcode_version: "15.0.0"
+                python_version: "3.11"
+                build_env: "PYPI_RELEASE=1"
+              - macosx_deployment_target: "14.0"
+                xcode_version: "15.0.0"
+                python_version: "3.12"
+                build_env: "PYPI_RELEASE=1"
+              - macosx_deployment_target: "14.0"
+                xcode_version: "15.0.0"
+                python_version: "3.13"
+                build_env: "PYPI_RELEASE=1"
+              - macosx_deployment_target: "15.0"
+                xcode_version: "15.0.0"
+                python_version: "3.9"
+                build_env: "PYPI_RELEASE=1"
+              - macosx_deployment_target: "15.0"
+                xcode_version: "15.0.0"
+                python_version: "3.10"
+                build_env: "PYPI_RELEASE=1"
+              - macosx_deployment_target: "15.0"
+                xcode_version: "15.0.0"
+                python_version: "3.11"
+                build_env: "PYPI_RELEASE=1"
+              - macosx_deployment_target: "15.0"
+                xcode_version: "15.0.0"
+                python_version: "3.12"
+                build_env: "PYPI_RELEASE=1"
+              - macosx_deployment_target: "15.0"
+                xcode_version: "15.0.0"
+                python_version: "3.13"
+                build_env: "PYPI_RELEASE=1"
      - build_documentation:
          filters:
            tags:
@@ -360,6 +533,25 @@ workflows:
            branches:
              ignore: /.*/
          upload-docs: true
+      - build_linux_release:
+          filters:
+            tags:
+              only: /^v.*/
+            branches:
+              ignore: /.*/
+          matrix:
+            parameters:
+              python_version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
+              build_env: ["PYPI_RELEASE=1"]
+      - build_cuda_release:
+          filters:
+            tags:
+              only: /^v.*/
+            branches:
+              ignore: /.*/
+          matrix:
+            parameters:
+              build_env: ["PYPI_RELEASE=1"]

  prb:
    when:
@@ -375,9 +567,14 @@ workflows:
          requires: [ hold ]
          matrix:
            parameters:
-              xcode_version: ["15.0.0", "15.2.0", "16.0.0"]
+              macosx_deployment_target: ["13.5", "14.0"]
      - linux_build_and_test:
          requires: [ hold ]
+      - cuda_build_and_test:
+          requires: [ hold ]
+          matrix:
+            parameters:
+              image_date: ["2023.11.1", "2025.05.1"]
  nightly_build:
    when:
      and:
@@ -388,27 +585,140 @@ workflows:
          matrix:
            parameters:
              python_version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
-              xcode_version: ["15.0.0", "15.2.0"]
-  weekly_build:
+              macosx_deployment_target: ["13.5", "14.0", "15.0"]
+              xcode_version: ["16.2.0", "15.0.0"]
+            exclude:
+              - macosx_deployment_target: "13.5"
+                xcode_version: "16.2.0"
+                python_version: "3.9"
+              - macosx_deployment_target: "13.5"
+                xcode_version: "16.2.0"
+                python_version: "3.10"
+              - macosx_deployment_target: "13.5"
+                xcode_version: "16.2.0"
+                python_version: "3.11"
+              - macosx_deployment_target: "13.5"
+                xcode_version: "16.2.0"
+                python_version: "3.12"
+              - macosx_deployment_target: "13.5"
+                xcode_version: "16.2.0"
+                python_version: "3.13"
+              - macosx_deployment_target: "14.0"
+                xcode_version: "15.0.0"
+                python_version: "3.9"
+              - macosx_deployment_target: "14.0"
+                xcode_version: "15.0.0"
+                python_version: "3.10"
+              - macosx_deployment_target: "14.0"
+                xcode_version: "15.0.0"
+                python_version: "3.11"
+              - macosx_deployment_target: "14.0"
+                xcode_version: "15.0.0"
+                python_version: "3.12"
+              - macosx_deployment_target: "14.0"
+                xcode_version: "15.0.0"
+                python_version: "3.13"
+              - macosx_deployment_target: "15.0"
+                xcode_version: "15.0.0"
+                python_version: "3.9"
+              - macosx_deployment_target: "15.0"
+                xcode_version: "15.0.0"
+                python_version: "3.10"
+              - macosx_deployment_target: "15.0"
+                xcode_version: "15.0.0"
+                python_version: "3.11"
+              - macosx_deployment_target: "15.0"
+                xcode_version: "15.0.0"
+                python_version: "3.12"
+              - macosx_deployment_target: "15.0"
+                xcode_version: "15.0.0"
+                python_version: "3.13"
+      - build_linux_release:
+          matrix:
+            parameters:
+              python_version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
+      - build_cuda_release
+
+  build_dev_release:
    when:
      and:
        - equal: [ main, << pipeline.git.branch >> ]
-        - << pipeline.parameters.weekly_build >>
+        - << pipeline.parameters.test_release >>
    jobs:
      - build_release:
          matrix:
            parameters:
              python_version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
-              xcode_version: ["15.0.0", "15.2.0", "16.0.0"]
+              macosx_deployment_target: ["13.5", "14.0", "15.0"]
              build_env: ["DEV_RELEASE=1"]
-  linux_test_release:
-    when:
-      and:
-        - equal: [ main, << pipeline.git.branch >> ]
-        - << pipeline.parameters.linux_release >>
-    jobs:
+              xcode_version: ["16.2.0", "15.0.0"]
+            exclude:
+              - macosx_deployment_target: "13.5"
+                xcode_version: "16.2.0"
+                python_version: "3.9"
+                build_env: "DEV_RELEASE=1"
+              - macosx_deployment_target: "13.5"
+                xcode_version: "16.2.0"
+                python_version: "3.10"
+                build_env: "DEV_RELEASE=1"
+              - macosx_deployment_target: "13.5"
+                xcode_version: "16.2.0"
+                python_version: "3.11"
+                build_env: "DEV_RELEASE=1"
+              - macosx_deployment_target: "13.5"
+                xcode_version: "16.2.0"
+                python_version: "3.12"
+                build_env: "DEV_RELEASE=1"
+              - macosx_deployment_target: "13.5"
+                xcode_version: "16.2.0"
+                python_version: "3.13"
+                build_env: "DEV_RELEASE=1"
+              - macosx_deployment_target: "14.0"
+                xcode_version: "15.0.0"
+                python_version: "3.9"
+                build_env: "DEV_RELEASE=1"
+              - macosx_deployment_target: "14.0"
+                xcode_version: "15.0.0"
+                python_version: "3.10"
+                build_env: "DEV_RELEASE=1"
+              - macosx_deployment_target: "14.0"
+                xcode_version: "15.0.0"
+                python_version: "3.11"
+                build_env: "DEV_RELEASE=1"
+              - macosx_deployment_target: "14.0"
+                xcode_version: "15.0.0"
+                python_version: "3.12"
+                build_env: "DEV_RELEASE=1"
+              - macosx_deployment_target: "14.0"
+                xcode_version: "15.0.0"
+                python_version: "3.13"
+                build_env: "DEV_RELEASE=1"
+              - macosx_deployment_target: "15.0"
+                xcode_version: "15.0.0"
+                python_version: "3.9"
+                build_env: "DEV_RELEASE=1"
+              - macosx_deployment_target: "15.0"
+                xcode_version: "15.0.0"
+                python_version: "3.10"
+                build_env: "DEV_RELEASE=1"
+              - macosx_deployment_target: "15.0"
+                xcode_version: "15.0.0"
+                python_version: "3.11"
+                build_env: "DEV_RELEASE=1"
+              - macosx_deployment_target: "15.0"
+                xcode_version: "15.0.0"
+                python_version: "3.12"
+                build_env: "DEV_RELEASE=1"
+              - macosx_deployment_target: "15.0"
+                xcode_version: "15.0.0"
+                python_version: "3.13"
+                build_env: "DEV_RELEASE=1"
      - build_linux_release:
          matrix:
            parameters:
              python_version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
-              extra_env: ["PYPI_RELEASE=1"]
+              build_env: ["DEV_RELEASE=1"]
+      - build_cuda_release:
+          matrix:
+            parameters:
+              build_env: ["DEV_RELEASE=1"]
--- a/.gitignore
+++ b/.gitignore
@@ -36,6 +36,7 @@ share/python-wheels/
 .installed.cfg
 *.egg
 MANIFEST
+uv.lock

 # vim
 *.swp
--- a/ACKNOWLEDGMENTS.md
+++ b/ACKNOWLEDGMENTS.md
@@ -19,6 +19,7 @@ MLX was developed with contributions from the following individuals:
 - Gleb Pobudzey: Added the `where` primitive, and groups in 1D and 2D convolutions.
 - Paul Paczuski: Improved stability of BCE loss calculation
 - Max-Heinrich Laves: Added `conv_transpose1d`, `conv_transpose2d`, and `conv_transpose3d` ops.
+- Gökdeniz Gülmez: Added the `Muon (MomentUm Orthogonalized by Newton-Schulz)` optimizer.

 <a href="https://github.com/ml-explore/mlx/graphs/contributors">
  <img class="dark-light" src="https://contrib.rocks/image?repo=ml-explore/mlx&anon=0&columns=20&max=100&r=true" />
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -9,6 +9,7 @@ if(NOT MLX_VERSION)
  string(REGEX MATCH "#define MLX_VERSION_PATCH ([0-9]+)" _ "${_mlx_h_version}")
  set(_patch ${CMAKE_MATCH_1})
  set(MLX_PROJECT_VERSION "${_major}.${_minor}.${_patch}")
+  set(MLX_VERSION ${MLX_PROJECT_VERSION})
 else()
  string(REGEX REPLACE "^([0-9]+\.[0-9]+\.[0-9]+).*" "\\1" MLX_PROJECT_VERSION
                       ${MLX_VERSION})
@@ -33,15 +34,16 @@ option(MLX_BUILD_BENCHMARKS "Build benchmarks for mlx" OFF)
 option(MLX_BUILD_PYTHON_BINDINGS "Build python bindings for mlx" OFF)
 option(MLX_BUILD_METAL "Build metal backend" ON)
 option(MLX_BUILD_CPU "Build cpu backend" ON)
+option(MLX_BUILD_CUDA "Build cuda backend" OFF)
 option(MLX_METAL_DEBUG "Enhance metal debug workflow" OFF)
 option(MLX_ENABLE_X64_MAC "Enable building for x64 macOS" OFF)
 option(MLX_BUILD_GGUF "Include support for GGUF format" ON)
 option(MLX_BUILD_SAFETENSORS "Include support for safetensors format" ON)
 option(MLX_BUILD_BLAS_FROM_SOURCE "Build OpenBLAS from source code" OFF)
 option(MLX_METAL_JIT "Use JIT compilation for Metal kernels" OFF)
+option(MLX_USE_CCACHE "Use CCache for compilation cache when available" ON)
 option(BUILD_SHARED_LIBS "Build mlx as a shared library" OFF)
-
-add_compile_definitions("MLX_VERSION=${MLX_VERSION}")
+option(USE_SYSTEM_FMT "Use system's provided fmt library" OFF)

 # --------------------- Processor tests -------------------------
 message(
@@ -64,10 +66,17 @@ if(${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
      message(WARNING "Building for x86_64 arch is not officially supported.")
    endif()
  endif()
-
 else()
  set(MLX_BUILD_METAL OFF)
-  message(WARNING "MLX is prioritised for Apple silicon systems using macOS.")
+endif()
+
+if(MLX_USE_CCACHE)
+  find_program(CCACHE_PROGRAM ccache)
+  if(CCACHE_PROGRAM)
+    set(CMAKE_C_COMPILER_LAUNCHER "${CCACHE_PROGRAM}")
+    set(CMAKE_CXX_COMPILER_LAUNCHER "${CCACHE_PROGRAM}")
+    set(CMAKE_CUDA_COMPILER_LAUNCHER "${CCACHE_PROGRAM}")
+  endif()
 endif()

 # ----------------------------- Lib -----------------------------
@@ -77,7 +86,6 @@ include(FetchContent)
 cmake_policy(SET CMP0135 NEW)

 add_library(mlx)
-set_target_properties(mlx PROPERTIES COMPILE_WARNING_AS_ERROR ON)

 if(MLX_BUILD_METAL)
  set(METAL_LIB "-framework Metal")
@@ -85,6 +93,10 @@ if(MLX_BUILD_METAL)
  set(QUARTZ_LIB "-framework QuartzCore")
 endif()

+if(MLX_BUILD_CUDA)
+  enable_language(CUDA)
+endif()
+
 if(MLX_BUILD_METAL AND NOT METAL_LIB)
  message(STATUS "Metal not found. Unable to build GPU")
  set(MLX_BUILD_METAL OFF)
@@ -214,23 +226,13 @@ else()
  set(MLX_BUILD_ACCELERATE OFF)
 endif()

-find_package(MPI)
-if(MPI_FOUND)
-  execute_process(
-    COMMAND zsh "-c" "mpirun --version"
-    OUTPUT_VARIABLE MPI_VERSION
-    ERROR_QUIET)
-  if(${MPI_VERSION} MATCHES ".*Open MPI.*")
-    target_include_directories(mlx PRIVATE ${MPI_INCLUDE_PATH})
-  elseif(MPI_VERSION STREQUAL "")
-    set(MPI_FOUND FALSE)
-    message(
-      WARNING "MPI found but mpirun is not available. Building without MPI.")
-  else()
-    set(MPI_FOUND FALSE)
-    message(WARNING "MPI which is not OpenMPI found. Building without MPI.")
-  endif()
-endif()
+message(STATUS "Downloading json")
+FetchContent_Declare(
+  json
+  URL https://github.com/nlohmann/json/releases/download/v3.11.3/json.tar.xz)
+FetchContent_MakeAvailable(json)
+target_include_directories(
+  mlx PRIVATE $<BUILD_INTERFACE:${json_SOURCE_DIR}/single_include/nlohmann>)

 add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/mlx)

@@ -238,12 +240,19 @@ target_include_directories(
  mlx PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_LIST_DIR}>
             $<INSTALL_INTERFACE:include>)

-FetchContent_Declare(
-  fmt
-  GIT_REPOSITORY https://github.com/fmtlib/fmt.git
-  GIT_TAG 10.2.1
-  EXCLUDE_FROM_ALL)
-FetchContent_MakeAvailable(fmt)
+# Do not add mlx_EXPORTS define for shared library.
+set_target_properties(mlx PROPERTIES DEFINE_SYMBOL "")
+
+if(USE_SYSTEM_FMT)
+  find_package(fmt REQUIRED)
+else()
+  FetchContent_Declare(
+    fmt
+    GIT_REPOSITORY https://github.com/fmtlib/fmt.git
+    GIT_TAG 10.2.1
+    EXCLUDE_FROM_ALL)
+  FetchContent_MakeAvailable(fmt)
+endif()
 target_link_libraries(mlx PRIVATE $<BUILD_INTERFACE:fmt::fmt-header-only>)

 if(MLX_BUILD_PYTHON_BINDINGS)
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -5,26 +5,26 @@ possible.

 ## Pull Requests

-1. Fork and submit pull requests to the repo. 
+1. Fork and submit pull requests to the repo.
 2. If you've added code that should be tested, add tests.
 3. If a change is likely to impact efficiency, run some of the benchmarks before
   and after the change. Examples of benchmarks can be found in `benchmarks/python/`.
 4. If you've changed APIs, update the documentation.
-5. Every PR should have passing tests and at least one review. 
+5. Every PR should have passing tests and at least one review.
 6. For code formatting install `pre-commit` using something like `pip install pre-commit` and run `pre-commit install`.
   This should install hooks for running `black` and `clang-format` to ensure
   consistent style for C++ and python code.
- 
+
   You can also run the formatters manually as follows:
- 
-     ```
-     clang-format -i file.cpp
-     ```
- 
-     ```
-     black file.py
-     ```
- 
+
+   ```shell
+   clang-format -i file.cpp
+   ```
+
+   ```shell
+   black file.py
+   ```
+
   or run `pre-commit run --all-files` to check all files in the repo.

 ## Issues
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1,4 +1,6 @@
 include CMakeLists.txt
+include mlx.pc.in
 recursive-include mlx/ *
+include cmake/*
 include python/src/*
 include python/mlx/py.typed # support type hinting as in PEP-561
--- a/README.md
+++ b/README.md
@@ -11,10 +11,10 @@ brought to you by Apple machine learning research.

 Some key features of MLX include:

- - **Familiar APIs**: MLX has a Python API that closely follows NumPy.  MLX
+ - **Familiar APIs**: MLX has a Python API that closely follows NumPy. MLX
   also has fully featured C++, [C](https://github.com/ml-explore/mlx-c), and
   [Swift](https://github.com/ml-explore/mlx-swift/) APIs, which closely mirror
-   the Python API.  MLX has higher-level packages like `mlx.nn` and
+   the Python API. MLX has higher-level packages like `mlx.nn` and
   `mlx.optimizers` with APIs that closely follow PyTorch to simplify building
   more complex models.

@@ -68,18 +68,23 @@ in the documentation.

 ## Installation

-MLX is available on [PyPI](https://pypi.org/project/mlx/). To install the Python API, run:
+MLX is available on [PyPI](https://pypi.org/project/mlx/). To install MLX on
+macOS, run:

-**With `pip`**:
-
-```
+```bash
 pip install mlx
 ```

-**With `conda`**:
+To install the CUDA backend on Linux, run:

+```bash
+pip install mlx[cuda]
 ```
-conda install -c conda-forge mlx
+
+To install a CPU-only Linux package, run:
+
+```bash
+pip install mlx[cpu]
 ```

 Checkout the
--- a/benchmarks/cpp/irregular_strides.cpp
+++ b/benchmarks/cpp/irregular_strides.cpp
@@ -1,5 +1,6 @@
 // Copyright © 2023 Apple Inc.

+#include <cstring>
 #include <iostream>
 #include <sstream>

--- a/benchmarks/cpp/single_ops.cpp
+++ b/benchmarks/cpp/single_ops.cpp
@@ -192,6 +192,22 @@ void time_reductions() {

  auto argmin_along_1 = [&a]() { return mx::argmin(a, 1, false); };
  TIME(argmin_along_1);
+
+  auto indices = mx::array({1});
+  auto updates = mx::reshape(mx::array({NAN}), {1, 1, 1});
+  std::vector<int> axes{0};
+  auto b = scatter(a, {indices}, updates, axes);
+  mx::eval(b);
+
+  auto max_along_0 = [&b]() { return mx::max(b, 0, false); };
+  TIME(max_along_0);
+  auto max_along_1 = [&b]() { return mx::max(b, 1, false); };
+  TIME(max_along_1);
+
+  auto min_along_0 = [&b]() { return mx::min(b, 0, false); };
+  TIME(min_along_0);
+  auto min_along_1 = [&b]() { return mx::min(b, 1, false); };
+  TIME(min_along_1);
 }

 void time_gather_scatter() {
--- a/benchmarks/python/comparative/bench_torch.py
+++ b/benchmarks/python/comparative/bench_torch.py
@@ -5,6 +5,7 @@ import os
 import time

 import torch
+import torch.cuda
 import torch.mps


@@ -44,8 +45,10 @@ def bench(f, *args):


 def sync_if_needed(x):
-    if x.device != torch.device("cpu"):
+    if x.device.type == "mps":  # tensor devices carry an index (e.g. mps:0)
        torch.mps.synchronize()
+    elif x.device.type == "cuda":  # cuda:0 != torch.device("cuda"), so use type
+        torch.cuda.synchronize()


@torch.no_grad()
@@ -99,6 +102,14 @@ def reduction(op, axis, x):
    sync_if_needed(x)


+@torch.no_grad()
+def sum_and_add(axis, x, y):
+    z = x.sum(axis=axis, keepdims=True)
+    for i in range(50):
+        z = (z + y).sum(axis=axis, keepdims=True)
+    sync_if_needed(x)
+
+
@torch.no_grad()
 def softmax(axis, x):
    ys = []
@@ -340,7 +351,11 @@ if __name__ == "__main__":
        args.axis.pop(0)

    torch.set_num_threads(1)
-    device = "cpu" if args.cpu else "mps"
+    device = "mps"
+    if torch.cuda.is_available():
+        device = "cuda"
+    if args.cpu:
+        device = "cpu"

    types = args.dtype
    if not types:
@@ -460,5 +475,8 @@ if __name__ == "__main__":
    elif args.benchmark == "selu":
        print(bench(selu, x))

+    elif args.benchmark == "sum_and_add":
+        print(bench(sum_and_add, axis, *xs))
+
    else:
        raise ValueError(f"Unknown benchmark `{args.benchmark}`.")
--- a/benchmarks/python/conv_unaligned_bench.py
+++ b/benchmarks/python/conv_unaligned_bench.py
@@ -0,0 +1,107 @@
+import math
+import time
+
+import mlx.core as mx
+import numpy as np
+import torch
+
+N_warmup = 10
+N_iter_bench = 100
+N_iter_func = 5
+
+
+def bench(f, a, b):
+    for i in range(N_warmup):
+        f(a, b)
+    torch.mps.synchronize()
+
+    s = time.perf_counter_ns()
+    for i in range(N_iter_bench):
+        f(a, b)
+    e = time.perf_counter_ns()
+    return (e - s) * 1e-9
+
+
+def make_mx_conv_2D(strides=(1, 1), padding=(0, 0), groups=1):
+    def mx_conv_2D(a, b):
+        ys = []
+        for i in range(N_iter_func):
+            y = mx.conv2d(a, b, stride=strides, padding=padding, groups=groups)
+            ys.append(y)
+        mx.eval(ys)
+        return ys
+
+    return mx_conv_2D
+
+
+def make_pt_conv_2D(strides=(1, 1), padding=(0, 0), groups=1):
+    @torch.no_grad()
+    def pt_conv_2D(a, b):
+        ys = []
+        for i in range(N_iter_func):
+            y = torch.conv2d(a, b, stride=strides, padding=padding, groups=groups)
+            ys.append(y)
+        torch.mps.synchronize()
+        return ys
+
+    return pt_conv_2D
+
+
+def bench_shape(N, H, W, C, kH, kW, O, strides, padding, groups, np_dtype):
+    scale = 1.0 / math.sqrt(kH * kW * C)  # kernel area (kH x kW) times channels
+    a_np = np.random.uniform(0, 0.5, (N, H, W, C)).astype(np_dtype)
+    b_np = np.random.uniform(-scale, scale, (O, kH, kW, int(C / groups))).astype(
+        np_dtype
+    )
+
+    a_mx = mx.array(a_np)
+    b_mx = mx.array(b_np)
+
+    a_pt = torch.from_numpy(a_np.transpose((0, 3, 1, 2))).to("mps")
+    b_pt = torch.from_numpy(b_np.transpose((0, 3, 1, 2))).to("mps")
+
+    torch.mps.synchronize()
+
+    f_mx = make_mx_conv_2D(strides, padding, groups)
+    f_pt = make_pt_conv_2D(strides, padding, groups)
+
+    time_torch = bench(f_pt, a_pt, b_pt)
+    time_mlx = bench(f_mx, a_mx, b_mx)
+
+    out_mx = mx.conv2d(a_mx, b_mx, stride=strides, padding=padding, groups=groups)
+    out_pt = torch.conv2d(
+        a_pt.to("cpu"), b_pt.to("cpu"), stride=strides, padding=padding, groups=groups
+    )
+    out_pt = torch.permute(out_pt, (0, 2, 3, 1))
+    out_pt = out_pt.numpy(force=True)
+
+    atol = 2e-5 if np.dtype(np_dtype) == np.float32 else 1e-4  # accepts dtype names
+
+    if not np.allclose(out_pt, out_mx, atol=atol):
+        print(
+            f"Failed at {(N, H, W, C)}, {(O, kH, kW, C)} [strides = {strides}, padding = {padding}, groups = {groups}] with max(|a - b|) = {np.max(np.abs(out_pt - out_mx))}"
+        )
+
+    return time_mlx, time_torch
+
+
+if __name__ == "__main__":
+    dtype = "float32"
+    shapes = (
+        (4, 32, 32, 21, 3, 3, 128),
+        (4, 32, 32, 21, 3, 3, 37),
+        (4, 32, 32, 370, 3, 3, 370),
+        (4, 32, 32, 370, 7, 7, 128),
+        (2, 320, 640, 21, 7, 7, 21),
+    )
+    for N, H, W, C, kh, kw, O in shapes:
+        time_mlx, time_torch = bench_shape(
+            N, H, W, C, kh, kw, O, (1, 1), (0, 0), 1, dtype
+        )
+        diff = time_torch / time_mlx - 1.0
+
+        print(
+            f"({N}, {H:3d}, {W:3d}, {C:3d}), ({O:3d}, {kh:2d}, {kw:2d}, {C:3d}), {dtype}, {100. * diff:+5.2f}%"
+        )
+        if time_mlx >= 2.0 * time_torch:
+            print("ATTENTION ^^^^^^^")
--- a/benchmarks/python/gather_bench.py
+++ b/benchmarks/python/gather_bench.py
@@ -1,7 +1,6 @@
 # Copyright © 2023-2024 Apple Inc.

 import argparse
-from time import time

 import mlx.core as mx
 import torch
--- a/benchmarks/python/gather_mm_bench.py
+++ b/benchmarks/python/gather_mm_bench.py
@@ -0,0 +1,74 @@
+# Copyright © 2025 Apple Inc.
+
+import mlx.core as mx
+from time_utils import time_fn
+
+N = 1024
+D = 1024
+M = 1024
+E = 32
+I = 4
+
+
+def gather_sort(x, indices):
+    N, M = indices.shape
+    indices = indices.flatten()
+    order = mx.argsort(indices)
+    inv_order = mx.argsort(order)
+    return x.flatten(0, -3)[order // M], indices[order], inv_order
+
+
+def scatter_unsort(x, inv_order, shape=None):
+    x = x[inv_order]
+    if shape is not None:
+        x = mx.unflatten(x, 0, shape)
+    return x
+
+
+def gather_mm_simulate(x, w, indices):
+    x, idx, inv_order = gather_sort(x, indices)
+    for i in range(2):
+        y = mx.concatenate([x[i] @ w[j].T for i, j in enumerate(idx.tolist())], axis=0)
+        x = y[:, None]
+    x = scatter_unsort(x, inv_order, indices.shape)
+    return x
+
+
+def time_gather_mm():
+    x = mx.random.normal((N, 1, 1, D)) / 1024**0.5
+    w1 = mx.random.normal((E, M, D)) / 1024**0.5
+    w2 = mx.random.normal((E, D, M)) / 1024**0.5
+    indices = (mx.random.uniform(shape=(N, I)) * E).astype(mx.uint32)
+    sorted_indices = mx.sort(indices.flatten()).reshape(N, I)
+    mx.eval(x, w1, w2, indices, sorted_indices)
+
+    def gather_mm(x, w1, w2, indices, sort):
+        idx = indices
+        inv_order = None
+        if sort:
+            x, idx, inv_order = gather_sort(x, indices)
+        x = mx.gather_mm(x, w1.swapaxes(-1, -2), rhs_indices=idx, sorted_indices=sort)
+        x = mx.gather_mm(x, w2.swapaxes(-1, -2), rhs_indices=idx, sorted_indices=sort)
+        if sort:
+            x = scatter_unsort(x, inv_order, indices.shape)
+        return x
+
+    time_fn(gather_mm, x, w1, w2, indices, False)
+    time_fn(gather_mm, x, w1, w2, sorted_indices, False)
+    time_fn(gather_mm, x, w1, w2, indices, True)
+
+    x = mx.random.normal((N * I, D)) / 1024**0.5
+    w1 = mx.random.normal((M, D)) / 1024**0.5
+    w2 = mx.random.normal((D, M)) / 1024**0.5
+    mx.eval(x, w1, w2)
+
+    def equivalent_matmul(x, w1, w2):
+        x = x @ w1.T
+        x = x @ w2.T
+        return x
+
+    time_fn(equivalent_matmul, x, w1, w2)
+
+
+if __name__ == "__main__":
+    time_gather_mm()
--- a/benchmarks/python/gather_qmm_bench.py
+++ b/benchmarks/python/gather_qmm_bench.py
@@ -0,0 +1,84 @@
+# Copyright © 2025 Apple Inc.
+
+import mlx.core as mx
+from time_utils import time_fn
+
+N = 1024
+D = 1024
+M = 1024
+E = 32
+I = 4
+
+
+def gather_sort(x, indices):
+    N, M = indices.shape
+    indices = indices.flatten()
+    order = mx.argsort(indices)
+    inv_order = mx.argsort(order)
+    return x.flatten(0, -3)[order // M], indices[order], inv_order
+
+
+def scatter_unsort(x, inv_order, shape=None):
+    x = x[inv_order]
+    if shape is not None:
+        x = mx.unflatten(x, 0, shape)
+    return x
+
+
+def gather_mm_simulate(x, w, indices):
+    x, idx, inv_order = gather_sort(x, indices)
+    for i in range(2):
+        y = mx.concatenate(
+            [
+                mx.quantized_matmul(x[i], w[0][j], w[1][j], w[2][j], transpose=True)
+                for i, j in enumerate(idx.tolist())
+            ],
+            axis=0,
+        )
+        x = y[:, None]
+    x = scatter_unsort(x, inv_order, indices.shape)
+    return x
+
+
+def time_gather_qmm():
+    x = mx.random.normal((N, 1, 1, D)) / 1024**0.5
+    w1 = mx.random.normal((E, M, D)) / 1024**0.5
+    w2 = mx.random.normal((E, D, M)) / 1024**0.5
+    w1 = mx.quantize(w1)
+    w2 = mx.quantize(w2)
+    indices = (mx.random.uniform(shape=(N, I)) * E).astype(mx.uint32)
+    sorted_indices = mx.sort(indices.flatten()).reshape(N, I)
+    mx.eval(x, w1, w2, indices, sorted_indices)
+
+    def gather_mm(x, w1, w2, indices, sort):
+        idx = indices
+        inv_order = None
+        if sort:
+            x, idx, inv_order = gather_sort(x, indices)
+        x = mx.gather_qmm(x, *w1, transpose=True, rhs_indices=idx, sorted_indices=sort)
+        x = mx.gather_qmm(x, *w2, transpose=True, rhs_indices=idx, sorted_indices=sort)
+        if sort:
+            x = scatter_unsort(x, inv_order, indices.shape)
+        return x
+
+    time_fn(gather_mm, x, w1, w2, indices, False)
+    time_fn(gather_mm, x, w1, w2, sorted_indices, False)
+    time_fn(gather_mm, x, w1, w2, indices, True)
+
+    x = mx.random.normal((N * I, D)) / 1024**0.5
+    w1 = mx.random.normal((M, D)) / 1024**0.5
+    w2 = mx.random.normal((D, M)) / 1024**0.5
+    w1 = mx.quantize(w1)
+    w2 = mx.quantize(w2)
+    mx.eval(x, w1, w2)
+
+    def equivalent_matmul(x, w1, w2):
+        x = mx.quantized_matmul(x, *w1, transpose=True)
+        x = mx.quantized_matmul(x, *w2, transpose=True)
+        return x
+
+    time_fn(equivalent_matmul, x, w1, w2)
+
+
+if __name__ == "__main__":
+    time_gather_qmm()
--- a/benchmarks/python/layer_norm_bench.py
+++ b/benchmarks/python/layer_norm_bench.py
@@ -1,5 +1,7 @@
 # Copyright © 2023-2024 Apple Inc.

+from functools import partial
+
 import mlx.core as mx
 import mlx.nn as nn
 from time_utils import time_fn
@@ -18,51 +20,63 @@ def layer_norm(x, w, b, eps):
    return y


-def time_layer_norm():
+def time_layer_norm(N, dt):
+    L = 1024
    f1 = lambda x, w, b, y: (layer_norm(x, w, b, 1e-5) * y).sum()
    f2 = lambda x, w, b, y: (mx.fast.layer_norm(x, w, b, 1e-5) * y).sum()
    g1 = mx.grad(f1, argnums=(0, 1, 2))
    g2 = mx.grad(f2, argnums=(0, 1, 2))

-    x = mx.random.uniform(shape=(8, 1024, 4096)).astype(mx.float16)
-    w = mx.random.uniform(shape=(4096,)).astype(mx.float16)
-    b = mx.random.uniform(shape=(4096,)).astype(mx.float16)
-    y = mx.random.uniform(shape=(8, 1024, 4096)).astype(mx.float16)
+    x = mx.random.uniform(shape=(8, L, N)).astype(dt)
+    w = mx.random.uniform(shape=(N,)).astype(dt)
+    b = mx.random.uniform(shape=(N,)).astype(dt)
+    y = mx.random.uniform(shape=(8, L, N)).astype(dt)
    mx.eval(x, w, b, y)

-    def layer_norm_loop(g, x, w, b):
+    def layer_norm_loop(f, x, w, b):
+        for _ in range(32):
+            x = f(x, w, b)
+        return x
+
+    time_fn(layer_norm_loop, partial(layer_norm, eps=1e-5), x, w, b)
+    time_fn(layer_norm_loop, partial(mx.fast.layer_norm, eps=1e-5), x, w, b)
+
+    def layer_norm_grad_loop(g, x, w, b):
        gx, gw, gb = x, w, b
        for _ in range(32):
            gx, gw, gb = g(gx, gw, gb, y)
        return gx, gw, gb

-    time_fn(layer_norm_loop, g1, x, w, b)
-    time_fn(layer_norm_loop, g2, x, w, b)
-    time_fn(layer_norm_loop, mx.compile(g1), x, w, b)
-    time_fn(layer_norm_loop, mx.compile(g2), x, w, b)
+    time_fn(layer_norm_grad_loop, g1, x, w, b)
+    time_fn(layer_norm_grad_loop, g2, x, w, b)
+    time_fn(layer_norm_grad_loop, mx.compile(g1), x, w, b)
+    time_fn(layer_norm_grad_loop, mx.compile(g2), x, w, b)

    f1 = lambda x, y: (layer_norm(x, None, None, 1e-5) * y).sum()
    f2 = lambda x, y: (mx.fast.layer_norm(x, None, None, 1e-5) * y).sum()
    g1 = mx.grad(f1, argnums=(0,))
    g2 = mx.grad(f2, argnums=(0,))

-    x = mx.random.uniform(shape=(8, 1024, 4096)).astype(mx.float16)
-    w = mx.random.uniform(shape=(4096,)).astype(mx.float16)
-    b = mx.random.uniform(shape=(4096,)).astype(mx.float16)
-    y = mx.random.uniform(shape=(8, 1024, 4096)).astype(mx.float16)
+    x = mx.random.uniform(shape=(8, L, N)).astype(dt)
+    w = mx.random.uniform(shape=(N,)).astype(dt)
+    b = mx.random.uniform(shape=(N,)).astype(dt)
+    y = mx.random.uniform(shape=(8, L, N)).astype(dt)
    mx.eval(x, w, b, y)

-    def layer_norm_loop(g, x):
+    def layer_norm_grad_x_loop(g, x):
        gx = x
        for _ in range(32):
            gx = g(gx, y)
        return gx

-    time_fn(layer_norm_loop, g1, x)
-    time_fn(layer_norm_loop, g2, x)
-    time_fn(layer_norm_loop, mx.compile(g1), x)
-    time_fn(layer_norm_loop, mx.compile(g2), x)
+    time_fn(layer_norm_grad_x_loop, g1, x)
+    time_fn(layer_norm_grad_x_loop, g2, x)
+    time_fn(layer_norm_grad_x_loop, mx.compile(g1), x)
+    time_fn(layer_norm_grad_x_loop, mx.compile(g2), x)


 if __name__ == "__main__":
-    time_layer_norm()
+    for dt in [mx.float32, mx.float16, mx.bfloat16]:
+        for n in [1024, 2048, 4096, 8192, 8192 + 1024]:
+            print(dt, n)
+            time_layer_norm(n, dt)
--- a/benchmarks/python/sdpa_bench.py
+++ b/benchmarks/python/sdpa_bench.py
@@ -28,11 +28,34 @@ def bench(f, *args):
    return (e - s) * 1e-9


-def mlx_sdpa_fused_inner(q, k, v, scale):
-    return mx.fast.scaled_dot_product_attention(q, k, v, scale=scale, mask=None)
+def prepare_inputs(B, qL, kL, D, qH, kH, mask, transpose, dtype):
+    np_dtype = getattr(np, dtype)
+
+    shape_q = (B, qL, qH, D) if transpose else (B, qH, qL, D)
+    shape_kv = (B, kL, kH, D) if transpose else (B, kH, kL, D)
+
+    scale = 1.0 / math.sqrt(D)
+
+    q_np = np.random.normal(0.0, 1.0, shape_q).astype(np_dtype)
+    k_np = np.random.normal(0.0, scale, shape_kv).astype(np_dtype)
+    v_np = np.random.normal(0.0, scale, shape_kv).astype(np_dtype)
+
+    q_mx = mx.array(q_np)
+    k_mx = mx.array(k_np)
+    v_mx = mx.array(v_np)
+
+    if mask is not None:
+        if mask == "additive":
+            mask_np = np.random.normal(0.0, 1.0, (B, qH, qL, kL)).astype(np_dtype)
+            mask = mx.array(mask_np)
+        elif mask == "bool":
+            mask_np = np.random.uniform(0.0, 1.0, (B, qH, qL, kL)) < 0.5
+            mask = mx.array(mask_np)
+
+    return q_mx, k_mx, v_mx, scale, mask


-def mlx_sdpa_unfused_inner(q, k, v, scale, f32softmax=False):
+def mlx_ref_attn(q, k, v, scale=1.0, mask=None):
    q_dtype = q.dtype
    q = q * mx.array(scale, q_dtype)
    n_q_heads = q.shape[-3]
@@ -41,6 +64,7 @@ def mlx_sdpa_unfused_inner(q, k, v, scale, f32softmax=False):

    B = q.shape[0]
    L = q.shape[2]
+    kL = k.shape[2]

    if n_repeats > 1:
        q = mx.reshape(q, [B, n_kv_heads, n_repeats, L, -1])
@@ -48,10 +72,27 @@ def mlx_sdpa_unfused_inner(q, k, v, scale, f32softmax=False):
        v = mx.expand_dims(v, 2)

    scores = q @ mx.swapaxes(k, -1, -2)
-    if f32softmax:
-        scores = mx.softmax(scores.astype(mx.float32), axis=-1).astype(q_dtype)
-    else:
-        scores = mx.softmax(scores, axis=-1)
+
+    if mask is not None:
+
+        if mask == "causal":
+            q_offset = max(0, kL - L)
+            q_indices = mx.arange(q_offset, q_offset + L)
+            k_indices = mx.arange(kL)
+            mask = q_indices[:, None] >= k_indices[None]
+
+        if n_repeats > 1 and mask.ndim >= 3:
+            if mask.shape[-3] == 1:
+                mask = mx.expand_dims(mask, -3)
+            else:
+                mask = mx.unflatten(mask, -3, (n_kv_heads, n_repeats))
+
+        if mask.dtype == mx.bool_:
+            scores = mx.where(mask, scores, -np.float32(np.inf))
+        else:
+            scores += mask
+
+    scores = mx.softmax(scores, axis=-1, precise=True)

    out = scores @ v
    if n_repeats > 1:
@@ -60,74 +101,55 @@ def mlx_sdpa_unfused_inner(q, k, v, scale, f32softmax=False):
    return out


-def mlx_spda_unfused(q, k, v, scale, transpose):
-    q_out = q
+def mlx_fused_attn(q, k, v, scale, mask):
+    return mx.fast.scaled_dot_product_attention(q, k, v, scale=scale, mask=mask)
+
+
+def do_attention(f, q, k, v, scale, mask=None, transpose=False):
    if transpose:
-        k = mx.transpose(k, (0, 2, 1, 3))
-        v = mx.transpose(v, (0, 2, 1, 3))
+        q_t = mx.transpose(q, (0, 2, 1, 3))
+        k_t = mx.transpose(k, (0, 2, 1, 3))
+        v_t = mx.transpose(v, (0, 2, 1, 3))
+        o_t = f(q_t, k_t, v_t, scale=scale, mask=mask)
+        return mx.transpose(o_t, (0, 2, 1, 3))
+    else:
+        return f(q, k, v, scale=scale, mask=mask)
+
+
+def do_attention_bench(f, q, k, v, scale, mask=None, transpose=False):
+    q_out = q

    for i in range(N_iter_func):
-        if transpose:
-            q_out = mx.transpose(q_out, (0, 2, 1, 3))
-        q_out = mlx_sdpa_unfused_inner(q_out, k, v, scale)
-        if transpose:
-            q_out = mx.transpose(q_out, (0, 2, 1, 3))
+        q_out = do_attention(f, q_out, k, v, scale, mask=mask, transpose=transpose)

    mx.eval(q_out)
    return q_out


-def mlx_spda_fused(q, k, v, scale, transpose):
-    q_out = q
-    if transpose:
-        k = mx.transpose(k, (0, 2, 1, 3))
-        v = mx.transpose(v, (0, 2, 1, 3))
-
-    for i in range(N_iter_func):
-        if transpose:
-            q_out = mx.transpose(q_out, (0, 2, 1, 3))
-        q_out = mlx_sdpa_fused_inner(q_out, k, v, scale)
-        if transpose:
-            q_out = mx.transpose(q_out, (0, 2, 1, 3))
-
-    mx.eval(q_out)
-    return q_out
-
-
-def bench_shape(B, qsl, ksl, head_dim, n_q_heads, n_kv_heads, np_dtype, transpose=True):
-    shape_q = (
-        (B, qsl, n_q_heads, head_dim) if transpose else (B, n_q_heads, qsl, head_dim)
-    )
-    shape_kv = (
-        (B, ksl, n_kv_heads, head_dim) if transpose else (B, n_kv_heads, ksl, head_dim)
+def bench_shape(
+    B, qsl, ksl, head_dim, n_q_heads, n_kv_heads, dtype, transpose=True, mask_in=None
+):
+    q_mx, k_mx, v_mx, scale, mask = prepare_inputs(
+        B, qsl, ksl, head_dim, n_q_heads, n_kv_heads, mask_in, transpose, dtype
    )

-    q_np = np.random.normal(0.0, 1.0 / math.sqrt(head_dim), shape_q).astype(np_dtype)
-    k_np = np.random.normal(0.0, 1.0 / math.sqrt(head_dim), shape_kv).astype(np_dtype)
-    v_np = np.random.normal(0.0, 1.0 / math.sqrt(head_dim), shape_kv).astype(np_dtype)
+    time_mlx_unfused = bench(
+        do_attention_bench, mlx_ref_attn, q_mx, k_mx, v_mx, scale, mask, transpose
+    )
+    time_mlx_fused = bench(
+        do_attention_bench, mlx_fused_attn, q_mx, k_mx, v_mx, scale, mask, transpose
+    )

-    scale = math.sqrt(1.0 / head_dim)
+    o_mlx_unfused = do_attention(  # reference (unfused) implementation
+        mlx_ref_attn, q_mx, k_mx, v_mx, scale, mask, transpose
+    )
+    o_mlx_fused = do_attention(mlx_fused_attn, q_mx, k_mx, v_mx, scale, mask, transpose)

-    q_mx = mx.array(q_np)
-    k_mx = mx.array(k_np)
-    v_mx = mx.array(v_np)
+    atol = 1e-5 if dtype == "float32" else 2e-4

-    time_mlx_unfused = bench(mlx_spda_unfused, q_mx, k_mx, v_mx, scale, transpose)
-    time_mlx_fused = bench(mlx_spda_fused, q_mx, k_mx, v_mx, scale, transpose)
-
-    if transpose:
-        q_mx = mx.transpose(q_mx, (0, 2, 1, 3))
-        k_mx = mx.transpose(k_mx, (0, 2, 1, 3))
-        v_mx = mx.transpose(v_mx, (0, 2, 1, 3))
-
-    o_mlx_fused = mlx_sdpa_fused_inner(q_mx, k_mx, v_mx, scale)
-    o_mlx_unfused = mlx_sdpa_unfused_inner(q_mx, k_mx, v_mx, scale, f32softmax=True)
-
-    atol = 1e-5 if np_dtype == np.float32 else 1e-4
-
-    if not mx.allclose(o_mlx_fused, o_mlx_unfused, atol=atol):
+    if not mx.allclose(o_mlx_fused, o_mlx_unfused, atol=atol, rtol=atol):
        print(
-            f"Failed at (B: {B}, qsl: {qsl}, ksl: {ksl}, head_dim: {head_dim}, n_qh: {n_q_heads}, n_kvh: {n_kv_heads}) [tpose = {transpose}] with max(|a - b|) = {mx.max(mx.abs(o_mlx_unfused - o_mlx_fused)):3.2e}"
+            f"Failed at (B: {B}, qsl: {qsl}, ksl: {ksl}, head_dim: {head_dim}, n_qh: {n_q_heads}, n_kvh: {n_kv_heads}, mask: {mask_in}) [tpose = {transpose}] with max(|a - b|) = {mx.max(mx.abs(o_mlx_unfused - o_mlx_fused)):3.2e}"
        )

    return time_mlx_fused, time_mlx_unfused
@@ -151,39 +173,51 @@ if __name__ == "__main__":
          (  1,   128,   128,       64,   32,    32),
          (  1,   256,   256,       64,   32,    32),
          (  1,   512,   512,       64,   32,    32),
-          (  1,  1024,  1024,       64,   32,    32),
-          (  1,  2048,  2048,       64,   32,    32),
-          (  1,  4096,  4096,       64,   32,    32),
+          (  1,  1024,  1024,       64,   32,     8),
+          (  1,  2048,  2048,       64,   32,     8),
+          (  1,  4096,  4096,       64,   32,     8),
    )

    shapes_80 = (
        # (  B,   qsl,   ksl, head_dim, n_qh, n_kvh)
-          (  1,  1024,  1024,       80,   32,    32),
-          (  1,  2048,  2048,       80,   32,    32),
-          (  1,  4096,  4096,       80,   32,    32),
+          (  1,  1024,  1024,       80,   32,     8),
+          (  1,  2048,  2048,       80,   32,     8),
+          (  1,  4096,  4096,       80,   32,     8),
    )

    shapes_128 = (
        # (  B,   qsl,   ksl, head_dim, n_qh, n_kvh)
-          (  1,  1024,  1024,      128,   32,    32),
-          (  1,  2048,  2048,      128,   32,    32),
-          (  1,  4096,  4096,      128,   32,    32),
+          (  1,  1024,  1024,      128,   32,     8),
+          (  1,  2048,  2048,      128,   32,     8),
+          (  1,  4096,  4096,      128,   32,     8),
    )
    # fmt: on

    shapes = shapes_64 + shapes_80 + shapes_128

-    print("  B,   qsl,   ksl, hdim, n_qh, n_kvh, tpose,   dtype, t_unfs, t_fuse, diff%")
+    masks = [None, "bool", "causal"]
+
+    print(
+        "  B,   qsl,   ksl, hdim, n_qh, n_kvh, t,   dtype,     mask, t_unfs, t_fuse, diff%"
+    )

    for dtype in dtypes:
        for transpose in transposes:
            for B, qsl, ksl, head_dim, n_q_heads, n_kv_heads in shapes:
-                np_dtype = getattr(np, dtype)
-                time_mlx_fused, time_mlx_unfused = bench_shape(
-                    B, qsl, ksl, head_dim, n_q_heads, n_kv_heads, np_dtype, transpose
-                )
-                diff = time_mlx_unfused / time_mlx_fused - 1.0
-                t_str = 1 if transpose else 0
-                print(
-                    f"{B:3d}, {qsl:5d}, {ksl:5d}, {head_dim:4d}, {n_q_heads:4d}, {n_kv_heads:5d}, {t_str:5d}, {dtype}, {time_mlx_unfused: 2.3f}, {time_mlx_fused: 2.3f}, {100. * diff:+5.2f}%"
-                )
+                for mask_in in masks:
+                    time_mlx_fused, time_mlx_unfused = bench_shape(
+                        B,
+                        qsl,
+                        ksl,
+                        head_dim,
+                        n_q_heads,
+                        n_kv_heads,
+                        dtype,
+                        transpose,
+                        mask_in,
+                    )
+                    diff = time_mlx_unfused / time_mlx_fused - 1.0
+                    t_str = 1 if transpose else 0
+                    print(
+                        f"{B:3d}, {qsl:5d}, {ksl:5d}, {head_dim:4d}, {n_q_heads:4d}, {n_kv_heads:5d}, {t_str:1d}, {dtype}, {str(mask_in):>8}, {time_mlx_unfused: 2.3f}, {time_mlx_fused: 2.3f}, {100. * diff:+5.2f}%"
+                    )
--- a/benchmarks/python/single_ops.py
+++ b/benchmarks/python/single_ops.py
@@ -51,6 +51,20 @@ def time_maximum():
    time_fn(mx.maximum, a, b)


+def time_max():
+    a = mx.random.uniform(shape=(32, 1024, 1024))
+    a[1, 1] = mx.nan
+    mx.eval(a)
+    time_fn(mx.max, a, 0)
+
+
+def time_min():
+    a = mx.random.uniform(shape=(32, 1024, 1024))
+    a[1, 1] = mx.nan
+    mx.eval(a)
+    time_fn(mx.min, a, 0)
+
+
 def time_negative():
    a = mx.random.uniform(shape=(10000, 1000))
    mx.eval(a)
@@ -108,6 +122,8 @@ if __name__ == "__main__":

    time_add()
    time_matmul()
+    time_min()
+    time_max()
    time_maximum()
    time_exp()
    time_negative()
--- a/cmake/extension.cmake
+++ b/cmake/extension.cmake
@@ -11,13 +11,14 @@ include(CMakeParseArguments)
 # Args: TARGET: Custom target to be added for the metal library TITLE: Name of
 # the .metallib OUTPUT_DIRECTORY: Where to place ${TITLE}.metallib SOURCES: List
 # of source files INCLUDE_DIRS: List of include dirs DEPS: List of dependency
-# files (like headers)
+# files (like headers) DEBUG: Boolean, if true, enables debug compile options
+# for this specific library. If not provided, uses global MLX_METAL_DEBUG.
 #
 # clang format on

 macro(mlx_build_metallib)
  # Parse args
-  set(oneValueArgs TARGET TITLE OUTPUT_DIRECTORY)
+  set(oneValueArgs TARGET TITLE OUTPUT_DIRECTORY DEBUG)
  set(multiValueArgs SOURCES INCLUDE_DIRS DEPS)
  cmake_parse_arguments(MTLLIB "" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})

@@ -26,6 +27,10 @@ macro(mlx_build_metallib)

  # Collect compile options
  set(MTLLIB_COMPILE_OPTIONS -Wall -Wextra -fno-fast-math -Wno-c++17-extensions)
+  if(MLX_METAL_DEBUG OR MTLLIB_DEBUG)
+    set(MTLLIB_COMPILE_OPTIONS ${MTLLIB_COMPILE_OPTIONS} -gline-tables-only
+                               -frecord-sources)
+  endif()

  # Prepare metallib build command
  add_custom_command(
--- a/docs/Doxyfile
+++ b/docs/Doxyfile
@@ -13,7 +13,7 @@ EXCLUDE_PATTERNS       = */private/*
 CREATE_SUBDIRS         = NO
 FULL_PATH_NAMES        = YES
 RECURSIVE              = YES
-GENERATE_HTML          = YES
+GENERATE_HTML          = NO
 GENERATE_LATEX         = NO
 GENERATE_XML           = YES
 XML_PROGRAMLISTING     = YES
--- a/docs/src/conf.py
+++ b/docs/src/conf.py
@@ -10,7 +10,7 @@ import mlx.core as mx
 # -- Project information -----------------------------------------------------

 project = "MLX"
-copyright = "2023, MLX Contributors"
+copyright = "2023, Apple"
 author = "MLX Contributors"
 version = ".".join(mx.__version__.split(".")[:3])
 release = version
--- a/docs/src/dev/custom_metal_kernels.rst
+++ b/docs/src/dev/custom_metal_kernels.rst
@@ -8,23 +8,26 @@ MLX supports writing custom Metal kernels through the Python and C++ APIs.
 Simple Example
 --------------

+.. currentmodule:: mlx.core
+
 Let's write a custom kernel that computes ``exp`` elementwise:

 .. code-block:: python

-  def exp_elementwise(a: mx.array):
-      source = """
-          uint elem = thread_position_in_grid.x;
-          T tmp = inp[elem];
-          out[elem] = metal::exp(tmp);
-      """
+  source = """
+      uint elem = thread_position_in_grid.x;
+      T tmp = inp[elem];
+      out[elem] = metal::exp(tmp);
+  """

-      kernel = mx.fast.metal_kernel(
-          name="myexp",
-          input_names=["inp"],
-          output_names=["out"],
-          source=source,
-      )
+  kernel = mx.fast.metal_kernel(
+      name="myexp",
+      input_names=["inp"],
+      output_names=["out"],
+      source=source,
+  )
+
+  def exp_elementwise(a: mx.array):
      outputs = kernel(
          inputs=[a],
          template=[("T", mx.float32)],
@@ -39,8 +42,13 @@ Let's write a custom kernel that computes ``exp`` elementwise:
  b = exp_elementwise(a)
  assert mx.allclose(b, mx.exp(a))

+Every time you make a kernel, a new Metal library is created and possibly
+JIT compiled. To reduce the overhead from that, build the kernel once with
+:func:`fast.metal_kernel` and then use it many times.
+
 .. note::
-    We are only required to pass the body of the Metal kernel in ``source``.
+   Only pass the body of the Metal kernel in ``source``. The function
+   signature is generated automatically.

 The full function signature will be generated using:

@@ -78,44 +86,51 @@ Putting this all together, the generated function signature for ``myexp`` is as

  template [[host_name("custom_kernel_myexp_float")]] [[kernel]] decltype(custom_kernel_myexp_float<float>) custom_kernel_myexp_float<float>;

-Note: ``grid`` and ``threadgroup`` are parameters to the Metal `dispatchThreads <https://developer.apple.com/documentation/metal/mtlcomputecommandencoder/2866532-dispatchthreads>`_ function.
-This means we will launch ``mx.prod(grid)`` threads, subdivided into ``threadgroup`` size threadgroups.
-For optimal performance, each thread group dimension should be less than or equal to the corresponding grid dimension.
+Note: ``grid`` and ``threadgroup`` are parameters to the Metal `dispatchThreads
+<https://developer.apple.com/documentation/metal/mtlcomputecommandencoder/2866532-dispatchthreads>`_
+function. This means we will launch ``mx.prod(grid)`` threads, subdivided into
+``threadgroup`` size threadgroups.  For optimal performance, each thread group
+dimension should be less than or equal to the corresponding grid dimension.

-Passing ``verbose=True`` to ``mx.fast.metal_kernel.__call__`` will print the generated code for debugging purposes.
+Passing ``verbose=True`` to :func:`fast.metal_kernel.__call__` will print the
+generated code for debugging purposes.

 Using Shape/Strides
 -------------------

-``mx.fast.metal_kernel`` supports an argument ``ensure_row_contiguous`` which is ``True`` by default.
-This will copy the ``mx.array`` inputs if needed before the kernel is launched to ensure that the memory layout is row contiguous.
-Generally this makes writing the kernel easier, since we don't have to worry about gaps or the ordering of the dims
-when indexing.
+:func:`fast.metal_kernel` supports an argument ``ensure_row_contiguous`` which
+is ``True`` by default. This will copy the array inputs if needed
+before the kernel is launched to ensure that the memory layout is row
+contiguous.  Generally this makes writing the kernel easier, since we don't
+have to worry about gaps or the ordering of the dims when indexing.

-If we want to avoid this copy, ``metal_kernel`` automatically passes ``a_shape``, ``a_strides`` and ``a_ndim`` for each
-input array ``a`` if any are present in ``source``.
-We can then use MLX's built in indexing utils to fetch the right elements for each thread.
+If we want to avoid this copy, :func:`fast.metal_kernel` automatically passes
+``a_shape``, ``a_strides`` and ``a_ndim`` for each input array ``a`` if any are
+present in ``source``. We can then use MLX's built-in indexing utils to fetch
+the right elements for each thread.

-Let's convert ``myexp`` above to support arbitrarily strided arrays without relying on a copy from ``ensure_row_contiguous``:
+Let's convert ``myexp`` above to support arbitrarily strided arrays without
+relying on a copy from ``ensure_row_contiguous``:

 .. code-block:: python
+
+  source = """
+      uint elem = thread_position_in_grid.x;
+      // Utils from `mlx/backend/metal/kernels/utils.h` are automatically included
+      uint loc = elem_to_loc(elem, inp_shape, inp_strides, inp_ndim);
+      T tmp = inp[loc];
+      // Output arrays are always row contiguous
+      out[elem] = metal::exp(tmp);
+  """
+
+  kernel = mx.fast.metal_kernel(
+      name="myexp_strided",
+      input_names=["inp"],
+      output_names=["out"],
+      source=source
+  )

  def exp_elementwise(a: mx.array):
-      source = """
-          uint elem = thread_position_in_grid.x;
-          // Utils from `mlx/backend/metal/kernels/utils.h` are automatically included
-          uint loc = elem_to_loc(elem, inp_shape, inp_strides, inp_ndim);
-          T tmp = inp[loc];
-          // Output arrays are always row contiguous
-          out[elem] = metal::exp(tmp);
-      """
-
-      kernel = mx.fast.metal_kernel(
-          name="myexp_strided",
-          input_names=["inp"],
-          output_names=["out"],
-          source=source
-      )
      outputs = kernel(
          inputs=[a],
          template=[("T", mx.float32)],
@@ -142,137 +157,139 @@ We'll start with the following MLX implementation using standard ops:

 .. code-block:: python

-    def grid_sample_ref(x, grid):
-        N, H_in, W_in, _ = x.shape
-        ix = ((grid[..., 0] + 1) * W_in - 1) / 2
-        iy = ((grid[..., 1] + 1) * H_in - 1) / 2
+  def grid_sample_ref(x, grid):
+      N, H_in, W_in, _ = x.shape
+      ix = ((grid[..., 0] + 1) * W_in - 1) / 2
+      iy = ((grid[..., 1] + 1) * H_in - 1) / 2

-        ix_nw = mx.floor(ix).astype(mx.int32)
-        iy_nw = mx.floor(iy).astype(mx.int32)
+      ix_nw = mx.floor(ix).astype(mx.int32)
+      iy_nw = mx.floor(iy).astype(mx.int32)

-        ix_ne = ix_nw + 1
-        iy_ne = iy_nw
+      ix_ne = ix_nw + 1
+      iy_ne = iy_nw

-        ix_sw = ix_nw
-        iy_sw = iy_nw + 1
+      ix_sw = ix_nw
+      iy_sw = iy_nw + 1

-        ix_se = ix_nw + 1
-        iy_se = iy_nw + 1
+      ix_se = ix_nw + 1
+      iy_se = iy_nw + 1

-        nw = (ix_se - ix)    * (iy_se - iy)
-        ne = (ix    - ix_sw) * (iy_sw - iy)
-        sw = (ix_ne - ix)    * (iy    - iy_ne)
-        se = (ix    - ix_nw) * (iy    - iy_nw)
+      nw = (ix_se - ix)    * (iy_se - iy)
+      ne = (ix    - ix_sw) * (iy_sw - iy)
+      sw = (ix_ne - ix)    * (iy    - iy_ne)
+      se = (ix    - ix_nw) * (iy    - iy_nw)

-        I_nw = x[mx.arange(N)[:, None, None], iy_nw, ix_nw, :]
-        I_ne = x[mx.arange(N)[:, None, None], iy_ne, ix_ne, :]
-        I_sw = x[mx.arange(N)[:, None, None], iy_sw, ix_sw, :]
-        I_se = x[mx.arange(N)[:, None, None], iy_se, ix_se, :]
+      I_nw = x[mx.arange(N)[:, None, None], iy_nw, ix_nw, :]
+      I_ne = x[mx.arange(N)[:, None, None], iy_ne, ix_ne, :]
+      I_sw = x[mx.arange(N)[:, None, None], iy_sw, ix_sw, :]
+      I_se = x[mx.arange(N)[:, None, None], iy_se, ix_se, :]

-        mask_nw = (iy_nw >= 0) & (iy_nw <= H_in - 1) & (ix_nw >= 0) & (ix_nw <= W_in - 1)
-        mask_ne = (iy_ne >= 0) & (iy_ne <= H_in - 1) & (ix_ne >= 0) & (ix_ne <= W_in - 1)
-        mask_sw = (iy_sw >= 0) & (iy_sw <= H_in - 1) & (ix_sw >= 0) & (ix_sw <= W_in - 1)
-        mask_se = (iy_se >= 0) & (iy_se <= H_in - 1) & (ix_se >= 0) & (ix_se <= W_in - 1)
+      mask_nw = (iy_nw >= 0) & (iy_nw <= H_in - 1) & (ix_nw >= 0) & (ix_nw <= W_in - 1)
+      mask_ne = (iy_ne >= 0) & (iy_ne <= H_in - 1) & (ix_ne >= 0) & (ix_ne <= W_in - 1)
+      mask_sw = (iy_sw >= 0) & (iy_sw <= H_in - 1) & (ix_sw >= 0) & (ix_sw <= W_in - 1)
+      mask_se = (iy_se >= 0) & (iy_se <= H_in - 1) & (ix_se >= 0) & (ix_se <= W_in - 1)

-        I_nw *= mask_nw[..., None]
-        I_ne *= mask_ne[..., None]
-        I_sw *= mask_sw[..., None]
-        I_se *= mask_se[..., None]
+      I_nw *= mask_nw[..., None]
+      I_ne *= mask_ne[..., None]
+      I_sw *= mask_sw[..., None]
+      I_se *= mask_se[..., None]

-        output = nw[..., None] * I_nw + ne[..., None] * I_ne + sw[..., None] * I_sw + se[..., None] * I_se
+      output = nw[..., None] * I_nw + ne[..., None] * I_ne + sw[..., None] * I_sw + se[..., None] * I_se

-        return output
+      return output

-Now let's use ``mx.custom_function`` together with ``mx.fast.metal_kernel``
+Now let's use :func:`custom_function` together with :func:`fast.metal_kernel`
 to write a fast GPU kernel for both the forward and backward passes.

 First we'll implement the forward pass as a fused kernel:

 .. code-block:: python

-    @mx.custom_function
-    def grid_sample(x, grid):
+  source = """
+      uint elem = thread_position_in_grid.x;
+      int H = x_shape[1];
+      int W = x_shape[2];
+      int C = x_shape[3];
+      int gH = grid_shape[1];
+      int gW = grid_shape[2];

-        assert x.ndim == 4, "`x` must be 4D."
-        assert grid.ndim == 4, "`grid` must be 4D."
+      int w_stride = C;
+      int h_stride = W * w_stride;
+      int b_stride = H * h_stride;

-        B, _, _, C = x.shape
-        _, gN, gM, D = grid.shape
-        out_shape = (B, gN, gM, C)
+      uint grid_idx = elem / C * 2;
+      float ix = ((grid[grid_idx] + 1) * W - 1) / 2;
+      float iy = ((grid[grid_idx + 1] + 1) * H - 1) / 2;

-        assert D == 2, "Last dim of `grid` must be size 2."
+      int ix_nw = floor(ix);
+      int iy_nw = floor(iy);

-        source = """
-            uint elem = thread_position_in_grid.x;
-            int H = x_shape[1];
-            int W = x_shape[2];
-            int C = x_shape[3];
-            int gH = grid_shape[1];
-            int gW = grid_shape[2];
+      int ix_ne = ix_nw + 1;
+      int iy_ne = iy_nw;

-            int w_stride = C;
-            int h_stride = W * w_stride;
-            int b_stride = H * h_stride;
+      int ix_sw = ix_nw;
+      int iy_sw = iy_nw + 1;

-            uint grid_idx = elem / C * 2;
-            float ix = ((grid[grid_idx] + 1) * W - 1) / 2;
-            float iy = ((grid[grid_idx + 1] + 1) * H - 1) / 2;
+      int ix_se = ix_nw + 1;
+      int iy_se = iy_nw + 1;

-            int ix_nw = floor(ix);
-            int iy_nw = floor(iy);
+      T nw = (ix_se - ix)    * (iy_se - iy);
+      T ne = (ix    - ix_sw) * (iy_sw - iy);
+      T sw = (ix_ne - ix)    * (iy    - iy_ne);
+      T se = (ix    - ix_nw) * (iy    - iy_nw);

-            int ix_ne = ix_nw + 1;
-            int iy_ne = iy_nw;
+      int batch_idx = elem / C / gH / gW * b_stride;
+      int channel_idx = elem % C;
+      int base_idx = batch_idx + channel_idx;

-            int ix_sw = ix_nw;
-            int iy_sw = iy_nw + 1;
+      T I_nw = x[base_idx + iy_nw * h_stride + ix_nw * w_stride];
+      T I_ne = x[base_idx + iy_ne * h_stride + ix_ne * w_stride];
+      T I_sw = x[base_idx + iy_sw * h_stride + ix_sw * w_stride];
+      T I_se = x[base_idx + iy_se * h_stride + ix_se * w_stride];

-            int ix_se = ix_nw + 1;
-            int iy_se = iy_nw + 1;
+      I_nw = iy_nw >= 0 && iy_nw <= H - 1 && ix_nw >= 0 && ix_nw <= W - 1 ? I_nw : 0;
+      I_ne = iy_ne >= 0 && iy_ne <= H - 1 && ix_ne >= 0 && ix_ne <= W - 1 ? I_ne : 0;
+      I_sw = iy_sw >= 0 && iy_sw <= H - 1 && ix_sw >= 0 && ix_sw <= W - 1 ? I_sw : 0;
+      I_se = iy_se >= 0 && iy_se <= H - 1 && ix_se >= 0 && ix_se <= W - 1 ? I_se : 0;

-            T nw = (ix_se - ix)    * (iy_se - iy);
-            T ne = (ix    - ix_sw) * (iy_sw - iy);
-            T sw = (ix_ne - ix)    * (iy    - iy_ne);
-            T se = (ix    - ix_nw) * (iy    - iy_nw);
+      out[elem] = nw * I_nw + ne * I_ne + sw * I_sw + se * I_se;
+  """

-            int batch_idx = elem / C / gH / gW * b_stride;
-            int channel_idx = elem % C;
-            int base_idx = batch_idx + channel_idx;
+  kernel = mx.fast.metal_kernel(
+      name="grid_sample",
+      input_names=["x", "grid"],
+      output_names=["out"],
+      source=source,
+  )

-            T I_nw = x[base_idx + iy_nw * h_stride + ix_nw * w_stride];
-            T I_ne = x[base_idx + iy_ne * h_stride + ix_ne * w_stride];
-            T I_sw = x[base_idx + iy_sw * h_stride + ix_sw * w_stride];
-            T I_se = x[base_idx + iy_se * h_stride + ix_se * w_stride];
+  @mx.custom_function
+  def grid_sample(x, grid):

-            I_nw = iy_nw >= 0 && iy_nw <= H - 1 && ix_nw >= 0 && ix_nw <= W - 1 ? I_nw : 0;
-            I_ne = iy_ne >= 0 && iy_ne <= H - 1 && ix_ne >= 0 && ix_ne <= W - 1 ? I_ne : 0;
-            I_sw = iy_sw >= 0 && iy_sw <= H - 1 && ix_sw >= 0 && ix_sw <= W - 1 ? I_sw : 0;
-            I_se = iy_se >= 0 && iy_se <= H - 1 && ix_se >= 0 && ix_se <= W - 1 ? I_se : 0;
+      assert x.ndim == 4, "`x` must be 4D."
+      assert grid.ndim == 4, "`grid` must be 4D."

-            out[elem] = nw * I_nw + ne * I_ne + sw * I_sw + se * I_se;
-        """
-        kernel = mx.fast.metal_kernel(
-            name="grid_sample",
-            input_names=["x", "grid"],
-            output_names=["out"],
-            source=source,
-        )
-        outputs = kernel(
-            inputs=[x, grid],
-            template=[("T", x.dtype)],
-            output_shapes=[out_shape],
-            output_dtypes=[x.dtype],
-            grid=(np.prod(out_shape), 1, 1),
-            threadgroup=(256, 1, 1),
-        )
-        return outputs[0]
+      B, _, _, C = x.shape
+      _, gN, gM, D = grid.shape
+      out_shape = (B, gN, gM, C)
+
+      assert D == 2, "Last dim of `grid` must be size 2."
+
+      outputs = kernel(
+          inputs=[x, grid],
+          template=[("T", x.dtype)],
+          output_shapes=[out_shape],
+          output_dtypes=[x.dtype],
+          grid=(np.prod(out_shape), 1, 1),
+          threadgroup=(256, 1, 1),
+      )
+      return outputs[0]

 For a reasonably sized input such as:

 .. code-block:: python

-    x.shape = (8, 1024, 1024, 64)
-    grid.shape = (8, 256, 256, 2)
+  x.shape = (8, 1024, 1024, 64)
+  grid.shape = (8, 256, 256, 2)

 On an M1 Max, we see a big performance improvement:

@@ -281,11 +298,11 @@ On an M1 Max, we see a big performance improvement:
 Grid Sample VJP
 ---------------

-Since we decorated ``grid_sample`` with ``mx.custom_function``, we can now define
-its custom vjp transform so MLX can differentiate it.
+Since we decorated ``grid_sample`` with :func:`custom_function`, we can now
+define its custom vjp transform so MLX can differentiate it.

 The backwards pass requires atomically updating ``x_grad``/``grid_grad`` and so
-requires a few extra ``mx.fast.metal_kernel`` features:
+requires a few extra :func:`fast.metal_kernel` features:

 * ``init_value=0``
    Initialize all of the kernel's outputs to this value before it runs. This allows us to update only part of the output arrays with the kernel.
@@ -299,128 +316,129 @@ We can then implement the backwards pass as follows:

 .. code-block:: python

-    @grid_sample.vjp
-    def grid_sample_vjp(primals, cotangent, _):
-        x, grid = primals
-        B, _, _, C = x.shape
-        _, gN, gM, D = grid.shape
+  source = """
+      uint elem = thread_position_in_grid.x;
+      int H = x_shape[1];
+      int W = x_shape[2];
+      int C = x_shape[3];
+      // Pad C to the nearest larger simdgroup size multiple
+      int C_padded = ceildiv(C, threads_per_simdgroup) * threads_per_simdgroup;

-        assert D == 2, "Last dim of `grid` must be size 2."
+      int gH = grid_shape[1];
+      int gW = grid_shape[2];

-        source = """
-            uint elem = thread_position_in_grid.x;
-            int H = x_shape[1];
-            int W = x_shape[2];
-            int C = x_shape[3];
-            // Pad C to the nearest larger simdgroup size multiple
-            int C_padded = ceildiv(C, threads_per_simdgroup) * threads_per_simdgroup;
+      int w_stride = C;
+      int h_stride = W * w_stride;
+      int b_stride = H * h_stride;

-            int gH = grid_shape[1];
-            int gW = grid_shape[2];
+      uint grid_idx = elem / C_padded * 2;
+      float ix = ((grid[grid_idx] + 1) * W - 1) / 2;
+      float iy = ((grid[grid_idx + 1] + 1) * H - 1) / 2;

-            int w_stride = C;
-            int h_stride = W * w_stride;
-            int b_stride = H * h_stride;
+      int ix_nw = floor(ix);
+      int iy_nw = floor(iy);

-            uint grid_idx = elem / C_padded * 2;
-            float ix = ((grid[grid_idx] + 1) * W - 1) / 2;
-            float iy = ((grid[grid_idx + 1] + 1) * H - 1) / 2;
+      int ix_ne = ix_nw + 1;
+      int iy_ne = iy_nw;

-            int ix_nw = floor(ix);
-            int iy_nw = floor(iy);
+      int ix_sw = ix_nw;
+      int iy_sw = iy_nw + 1;

-            int ix_ne = ix_nw + 1;
-            int iy_ne = iy_nw;
+      int ix_se = ix_nw + 1;
+      int iy_se = iy_nw + 1;

-            int ix_sw = ix_nw;
-            int iy_sw = iy_nw + 1;
+      T nw = (ix_se - ix)    * (iy_se - iy);
+      T ne = (ix    - ix_sw) * (iy_sw - iy);
+      T sw = (ix_ne - ix)    * (iy    - iy_ne);
+      T se = (ix    - ix_nw) * (iy    - iy_nw);

-            int ix_se = ix_nw + 1;
-            int iy_se = iy_nw + 1;
+      int batch_idx = elem / C_padded / gH / gW * b_stride;
+      int channel_idx = elem % C_padded;
+      int base_idx = batch_idx + channel_idx;

-            T nw = (ix_se - ix)    * (iy_se - iy);
-            T ne = (ix    - ix_sw) * (iy_sw - iy);
-            T sw = (ix_ne - ix)    * (iy    - iy_ne);
-            T se = (ix    - ix_nw) * (iy    - iy_nw);
+      T gix = T(0);
+      T giy = T(0);
+      if (channel_idx < C) {
+          int cot_index = elem / C_padded * C + channel_idx;
+          T cot = cotangent[cot_index];
+          if (iy_nw >= 0 && iy_nw <= H - 1 && ix_nw >= 0 && ix_nw <= W - 1) {
+              int offset = base_idx + iy_nw * h_stride + ix_nw * w_stride;
+              atomic_fetch_add_explicit(&x_grad[offset], nw * cot, memory_order_relaxed);

-            int batch_idx = elem / C_padded / gH / gW * b_stride;
-            int channel_idx = elem % C_padded;
-            int base_idx = batch_idx + channel_idx;
+              T I_nw = x[offset];
+              gix -= I_nw * (iy_se - iy) * cot;
+              giy -= I_nw * (ix_se - ix) * cot;
+          }
+          if (iy_ne >= 0 && iy_ne <= H - 1 && ix_ne >= 0 && ix_ne <= W - 1) {
+              int offset = base_idx + iy_ne * h_stride + ix_ne * w_stride;
+              atomic_fetch_add_explicit(&x_grad[offset], ne * cot, memory_order_relaxed);

-            T gix = T(0);
-            T giy = T(0);
-            if (channel_idx < C) {
-                int cot_index = elem / C_padded * C + channel_idx;
-                T cot = cotangent[cot_index];
-                if (iy_nw >= 0 && iy_nw <= H - 1 && ix_nw >= 0 && ix_nw <= W - 1) {
-                    int offset = base_idx + iy_nw * h_stride + ix_nw * w_stride;
-                    atomic_fetch_add_explicit(&x_grad[offset], nw * cot, memory_order_relaxed);
+              T I_ne = x[offset];
+              gix += I_ne * (iy_sw - iy) * cot;
+              giy -= I_ne * (ix - ix_sw) * cot;
+          }
+          if (iy_sw >= 0 && iy_sw <= H - 1 && ix_sw >= 0 && ix_sw <= W - 1) {
+              int offset = base_idx + iy_sw * h_stride + ix_sw * w_stride;
+              atomic_fetch_add_explicit(&x_grad[offset], sw * cot, memory_order_relaxed);

-                    T I_nw = x[offset];
-                    gix -= I_nw * (iy_se - iy) * cot;
-                    giy -= I_nw * (ix_se - ix) * cot;
-                }
-                if (iy_ne >= 0 && iy_ne <= H - 1 && ix_ne >= 0 && ix_ne <= W - 1) {
-                    int offset = base_idx + iy_ne * h_stride + ix_ne * w_stride;
-                    atomic_fetch_add_explicit(&x_grad[offset], ne * cot, memory_order_relaxed);
+              T I_sw = x[offset];
+              gix -= I_sw * (iy - iy_ne) * cot;
+              giy += I_sw * (ix_ne - ix) * cot;
+          }
+          if (iy_se >= 0 && iy_se <= H - 1 && ix_se >= 0 && ix_se <= W - 1) {
+              int offset = base_idx + iy_se * h_stride + ix_se * w_stride;
+              atomic_fetch_add_explicit(&x_grad[offset], se * cot, memory_order_relaxed);

-                    T I_ne = x[offset];
-                    gix += I_ne * (iy_sw - iy) * cot;
-                    giy -= I_ne * (ix - ix_sw) * cot;
-                }
-                if (iy_sw >= 0 && iy_sw <= H - 1 && ix_sw >= 0 && ix_sw <= W - 1) {
-                    int offset = base_idx + iy_sw * h_stride + ix_sw * w_stride;
-                    atomic_fetch_add_explicit(&x_grad[offset], sw * cot, memory_order_relaxed);
+              T I_se = x[offset];
+              gix += I_se * (iy - iy_nw) * cot;
+              giy += I_se * (ix - ix_nw) * cot;
+          }
+      }

-                    T I_sw = x[offset];
-                    gix -= I_sw * (iy - iy_ne) * cot;
-                    giy += I_sw * (ix_ne - ix) * cot;
-                }
-                if (iy_se >= 0 && iy_se <= H - 1 && ix_se >= 0 && ix_se <= W - 1) {
-                    int offset = base_idx + iy_se * h_stride + ix_se * w_stride;
-                    atomic_fetch_add_explicit(&x_grad[offset], se * cot, memory_order_relaxed);
+      T gix_mult = W / 2;
+      T giy_mult = H / 2;

-                    T I_se = x[offset];
-                    gix += I_se * (iy - iy_nw) * cot;
-                    giy += I_se * (ix - ix_nw) * cot;
-                }
-            }
+      // Reduce across each simdgroup first.
+      // This is much faster than relying purely on atomics.
+      gix = simd_sum(gix);
+      giy = simd_sum(giy);

-            T gix_mult = W / 2;
-            T giy_mult = H / 2;
+      if (thread_index_in_simdgroup == 0) {
+          atomic_fetch_add_explicit(&grid_grad[grid_idx], gix * gix_mult, memory_order_relaxed);
+          atomic_fetch_add_explicit(&grid_grad[grid_idx + 1], giy * giy_mult, memory_order_relaxed);
+      }
+  """
+  kernel = mx.fast.metal_kernel(
+      name="grid_sample_grad",
+      input_names=["x", "grid", "cotangent"],
+      output_names=["x_grad", "grid_grad"],
+      source=source,
+      atomic_outputs=True,
+  )

-            // Reduce across each simdgroup first.
-            // This is much faster than relying purely on atomics.
-            gix = simd_sum(gix);
-            giy = simd_sum(giy);
+  @grid_sample.vjp
+  def grid_sample_vjp(primals, cotangent, _):
+      x, grid = primals
+      B, _, _, C = x.shape
+      _, gN, gM, D = grid.shape

-            if (thread_index_in_simdgroup == 0) {
-                atomic_fetch_add_explicit(&grid_grad[grid_idx], gix * gix_mult, memory_order_relaxed);
-                atomic_fetch_add_explicit(&grid_grad[grid_idx + 1], giy * giy_mult, memory_order_relaxed);
-            }
-        """
-        kernel = mx.fast.metal_kernel(
-            name="grid_sample_grad",
-            input_names=["x", "grid", "cotangent"],
-            output_names=["x_grad", "grid_grad"],
-            source=source,
-            atomic_outputs=True,
-        )
-        # pad the output channels to simd group size
-        # so that our `simd_sum`s don't overlap.
-        simdgroup_size = 32
-        C_padded = (C + simdgroup_size - 1) // simdgroup_size * simdgroup_size
-        grid_size = B * gN * gM * C_padded
-        outputs = kernel(
-            inputs=[x, grid, cotangent],
-            template=[("T", x.dtype)],
-            output_shapes=[x.shape, grid.shape],
-            output_dtypes=[x.dtype, x.dtype],
-            grid=(grid_size, 1, 1),
-            threadgroup=(256, 1, 1),
-            init_value=0,
-        )
-        return outputs[0], outputs[1]
+      assert D == 2, "Last dim of `grid` must be size 2."
+
+      # pad the output channels to simd group size
+      # so that our `simd_sum`s don't overlap.
+      simdgroup_size = 32
+      C_padded = (C + simdgroup_size - 1) // simdgroup_size * simdgroup_size
+      grid_size = B * gN * gM * C_padded
+      outputs = kernel(
+          inputs=[x, grid, cotangent],
+          template=[("T", x.dtype)],
+          output_shapes=[x.shape, grid.shape],
+          output_dtypes=[x.dtype, x.dtype],
+          grid=(grid_size, 1, 1),
+          threadgroup=(256, 1, 1),
+          init_value=0,
+      )
+      return outputs[0], outputs[1]

 There's an even larger speed up for the vjp:

--- a/docs/src/dev/extensions.rst
+++ b/docs/src/dev/extensions.rst
@@ -22,12 +22,12 @@ You can do that in MLX directly:
 This function performs that operation while leaving the implementation and
 function transformations to MLX.

-However you may need to customize the underlying implementation, perhaps to
-make it faster or for custom differentiation. In this tutorial we will go
-through adding custom extensions. It will cover:
+However, you may want to customize the underlying implementation, perhaps to
+make it faster. In this tutorial we will go through adding custom extensions.
+It will cover:

 * The structure of the MLX library.
-* Implementing a CPU operation that redirects to Accelerate_ when appropriate.
+* Implementing a CPU operation.
 * Implementing a GPU operation using metal.
 * Adding the ``vjp`` and ``jvp`` function transformation.
 * Building a custom extension and binding it to python.
@@ -45,7 +45,7 @@ Operations
 Operations are the front-end functions that operate on arrays. They are defined
 in the C++ API (:ref:`cpp_ops`), and the Python API (:ref:`ops`) binds them.

-We would like an operation, :meth:`axpby` that takes in two arrays ``x`` and
+We would like an operation :meth:`axpby` that takes in two arrays, ``x`` and
 ``y``, and two scalars, ``alpha`` and ``beta``. This is how to define it in
 C++:

@@ -55,7 +55,7 @@ C++:
    *  Scale and sum two vectors element-wise
    *  z = alpha * x + beta * y
    *
-    *  Follow numpy style broadcasting between x and y
+    *  Use NumPy-style broadcasting between x and y
    *  Inputs are upcasted to floats if needed
    **/
    array axpby(
@@ -66,7 +66,7 @@ C++:
        StreamOrDevice s = {} // Stream on which to schedule the operation
    );

-The simplest way to this operation is in terms of existing operations:
+The simplest way to implement this is with existing operations:

 .. code-block:: C++

@@ -93,9 +93,9 @@ Primitives
 ^^^^^^^^^^^

 A :class:`Primitive` is part of the computation graph of an :class:`array`. It
-defines how to create outputs arrays given a input arrays. Further, a
+defines how to create output arrays given input arrays. Further, a
 :class:`Primitive` has methods to run on the CPU or GPU and for function
-transformations such as ``vjp`` and ``jvp``.  Lets go back to our example to be
+transformations such as ``vjp`` and ``jvp``.  Let's go back to our example to be
 more concrete:

 .. code-block:: C++
@@ -128,7 +128,7 @@ more concrete:
        /** The vector-Jacobian product. */
        std::vector<array> vjp(
            const std::vector<array>& primals,
-            const array& cotan,
+            const std::vector<array>& cotangents,
            const std::vector<int>& argnums,
            const std::vector<array>& outputs) override;

@@ -138,13 +138,13 @@ more concrete:
        * representing the vectorized computation and the axis which
        * corresponds to the output vectorized dimension.
        */
-        virtual std::pair<std::vector<array>, std::vector<int>> vmap(
+        std::pair<std::vector<array>, std::vector<int>> vmap(
            const std::vector<array>& inputs,
            const std::vector<int>& axes) override;

-        /** Print the primitive. */
-        void print(std::ostream& os) override {
-            os << "Axpby";
+        /** The name of the primitive. */
+        const char* name() const override {
+          return "Axpby";
        }

        /** Equivalence check **/
@@ -153,9 +153,6 @@ more concrete:
      private:
        float alpha_;
        float beta_;
-
-        /** Fall back implementation for evaluation on CPU */
-        void eval(const std::vector<array>& inputs, array& out);
    };

 The :class:`Axpby` class derives from the base :class:`Primitive` class. The
@@ -188,7 +185,7 @@ Let's reimplement our operation now in terms of our :class:`Axpby` primitive.
        auto promoted_dtype = promote_types(x.dtype(), y.dtype());

        // Upcast to float32 for non-floating point inputs x and y
-        auto out_dtype = is_floating_point(promoted_dtype)
+        auto out_dtype = issubdtype(promoted_dtype, float32)
            ? promoted_dtype
            : promote_types(promoted_dtype, float32);

@@ -234,49 +231,57 @@ the execution of the computation graph, and calls :meth:`Axpby::eval_cpu` or
 Implementing the CPU Back-end
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

-Let's start by implementing a naive and generic version of
-:meth:`Axpby::eval_cpu`. We declared this as a private member function of
-:class:`Axpby` earlier called :meth:`Axpby::eval`.
+Let's start by implementing :meth:`Axpby::eval_cpu`.

-Our naive method will go over each element of the output array, find the
+The method will go over each element of the output array, find the
 corresponding input elements of ``x`` and ``y`` and perform the operation
 point-wise. This is captured in the templated function :meth:`axpby_impl`.

 .. code-block:: C++

-    template <typename T>
-    void axpby_impl(
-            const array& x,
-            const array& y,
-            array& out,
-            float alpha_,
-            float beta_) {
-        // We only allocate memory when we are ready to fill the output
-        // malloc_or_wait synchronously allocates available memory
-        // There may be a wait executed here if the allocation is requested
-        // under memory-pressured conditions
-        out.set_data(allocator::malloc_or_wait(out.nbytes()));
+  template <typename T>
+  void axpby_impl(
+      const mx::array& x,
+      const mx::array& y,
+      mx::array& out,
+      float alpha_,
+      float beta_,
+      mx::Stream stream) {
+    out.set_data(mx::allocator::malloc(out.nbytes()));

-        // Collect input and output data pointers
-        const T* x_ptr = x.data<T>();
-        const T* y_ptr = y.data<T>();
-        T* out_ptr = out.data<T>();
+    // Get the CPU command encoder and register input and output arrays
+    auto& encoder = mx::cpu::get_command_encoder(stream);
+    encoder.set_input_array(x);
+    encoder.set_input_array(y);
+    encoder.set_output_array(out);

-        // Cast alpha and beta to the relevant types
-        T alpha = static_cast<T>(alpha_);
-        T beta = static_cast<T>(beta_);
+    // Launch the CPU kernel
+    encoder.dispatch([x_ptr = x.data<T>(),
+                      y_ptr = y.data<T>(),
+                      out_ptr = out.data<T>(),
+                      size = out.size(),
+                      shape = out.shape(),
+                      x_strides = x.strides(),
+                      y_strides = y.strides(),
+                      alpha_,
+                      beta_]() {

-        // Do the element-wise operation for each output
-        for (size_t out_idx = 0; out_idx < out.size(); out_idx++) {
-            // Map linear indices to offsets in x and y
-            auto x_offset = elem_to_loc(out_idx, x.shape(), x.strides());
-            auto y_offset = elem_to_loc(out_idx, y.shape(), y.strides());
+      // Cast alpha and beta to the relevant types
+      T alpha = static_cast<T>(alpha_);
+      T beta = static_cast<T>(beta_);

-            // We allocate the output to be contiguous and regularly strided
-            // (defaults to row major) and hence it doesn't need additional mapping
-            out_ptr[out_idx] = alpha * x_ptr[x_offset] + beta * y_ptr[y_offset];
-        }
-    }
+      // Do the element-wise operation for each output
+      for (size_t out_idx = 0; out_idx < size; out_idx++) {
+        // Map linear indices to offsets in x and y
+        auto x_offset = mx::elem_to_loc(out_idx, shape, x_strides);
+        auto y_offset = mx::elem_to_loc(out_idx, shape, y_strides);
+
+        // We allocate the output to be contiguous and regularly strided
+        // (defaults to row major) and hence it doesn't need additional mapping
+        out_ptr[out_idx] = alpha * x_ptr[x_offset] + beta * y_ptr[y_offset];
+      }
+    });
+  }

 Our implementation should work for all incoming floating point arrays.
 Accordingly, we add dispatches for ``float32``, ``float16``, ``bfloat16`` and
@@ -284,112 +289,32 @@ Accordingly, we add dispatches for ``float32``, ``float16``, ``bfloat16`` and

 .. code-block:: C++

-    /** Fall back implementation for evaluation on CPU */
-    void Axpby::eval(
-      const std::vector<array>& inputs,
-      const std::vector<array>& outputs) {
-        auto& x = inputs[0];
-        auto& y = inputs[1];
-        auto& out = outputs[0];
-
-        // Dispatch to the correct dtype
-        if (out.dtype() == float32) {
-            return axpby_impl<float>(x, y, out, alpha_, beta_);
-        } else if (out.dtype() == float16) {
-            return axpby_impl<float16_t>(x, y, out, alpha_, beta_);
-        } else if (out.dtype() == bfloat16) {
-            return axpby_impl<bfloat16_t>(x, y, out, alpha_, beta_);
-        } else if (out.dtype() == complex64) {
-            return axpby_impl<complex64_t>(x, y, out, alpha_, beta_);
-        } else {
-            throw std::runtime_error(
-                "[Axpby] Only supports floating point types.");
-        }
-    }
-
-This is good as a fallback implementation. We can use the ``axpby`` routine
-provided by the Accelerate_ framework for a faster implementation in certain
-cases:
-
-#.  Accelerate does not provide implementations of ``axpby`` for half precision
-    floats. We can only use it for ``float32`` types.
-#.  Accelerate assumes the inputs ``x`` and ``y`` are contiguous and all
-    elements have fixed strides between them. We only direct to Accelerate
-    if both ``x`` and ``y`` are row contiguous or column contiguous.
-#.  Accelerate performs the routine ``Y = (alpha * X) + (beta * Y)`` in-place.
-    MLX expects to write the output to a new array. We must copy the elements
-    of ``y`` into the output and use that as an input to ``axpby``.
-
-Let's write an implementation that uses Accelerate in the right conditions.
-It allocates data for the output, copies ``y`` into it, and then calls the
-:func:`catlas_saxpby` from accelerate.
-
-.. code-block:: C++
-
-    template <typename T>
-    void axpby_impl_accelerate(
-            const array& x,
-            const array& y,
-            array& out,
-            float alpha_,
-            float beta_) {
-        // Accelerate library provides catlas_saxpby which does
-        // Y = (alpha * X) + (beta * Y) in place
-        // To use it, we first copy the data in y over to the output array
-        out.set_data(allocator::malloc_or_wait(out.nbytes()));
-
-        // We then copy over the elements using the contiguous vector specialization
-        copy_inplace(y, out, CopyType::Vector);
-
-        // Get x and y pointers for catlas_saxpby
-        const T* x_ptr = x.data<T>();
-        T* y_ptr = out.data<T>();
-
-        T alpha = static_cast<T>(alpha_);
-        T beta = static_cast<T>(beta_);
-
-        // Call the inplace accelerate operator
-        catlas_saxpby(
-            /* N = */ out.size(),
-            /* ALPHA = */ alpha,
-            /* X = */ x_ptr,
-            /* INCX = */ 1,
-            /* BETA = */ beta,
-            /* Y = */ y_ptr,
-            /* INCY = */ 1);
-    }
-
-For inputs that do not fit the criteria for accelerate, we fall back to
-:meth:`Axpby::eval`. With this in mind, let's finish our
-:meth:`Axpby::eval_cpu`.
-
-.. code-block:: C++
-
-    /** Evaluate primitive on CPU using accelerate specializations */
    void Axpby::eval_cpu(
-      const std::vector<array>& inputs,
-      const std::vector<array>& outputs) {
-        assert(inputs.size() == 2);
-        auto& x = inputs[0];
-        auto& y = inputs[1];
-        auto& out = outputs[0];
+        const std::vector<mx::array>& inputs,
+        std::vector<mx::array>& outputs) {
+      auto& x = inputs[0];
+      auto& y = inputs[1];
+      auto& out = outputs[0];

-        // Accelerate specialization for contiguous single precision float arrays
-        if (out.dtype() == float32 &&
-            ((x.flags().row_contiguous && y.flags().row_contiguous) ||
-            (x.flags().col_contiguous && y.flags().col_contiguous))) {
-            axpby_impl_accelerate<float>(x, y, out, alpha_, beta_);
-            return;
-        }
-
-        // Fall back to common back-end if specializations are not available
-        eval(inputs, outputs);
+      // Dispatch to the correct dtype
+      if (out.dtype() == mx::float32) {
+        return axpby_impl<float>(x, y, out, alpha_, beta_, stream());
+      } else if (out.dtype() == mx::float16) {
+        return axpby_impl<mx::float16_t>(x, y, out, alpha_, beta_, stream());
+      } else if (out.dtype() == mx::bfloat16) {
+        return axpby_impl<mx::bfloat16_t>(x, y, out, alpha_, beta_, stream());
+      } else if (out.dtype() == mx::complex64) {
+        return axpby_impl<mx::complex64_t>(x, y, out, alpha_, beta_, stream());
+      } else {
+        throw std::runtime_error(
+            "Axpby is only supported for floating point types.");
+      }
    }

 Just this much is enough to run the operation :meth:`axpby` on a CPU stream! If
 you do not plan on running the operation on the GPU or using transforms on
 computation graphs that contain :class:`Axpby`, you can stop implementing the
-primitive here and enjoy the speed-ups you get from the Accelerate library.
+primitive here.

 Implementing the GPU Back-end
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -466,17 +391,17 @@ below.
        auto& d = metal::device(s.device);

        // Allocate output memory
-        out.set_data(allocator::malloc_or_wait(out.nbytes()));
+        out.set_data(allocator::malloc(out.nbytes()));

        // Resolve name of kernel
-        std::ostringstream kname;
-        kname << "axpby_" << "general_" << type_to_name(out);
+        std::string kname;
+        kname = "axpby_general_" + type_to_name(out);

-        // Make sure the metal library is available
-        d.register_library("mlx_ext");
+        // Load the metal library
+        auto lib = d.get_library("mlx_ext", current_binary_dir());

        // Make a kernel from this metal library
-        auto kernel = d.get_kernel(kname.str(), "mlx_ext");
+        auto kernel = d.get_kernel(kname, lib);

        // Prepare to encode kernel
        auto& compute_encoder = d.get_command_encoder(s.index);
@@ -544,7 +469,7 @@ one we just defined:
            const std::vector<array>& tangents,
            const std::vector<int>& argnums) {
        // Forward mode diff that pushes along the tangents
-        // The jvp transform on the primitive can built with ops
+        // The jvp transform on the primitive can be built with ops
        // that are scheduled on the same stream as the primitive

        // If argnums = {0}, we only push along x in which case the
@@ -556,7 +481,7 @@ one we just defined:
            auto scale_arr = array(scale, tangents[0].dtype());
            return {multiply(scale_arr, tangents[0], stream())};
        }
-        // If, argnums = {0, 1}, we take contributions from both
+        // If argnums = {0, 1}, we take contributions from both
        // which gives us jvp = tangent_x * alpha + tangent_y * beta
        else {
            return {axpby(tangents[0], tangents[1], alpha_, beta_, stream())};
@@ -810,7 +735,7 @@ Let's look at a simple script and its results:

    print(f"c shape: {c.shape}")
    print(f"c dtype: {c.dtype}")
-    print(f"c correct: {mx.all(c == 6.0).item()}")
+    print(f"c is correct: {mx.all(c == 6.0).item()}")

 Output:

@@ -818,13 +743,13 @@ Output:

    c shape: [3, 4]
    c dtype: float32
-    c correctness: True
+    c is correct: True

 Results
 ^^^^^^^

 Let's run a quick benchmark and see how our new ``axpby`` operation compares
-with the naive :meth:`simple_axpby` we first defined on the CPU.
+with the naive :meth:`simple_axpby` we first defined.

 .. code-block:: python

@@ -832,13 +757,11 @@ with the naive :meth:`simple_axpby` we first defined on the CPU.
    from mlx_sample_extensions import axpby
    import time

-    mx.set_default_device(mx.cpu)
-
    def simple_axpby(x: mx.array, y: mx.array, alpha: float, beta: float) -> mx.array:
        return alpha * x + beta * y

-    M = 256
-    N = 512
+    M = 4096
+    N = 4096

    x = mx.random.normal((M, N))
    y = mx.random.normal((M, N))
@@ -849,24 +772,24 @@ with the naive :meth:`simple_axpby` we first defined on the CPU.

    def bench(f):
        # Warm up
-        for i in range(100):
+        for i in range(5):
            z = f(x, y, alpha, beta)
            mx.eval(z)

        # Timed run
        s = time.time()
-        for i in range(5000):
+        for i in range(100):
            z = f(x, y, alpha, beta)
            mx.eval(z)
        e = time.time()
-        return e - s
+        return 1000 * (e - s) / 100

    simple_time = bench(simple_axpby)
    custom_time = bench(axpby)

-    print(f"Simple axpby: {simple_time:.3f} s | Custom axpby: {custom_time:.3f} s")
+    print(f"Simple axpby: {simple_time:.3f} ms | Custom axpby: {custom_time:.3f} ms")

-The results are ``Simple axpby: 0.114 s | Custom axpby: 0.109 s``. We see
+The results are ``Simple axpby: 1.559 ms | Custom axpby: 0.774 ms``. We see
 modest improvements right away!

 This operation is now good to be used to build other operations, in
--- a/docs/src/index.rst
+++ b/docs/src/index.rst
@@ -70,6 +70,7 @@ are the CPU and GPU.
   python/fft
   python/linalg
   python/metal
+   python/memory_management
   python/nn
   python/optimizers
   python/distributed
--- a/docs/src/install.rst
+++ b/docs/src/install.rst
@@ -13,7 +13,7 @@ silicon computer is

    pip install mlx

-To install from PyPI you must meet the following requirements:
+To install from PyPI your system must meet the following requirements:

 - Using an M series chip (Apple silicon)
 - Using a native Python >= 3.9
@@ -23,12 +23,39 @@ To install from PyPI you must meet the following requirements:
    MLX is only available on devices running macOS >= 13.5
    It is highly recommended to use macOS 14 (Sonoma)

+CUDA
+^^^^

-MLX is also available on conda-forge. To install MLX with conda do:
+MLX has a CUDA backend which you can install with:

 .. code-block:: shell

-   conda install conda-forge::mlx
+    pip install mlx[cuda]
+
+To install the CUDA package from PyPI your system must meet the following
+requirements:
+
+- Nvidia architecture >= SM 7.0 (Volta)
+- Nvidia driver >= 550.54.14
+- CUDA toolkit >= 12.0
+- Linux distribution with glibc >= 2.35
+- Python >= 3.9
+
+
+CPU-only (Linux)
+^^^^^^^^^^^^^^^^
+
+For a CPU-only version of MLX that runs on Linux use:
+
+.. code-block:: shell
+
+    pip install mlx[cpu]
+
+To install the CPU-only package from PyPI your system must meet the following
+requirements:
+
+- Linux distribution with glibc >= 2.35
+- Python >= 3.9


 Troubleshooting
@@ -65,6 +92,8 @@ Build Requirements
 Python API
 ^^^^^^^^^^

+.. _python install:
+
 To build and install the MLX python library from source, first, clone MLX from
 `its GitHub repo <https://github.com/ml-explore/mlx>`_:

@@ -76,20 +105,20 @@ Then simply build and install MLX using pip:

 .. code-block:: shell

-  CMAKE_BUILD_PARALLEL_LEVEL=8 pip install .
+  pip install .

 For developing, install the package with development dependencies, and use an
 editable install:

 .. code-block:: shell

-  CMAKE_BUILD_PARALLEL_LEVEL=8 pip install -e ".[dev]"
+  pip install -e ".[dev]"

 Once the development dependencies are installed, you can build faster with:

 .. code-block:: shell

- CMAKE_BUILD_PARALLEL_LEVEL=8 python setup.py build_ext --inplace
+ python setup.py build_ext --inplace

 Run the tests with:

@@ -107,6 +136,8 @@ IDE:
 C++ API
 ^^^^^^^

+.. _cpp install:
+
 Currently, MLX must be built and installed from source.

 Similarly to the python library, to build and install the MLX C++ library start
@@ -185,6 +216,7 @@ should point to the path to the built metal library.

      xcrun -sdk macosx --show-sdk-version

+
 Binary Size Minimization
 ~~~~~~~~~~~~~~~~~~~~~~~~

@@ -213,6 +245,50 @@ be anwywhere from a few hundred millisecond to a few seconds depending on the
 application. Once a kernel is compiled, it will be cached by the system. The
 Metal kernel cache persists across reboots.

+Linux
+^^^^^
+
+To build from source on Linux (CPU only), install the BLAS and LAPACK headers.
+For example on Ubuntu, run the following:
+
+.. code-block:: shell
+
+   apt-get update -y
+   apt-get install libblas-dev liblapack-dev liblapacke-dev -y
+
+From here follow the instructions to install either the :ref:`Python <python
+install>` or :ref:`C++ <cpp install>` APIs.
+
+CUDA
+^^^^
+
+To build from source on Linux with CUDA, install the BLAS and LAPACK headers
+and the CUDA toolkit. For example on Ubuntu, run the following:
+
+.. code-block:: shell
+
+   wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
+   dpkg -i cuda-keyring_1.1-1_all.deb
+   apt-get update -y
+   apt-get -y install cuda-toolkit-12-9
+   apt-get install libblas-dev liblapack-dev liblapacke-dev -y
+
+
+When building either the Python or C++ APIs make sure to pass the cmake flag
+``MLX_BUILD_CUDA=ON``. For example, to build the Python API run:
+
+.. code-block:: shell
+
+  CMAKE_ARGS="-DMLX_BUILD_CUDA=ON" pip install -e ".[dev]"
+
+To build the C++ package run:
+
+.. code-block:: shell
+
+   mkdir -p build && cd build
+   cmake .. -DMLX_BUILD_CUDA=ON && make -j
+
+
 Troubleshooting
 ^^^^^^^^^^^^^^^

--- a/docs/src/python/array.rst
+++ b/docs/src/python/array.rst
@@ -19,6 +19,8 @@ Array
    array.ndim
    array.shape
    array.size
+    array.real
+    array.imag
    array.abs
    array.all
    array.any
@@ -38,6 +40,7 @@ Array
    array.log10
    array.log1p
    array.log2
+    array.logcumsumexp
    array.logsumexp
    array.max
    array.mean
--- a/docs/src/python/fft.rst
+++ b/docs/src/python/fft.rst
@@ -20,3 +20,5 @@ FFT
  irfft2
  rfftn
  irfftn
+  fftshift
+  ifftshift
--- a/docs/src/python/linalg.rst
+++ b/docs/src/python/linalg.rst
@@ -16,9 +16,12 @@ Linear Algebra
    cross
    qr
    svd
+    eigvals
+    eig
    eigvalsh
    eigh
    lu
    lu_factor
+    pinv
    solve
    solve_triangular
--- a/docs/src/python/memory_management.rst
+++ b/docs/src/python/memory_management.rst
@@ -0,0 +1,16 @@
+Memory Management
+=================
+
+.. currentmodule:: mlx.core
+
+.. autosummary::
+  :toctree: _autosummary
+
+  get_active_memory
+  get_peak_memory
+  reset_peak_memory
+  get_cache_memory
+  set_memory_limit
+  set_cache_limit
+  set_wired_limit
+  clear_cache
--- a/docs/src/python/metal.rst
+++ b/docs/src/python/metal.rst
@@ -8,13 +8,5 @@ Metal

  is_available
  device_info
-  get_active_memory
-  get_peak_memory
-  reset_peak_memory
-  get_cache_memory
-  set_memory_limit
-  set_cache_limit
-  set_wired_limit
-  clear_cache
  start_capture
  stop_capture
--- a/docs/src/python/ops.rst
+++ b/docs/src/python/ops.rst
@@ -36,10 +36,12 @@ Operations
   bitwise_or
   bitwise_xor
   block_masked_mm
+   broadcast_arrays
   broadcast_to
   ceil
   clip
   concatenate
+   contiguous
   conj
   conjugate
   convolve
@@ -101,6 +103,7 @@ Operations
   log10
   log1p
   logaddexp
+   logcumsumexp
   logical_not
   logical_and
   logical_or
--- a/docs/src/python/optimizers/common_optimizers.rst
+++ b/docs/src/python/optimizers/common_optimizers.rst
@@ -18,3 +18,5 @@ Common Optimizers
   AdamW
   Adamax
   Lion
+   MultiOptimizer
+   Muon
--- a/docs/src/python/transforms.rst
+++ b/docs/src/python/transforms.rst
@@ -9,6 +9,7 @@ Transforms
  :toctree: _autosummary

   eval
+   async_eval
   compile
   custom_function
   disable_compile
--- a/docs/src/usage/indexing.rst
+++ b/docs/src/usage/indexing.rst
@@ -107,6 +107,16 @@ same array:
  >>> a
  array([1, 2, 0], dtype=int32)

+
+Note, unlike NumPy, updates to the same location are nondeterministic:
+
+.. code-block:: shell
+
+  >>> a = mx.array([1, 2, 3])
+  >>> a[[0, 0]] = mx.array([4, 5])
+
+The first element of ``a`` could be ``4`` or ``5``.
+
 Transformations of functions which use in-place updates are allowed and work as
 expected. For example:

--- a/examples/extensions/CMakeLists.txt
+++ b/examples/extensions/CMakeLists.txt
@@ -10,7 +10,6 @@ set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 option(BUILD_SHARED_LIBS "Build extensions as a shared library" ON)

 # ----------------------------- Dependencies -----------------------------
-find_package(MLX CONFIG REQUIRED)
 find_package(
  Python 3.8
  COMPONENTS Interpreter Development.Module
@@ -21,6 +20,12 @@ execute_process(
  OUTPUT_VARIABLE nanobind_ROOT)
 find_package(nanobind CONFIG REQUIRED)

+execute_process(
+  COMMAND "${Python_EXECUTABLE}" -m mlx --cmake-dir
+  OUTPUT_STRIP_TRAILING_WHITESPACE
+  OUTPUT_VARIABLE MLX_ROOT)
+find_package(MLX CONFIG REQUIRED)
+
 # ----------------------------- Extensions -----------------------------

 # Add library
--- a/examples/extensions/axpby/axpby.cpp
+++ b/examples/extensions/axpby/axpby.cpp
@@ -1,20 +1,15 @@
-// Copyright © 2023-2024 Apple Inc.
+// Copyright © 2023-2025 Apple Inc.

-#include <cassert>
+#include <dlfcn.h>
 #include <iostream>
 #include <sstream>

-#include "mlx/backend/common/copy.h"
 #include "mlx/backend/common/utils.h"
-#include "mlx/backend/cpu/copy.h"
+#include "mlx/backend/cpu/encoder.h"
 #include "mlx/utils.h"

 #include "axpby/axpby.h"

-#ifdef ACCELERATE_NEW_LAPACK
-#include <vecLib/cblas_new.h>
-#endif
-
 #ifdef _METAL_
 #include "mlx/backend/metal/device.h"
 #include "mlx/backend/metal/utils.h"
@@ -22,6 +17,19 @@

 namespace my_ext {

+// A helper function to find the location of the current binary on disk.
+// The Metal library ("mlx_ext.mtllib") should be in the same directory.
+std::string current_binary_dir() {
+  static std::string binary_dir = []() {
+    Dl_info info;
+    if (!dladdr(reinterpret_cast<void*>(&current_binary_dir), &info)) {
+      throw std::runtime_error("Unable to get current binary dir.");
+    }
+    return std::filesystem::path(info.dli_fname).parent_path().string();
+  }();
+  return binary_dir;
+}
+
 ///////////////////////////////////////////////////////////////////////////////
 // Operation Implementation
 ///////////////////////////////////////////////////////////////////////////////
@@ -76,136 +84,65 @@ void axpby_impl(
    const mx::array& y,
    mx::array& out,
    float alpha_,
-    float beta_) {
-  // We only allocate memory when we are ready to fill the output
-  // malloc_or_wait synchronously allocates available memory
-  // There may be a wait executed here if the allocation is requested
-  // under memory-pressured conditions
-  out.set_data(mx::allocator::malloc_or_wait(out.nbytes()));
+    float beta_,
+    mx::Stream stream) {
+  out.set_data(mx::allocator::malloc(out.nbytes()));

-  // Collect input and output data pointers
-  const T* x_ptr = x.data<T>();
-  const T* y_ptr = y.data<T>();
-  T* out_ptr = out.data<T>();
+  // Get the CPU command encoder and register input and output arrays
+  auto& encoder = mx::cpu::get_command_encoder(stream);
+  encoder.set_input_array(x);
+  encoder.set_input_array(y);
+  encoder.set_output_array(out);

-  // Cast alpha and beta to the relevant types
-  T alpha = static_cast<T>(alpha_);
-  T beta = static_cast<T>(beta_);
+  // Launch the CPU kernel
+  encoder.dispatch([x_ptr = x.data<T>(),
+                    y_ptr = y.data<T>(),
+                    out_ptr = out.data<T>(),
+                    size = out.size(),
+                    shape = out.shape(),
+                    x_strides = x.strides(),
+                    y_strides = y.strides(),
+                    alpha_,
+                    beta_]() {
+    // Cast alpha and beta to the relevant types
+    T alpha = static_cast<T>(alpha_);
+    T beta = static_cast<T>(beta_);

-  // Do the element-wise operation for each output
-  for (size_t out_idx = 0; out_idx < out.size(); out_idx++) {
-    // Map linear indices to offsets in x and y
-    auto x_offset = mx::elem_to_loc(out_idx, x.shape(), x.strides());
-    auto y_offset = mx::elem_to_loc(out_idx, y.shape(), y.strides());
+    // Do the element-wise operation for each output
+    for (size_t out_idx = 0; out_idx < size; out_idx++) {
+      // Map linear indices to offsets in x and y
+      auto x_offset = mx::elem_to_loc(out_idx, shape, x_strides);
+      auto y_offset = mx::elem_to_loc(out_idx, shape, y_strides);

-    // We allocate the output to be contiguous and regularly strided
-    // (defaults to row major) and hence it doesn't need additional mapping
-    out_ptr[out_idx] = alpha * x_ptr[x_offset] + beta * y_ptr[y_offset];
-  }
+      // We allocate the output to be contiguous and regularly strided
+      // (defaults to row major) and hence it doesn't need additional mapping
+      out_ptr[out_idx] = alpha * x_ptr[x_offset] + beta * y_ptr[y_offset];
+    }
+  });
 }

-/** Fall back implementation for evaluation on CPU */
-void Axpby::eval(
+void Axpby::eval_cpu(
    const std::vector<mx::array>& inputs,
    std::vector<mx::array>& outputs) {
-  // Check the inputs (registered in the op while constructing the out array)
-  assert(inputs.size() == 2);
  auto& x = inputs[0];
  auto& y = inputs[1];
  auto& out = outputs[0];

  // Dispatch to the correct dtype
  if (out.dtype() == mx::float32) {
-    return axpby_impl<float>(x, y, out, alpha_, beta_);
+    return axpby_impl<float>(x, y, out, alpha_, beta_, stream());
  } else if (out.dtype() == mx::float16) {
-    return axpby_impl<mx::float16_t>(x, y, out, alpha_, beta_);
+    return axpby_impl<mx::float16_t>(x, y, out, alpha_, beta_, stream());
  } else if (out.dtype() == mx::bfloat16) {
-    return axpby_impl<mx::bfloat16_t>(x, y, out, alpha_, beta_);
+    return axpby_impl<mx::bfloat16_t>(x, y, out, alpha_, beta_, stream());
  } else if (out.dtype() == mx::complex64) {
-    return axpby_impl<mx::complex64_t>(x, y, out, alpha_, beta_);
+    return axpby_impl<mx::complex64_t>(x, y, out, alpha_, beta_, stream());
  } else {
    throw std::runtime_error(
        "Axpby is only supported for floating point types.");
  }
 }

-///////////////////////////////////////////////////////////////////////////////
-// Primitive Accelerate Backend Implementation
-///////////////////////////////////////////////////////////////////////////////
-
-#ifdef ACCELERATE_NEW_LAPACK
-
-template <typename T>
-void axpby_impl_accelerate(
-    const mx::array& x,
-    const mx::array& y,
-    mx::array& out,
-    float alpha_,
-    float beta_) {
-  // Accelerate library provides catlas_saxpby which does
-  // Y = (alpha * X) + (beta * Y) in place
-  // To use it, we first copy the data in y over to the output array
-
-  // This specialization requires both x and y be contiguous in the same mode
-  // i.e: corresponding linear indices in both point to corresponding elements
-  // The data in the output array is allocated to match the strides in y
-  // such that x, y, and out are contiguous in the same mode and
-  // no transposition is needed
-  out.set_data(mx::allocator::malloc_or_wait(out.nbytes()));
-
-  // We then copy over the elements using the contiguous vector specialization
-  copy_inplace(y, out, mx::CopyType::Vector);
-
-  // Get x and y pointers for catlas_saxpby
-  const T* x_ptr = x.data<T>();
-  T* y_ptr = out.data<T>();
-
-  T alpha = static_cast<T>(alpha_);
-  T beta = static_cast<T>(beta_);
-
-  // Call the inplace accelerate operator
-  catlas_saxpby(
-      /* N = */ out.size(),
-      /* ALPHA = */ alpha,
-      /* X = */ x_ptr,
-      /* INCX = */ 1,
-      /* BETA = */ beta,
-      /* Y = */ y_ptr,
-      /* INCY = */ 1);
-}
-
-/** Evaluate primitive on CPU using accelerate specializations */
-void Axpby::eval_cpu(
-    const std::vector<mx::array>& inputs,
-    std::vector<mx::array>& outputs) {
-  assert(inputs.size() == 2);
-  auto& x = inputs[0];
-  auto& y = inputs[1];
-  auto& out = outputs[0];
-
-  // Accelerate specialization for contiguous single precision float arrays
-  if (out.dtype() == mx::float32 &&
-      ((x.flags().row_contiguous && y.flags().row_contiguous) ||
-       (x.flags().col_contiguous && y.flags().col_contiguous))) {
-    axpby_impl_accelerate<float>(x, y, out, alpha_, beta_);
-    return;
-  }
-
-  // Fall back to common backend if specializations are not available
-  eval(inputs, outputs);
-}
-
-#else // Accelerate not available
-
-/** Evaluate primitive on CPU falling back to common backend */
-void Axpby::eval_cpu(
-    const std::vector<mx::array>& inputs,
-    std::vector<mx::array>& outputs) {
-  eval(inputs, outputs);
-}
-
-#endif
-
 ///////////////////////////////////////////////////////////////////////////////
 // Primitive Metal Backend Implementation
 ///////////////////////////////////////////////////////////////////////////////
@@ -217,7 +154,6 @@ void Axpby::eval_gpu(
    const std::vector<mx::array>& inputs,
    std::vector<mx::array>& outputs) {
  // Prepare inputs
-  assert(inputs.size() == 2);
  auto& x = inputs[0];
  auto& y = inputs[1];
  auto& out = outputs[0];
@@ -236,25 +172,24 @@ void Axpby::eval_gpu(
  // Allocate output memory with strides based on specialization
  if (contiguous_kernel) {
    out.set_data(
-        mx::allocator::malloc_or_wait(x.data_size() * out.itemsize()),
+        mx::allocator::malloc(x.data_size() * out.itemsize()),
        x.data_size(),
        x.strides(),
        x.flags());
  } else {
-    out.set_data(mx::allocator::malloc_or_wait(out.nbytes()));
+    out.set_data(mx::allocator::malloc(out.nbytes()));
  }

  // Resolve name of kernel (corresponds to axpby.metal)
-  std::ostringstream kname;
-  kname << "axpby_";
-  kname << (contiguous_kernel ? "contiguous_" : "general_");
-  kname << type_to_name(out);
+  std::string kname = "axpby_";
+  kname += (contiguous_kernel ? "contiguous_" : "general_");
+  kname += type_to_name(out);

-  // Make sure the metal library is available
-  d.register_library("mlx_ext");
+  // Load the metal library
+  auto lib = d.get_library("mlx_ext", current_binary_dir());

  // Make a kernel from this metal library
-  auto kernel = d.get_kernel(kname.str(), "mlx_ext");
+  auto kernel = d.get_kernel(kname, lib);

  // Prepare to encode kernel
  auto& compute_encoder = d.get_command_encoder(s.index);
--- a/examples/extensions/axpby/axpby.h
+++ b/examples/extensions/axpby/axpby.h
@@ -1,4 +1,4 @@
-// Copyright © 2023 Apple Inc.
+// Copyright © 2023-2025 Apple Inc.

 #pragma once

@@ -74,9 +74,9 @@ class Axpby : public mx::Primitive {
      const std::vector<mx::array>& inputs,
      const std::vector<int>& axes) override;

-  /** Print the primitive. */
-  void print(std::ostream& os) override {
-    os << "Axpby";
+  /** The name of the primitive. */
+  const char* name() const override {
+    return "Axpby";
  }

  /** Equivalence check **/
@@ -85,11 +85,6 @@ class Axpby : public mx::Primitive {
 private:
  float alpha_;
  float beta_;
-
-  /** Fall back implementation for evaluation on CPU */
-  void eval(
-      const std::vector<mx::array>& inputs,
-      std::vector<mx::array>& outputs);
 };

 } // namespace my_ext
--- a/examples/extensions/axpby/axpby.metal
+++ b/examples/extensions/axpby/axpby.metal
@@ -1,4 +1,4 @@
-// Copyright © 2023 Apple Inc.
+// Copyright © 2023-2025 Apple Inc.

 #include <metal_stdlib>

--- a/examples/extensions/requirements.txt
+++ b/examples/extensions/requirements.txt
@@ -1,4 +1,4 @@
 setuptools>=42
 cmake>=3.25
 mlx>=0.21.0
-nanobind==2.2.0
+nanobind==2.4.0
--- a/examples/extensions/test.py
+++ b/examples/extensions/test.py
@@ -3,8 +3,10 @@ from mlx_sample_extensions import axpby

 a = mx.ones((3, 4))
 b = mx.ones((3, 4))
-c = axpby(a, b, 4.0, 2.0, stream=mx.cpu)
+c_cpu = axpby(a, b, 4.0, 2.0, stream=mx.cpu)
+c_gpu = axpby(a, b, 4.0, 2.0, stream=mx.gpu)

-print(f"c shape: {c.shape}")
-print(f"c dtype: {c.dtype}")
-print(f"c correct: {mx.all(c == 6.0).item()}")
+print(f"c shape: {c_cpu.shape}")
+print(f"c dtype: {c_cpu.dtype}")
+print(f"c_cpu correct: {mx.all(c_cpu == 6.0).item()}")
+print(f"c_gpu correct: {mx.all(c_gpu == 6.0).item()}")
--- a/mlx/CMakeLists.txt
+++ b/mlx/CMakeLists.txt
@@ -5,6 +5,7 @@ target_sources(
          ${CMAKE_CURRENT_SOURCE_DIR}/compile.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/device.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/dtype.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/dtype_utils.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/export.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/einsum.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/fast.cpp
@@ -17,9 +18,13 @@ target_sources(
          ${CMAKE_CURRENT_SOURCE_DIR}/transforms.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/utils.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/linalg.cpp
-          ${CMAKE_CURRENT_SOURCE_DIR}/version.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/backend/metal/metal.h)

+# Define MLX_VERSION only in the version.cpp file.
+add_library(mlx_version OBJECT ${CMAKE_CURRENT_SOURCE_DIR}/version.cpp)
+target_compile_definitions(mlx_version PRIVATE MLX_VERSION="${MLX_VERSION}")
+target_link_libraries(mlx PRIVATE $<BUILD_INTERFACE:mlx_version>)
+
 if(MSVC)
  # Disable some MSVC warnings to speed up compilation.
  target_compile_options(mlx PUBLIC /wd4068 /wd4244 /wd4267 /wd4804)
@@ -44,5 +49,19 @@ add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/io)
 if(MLX_BUILD_METAL)
  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backend/metal)
 else()
-  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backend/no_metal)
+  target_sources(mlx
+                 PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/backend/metal/no_metal.cpp)
+endif()
+
+if(MLX_BUILD_CUDA)
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backend/cuda)
+else()
+  target_sources(mlx
+                 PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/backend/cuda/no_cuda.cpp)
+endif()
+
+if(MLX_BUILD_METAL OR MLX_BUILD_CUDA)
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backend/gpu)
+else()
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backend/no_gpu)
 endif()
--- a/mlx/allocator.cpp
+++ b/mlx/allocator.cpp
@@ -4,12 +4,11 @@
 #include <sstream>

 #include "mlx/allocator.h"
-#include "mlx/scheduler.h"

 namespace mlx::core::allocator {

 Buffer malloc(size_t size) {
-  auto buffer = allocator().malloc(size, /* allow_swap */ true);
+  auto buffer = allocator().malloc(size);
  if (size && !buffer.ptr()) {
    std::ostringstream msg;
    msg << "[malloc] Unable to allocate " << size << " bytes.";
@@ -22,45 +21,4 @@ void free(Buffer buffer) {
  allocator().free(buffer);
 }

-Buffer CommonAllocator::malloc(size_t size, bool) {
-  void* ptr = std::malloc(size + sizeof(size_t));
-  if (ptr != nullptr) {
-    *static_cast<size_t*>(ptr) = size;
-  }
-  return Buffer{ptr};
-}
-
-void CommonAllocator::free(Buffer buffer) {
-  std::free(buffer.ptr());
-}
-
-size_t CommonAllocator::size(Buffer buffer) const {
-  if (buffer.ptr() == nullptr) {
-    return 0;
-  }
-  return *static_cast<size_t*>(buffer.ptr());
-}
-
-Buffer malloc_or_wait(size_t size) {
-  auto buffer = allocator().malloc(size);
-
-  while (size && !buffer.ptr() && scheduler::n_active_tasks() > 0) {
-    scheduler::wait_for_one();
-    buffer = allocator().malloc(size);
-  }
-
-  // Try swapping if needed
-  if (size && !buffer.ptr()) {
-    buffer = allocator().malloc(size, /* allow_swap = */ true);
-  }
-
-  if (size && !buffer.ptr()) {
-    std::ostringstream msg;
-    msg << "[malloc_or_wait] Unable to allocate " << size << " bytes.";
-    throw std::runtime_error(msg.str());
-  }
-
-  return buffer;
-}
-
 } // namespace mlx::core::allocator
--- a/mlx/allocator.h
+++ b/mlx/allocator.h
@@ -32,14 +32,10 @@ Buffer malloc(size_t size);

 void free(Buffer buffer);

-// Wait for running tasks to finish and free up memory
-// if allocation fails
-Buffer malloc_or_wait(size_t size);
-
 class Allocator {
  /** Abstract base class for a memory allocator. */
 public:
-  virtual Buffer malloc(size_t size, bool allow_swap = false) = 0;
+  virtual Buffer malloc(size_t size) = 0;
  virtual void free(Buffer buffer) = 0;
  virtual size_t size(Buffer buffer) const = 0;

@@ -53,16 +49,4 @@ class Allocator {

 Allocator& allocator();

-class CommonAllocator : public Allocator {
-  /** A general CPU allocator. */
- public:
-  virtual Buffer malloc(size_t size, bool allow_swap = false) override;
-  virtual void free(Buffer buffer) override;
-  virtual size_t size(Buffer buffer) const override;
-
- private:
-  CommonAllocator() = default;
-  friend Allocator& allocator();
-};
-
 } // namespace mlx::core::allocator
--- a/mlx/array.cpp
+++ b/mlx/array.cpp
@@ -56,6 +56,18 @@ std::vector<array> array::make_arrays(
  return outputs;
 }

+array array::unsafe_weak_copy(const array& other) {
+  auto cpy = array(other.shape(), other.dtype(), nullptr, {}); // no primitive, no inputs
+  cpy.set_data(
+      other.buffer(),
+      other.data_size(),
+      other.strides(),
+      other.flags(),
+      [](auto) {}); // no-op deleter: destroying the copy never frees the buffer
+  cpy.array_desc_->data_ptr = other.array_desc_->data_ptr; // NOTE(review): presumably keeps any offset `other` has into its buffer - confirm
+  return cpy;
+}
+
 array::array(std::initializer_list<float> data)
    : array_desc_(std::make_shared<ArrayDesc>(
          Shape{static_cast<ShapeElem>(data.size())},
@@ -76,35 +88,27 @@ array::array(allocator::Buffer data, Shape shape, Dtype dtype, Deleter deleter)
  set_data(data, deleter);
 }

-array::array(
-    allocator::Buffer data,
-    Shape shape,
-    Dtype dtype,
-    Strides strides,
-    size_t data_size,
-    Flags flags,
-    Deleter deleter)
-    : array_desc_(std::make_shared<ArrayDesc>(std::move(shape), dtype)) {
-  set_data(data, data_size, std::move(strides), flags, deleter);
-}
-
 void array::detach() {
+  array_desc_->primitive = nullptr;
+  for (auto& s : array_desc_->siblings) {
+    s.array_desc_->primitive = nullptr;
+  }
  for (auto& s : array_desc_->siblings) {
    s.array_desc_->inputs.clear();
    s.array_desc_->siblings.clear();
    s.array_desc_->position = 0;
-    s.array_desc_->primitive = nullptr;
  }
  array_desc_->inputs.clear();
  array_desc_->siblings.clear();
  array_desc_->position = 0;
-  array_desc_->primitive = nullptr;
 }

 bool array::is_available() const {
  if (status() == Status::available) {
    return true;
-  } else if (status() == Status::evaluated && event().is_signaled()) {
+  } else if (
+      status() == Status::evaluated &&
+      (!event().valid() || event().is_signaled())) {
    set_status(Status::available);
    return true;
  }
@@ -113,7 +117,10 @@ bool array::is_available() const {

 void array::wait() {
  if (!is_available()) {
-    event().wait();
+    if (event().valid()) {
+      event().wait();
+      detach_event();
+    }
    set_status(Status::available);
  }
 }
@@ -174,34 +181,13 @@ void array::copy_shared_buffer(const array& other) {
  copy_shared_buffer(other, other.strides(), other.flags(), other.data_size());
 }

-void array::move_shared_buffer(
-    array other,
-    const Strides& strides,
-    Flags flags,
-    size_t data_size,
-    size_t offset /* = 0 */) {
-  array_desc_->data = std::move(other.array_desc_->data);
-  array_desc_->strides = strides;
-  array_desc_->flags = flags;
-  array_desc_->data_size = data_size;
-  auto char_offset = sizeof(char) * itemsize() * offset;
-  auto data_ptr = other.array_desc_->data_ptr;
-  other.array_desc_->data_ptr = nullptr;
-  array_desc_->data_ptr =
-      static_cast<void*>(static_cast<char*>(data_ptr) + char_offset);
-}
-
-void array::move_shared_buffer(array other) {
-  move_shared_buffer(other, other.strides(), other.flags(), other.data_size());
-}
-
 array::~array() {
  if (array_desc_ == nullptr) {
    return;
  }

-  // Ignore arrays that might be detached during eval
-  if (status() == array::Status::scheduled) {
+  // Detached/detaching
+  if (array_desc_->primitive == nullptr) {
    return;
  }

--- a/mlx/array.h
+++ b/mlx/array.h
@@ -10,6 +10,7 @@
 #include "mlx/allocator.h"
 #include "mlx/dtype.h"
 #include "mlx/event.h"
+#include "mlx/small_vector.h"

 namespace mlx::core {

@@ -18,8 +19,8 @@ class Primitive;

 using Deleter = std::function<void(allocator::Buffer)>;
 using ShapeElem = int32_t;
-using Shape = std::vector<ShapeElem>;
-using Strides = std::vector<int64_t>;
+using Shape = SmallVector<ShapeElem>;
+using Strides = SmallVector<int64_t>;

 class array {
  /* An array is really a node in a graph. It contains a shared ArrayDesc
@@ -199,6 +200,13 @@ class array {
      const std::shared_ptr<Primitive>& primitive,
      const std::vector<array>& inputs);

+  /**
+   * Get a new array that refers to the same data as the input but with a
+   * non-owning pointer to it. Note the array is detached from the graph and has
+   * no inputs, siblings or primitive.
+   */
+  static array unsafe_weak_copy(const array& other);
+
  /** A unique identifier for an array. */
  std::uintptr_t id() const {
    return reinterpret_cast<std::uintptr_t>(array_desc_.get());
@@ -217,6 +225,10 @@ class array {
    // Not copyable
    Data(const Data& d) = delete;
    Data& operator=(const Data& d) = delete;
+    Data(Data&& o) : buffer(o.buffer), d(o.d) {
+      o.buffer = allocator::Buffer(nullptr);
+      o.d = [](allocator::Buffer) {};
+    }
    ~Data() {
      d(buffer);
    }
@@ -243,18 +255,6 @@ class array {
    bool col_contiguous : 1;
  };

-  /** Build an array from all the info held by the array description. Including
-   * the buffer, strides, flags.
-   */
-  explicit array(
-      allocator::Buffer data,
-      Shape shape,
-      Dtype dtype,
-      Strides strides,
-      size_t data_size,
-      Flags flags,
-      Deleter deleter = allocator::free);
-
  /** The array's primitive. */
  Primitive& primitive() const {
    return *(array_desc_->primitive);
@@ -344,11 +344,11 @@ class array {
    return allocator::allocator().size(buffer());
  }

-  // Return a copy of the shared pointer
-  // to the array::Data struct
-  std::shared_ptr<Data> data_shared_ptr() const {
+  // Return the shared pointer to the array::Data struct
+  const std::shared_ptr<Data>& data_shared_ptr() const {
    return array_desc_->data;
  }
+
  // Return a raw pointer to the arrays data
  template <typename T>
  T* data() {
@@ -361,15 +361,10 @@ class array {
  }

  enum Status {
-    // The ouptut of a computation which has not been scheduled.
+    // The output of a computation which has not been scheduled.
    // For example, the status of `x` in `auto x = a + b`.
    unscheduled,

-    // The ouptut of a computation which has been scheduled but `eval_*` has
-    // not yet been called on the array's primitive. A possible
-    // status of `x` in `auto x = a + b; eval(x);`
-    scheduled,
-
    // The array's `eval_*` function has been run, but the computation is not
    // necessarily complete. The array will have memory allocated and if it is
    // not a tracer then it will be detached from the graph.
@@ -406,6 +401,10 @@ class array {
    array_desc_->event = std::move(e);
  }

+  void detach_event() const {
+    array_desc_->event = Event{};
+  }
+
  // Mark the array as a tracer array (true) or not.
  void set_tracer(bool is_tracer) {
    array_desc_->is_tracer = is_tracer;
@@ -431,15 +430,6 @@ class array {

  void copy_shared_buffer(const array& other);

-  void move_shared_buffer(
-      array other,
-      const Strides& strides,
-      Flags flags,
-      size_t data_size,
-      size_t offset = 0);
-
-  void move_shared_buffer(array other);
-
  void overwrite_descriptor(const array& other) {
    array_desc_ = other.array_desc_;
  }
--- a/mlx/backend/common/CMakeLists.txt
+++ b/mlx/backend/common/CMakeLists.txt
@@ -1,6 +1,7 @@
 target_sources(
  mlx
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/compiled.cpp
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/broadcasting.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/compiled.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/common.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/load.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/reduce.cpp
--- a/mlx/backend/common/binary.h
+++ b/mlx/backend/common/binary.h
@@ -38,25 +38,20 @@ inline void set_binary_op_output_data(
    const array& a,
    const array& b,
    array& out,
-    BinaryOpType bopt,
-    bool donate_with_move = false) {
+    BinaryOpType bopt) {
  bool b_donatable = is_donatable(b, out);
  bool a_donatable = is_donatable(a, out);
  switch (bopt) {
    case BinaryOpType::ScalarScalar:
      out.set_data(
-          allocator::malloc_or_wait(out.itemsize()), 1, a.strides(), a.flags());
+          allocator::malloc(out.itemsize()), 1, a.strides(), a.flags());
      break;
    case BinaryOpType::ScalarVector:
      if (b_donatable) {
-        if (donate_with_move) {
-          out.move_shared_buffer(b);
-        } else {
-          out.copy_shared_buffer(b);
-        }
+        out.copy_shared_buffer(b);
      } else {
        out.set_data(
-            allocator::malloc_or_wait(b.data_size() * out.itemsize()),
+            allocator::malloc(b.data_size() * out.itemsize()),
            b.data_size(),
            b.strides(),
            b.flags());
@@ -64,14 +59,10 @@ inline void set_binary_op_output_data(
      break;
    case BinaryOpType::VectorScalar:
      if (a_donatable) {
-        if (donate_with_move) {
-          out.move_shared_buffer(a);
-        } else {
-          out.copy_shared_buffer(a);
-        }
+        out.copy_shared_buffer(a);
      } else {
        out.set_data(
-            allocator::malloc_or_wait(a.data_size() * out.itemsize()),
+            allocator::malloc(a.data_size() * out.itemsize()),
            a.data_size(),
            a.strides(),
            a.flags());
@@ -79,20 +70,12 @@ inline void set_binary_op_output_data(
      break;
    case BinaryOpType::VectorVector:
      if (a_donatable) {
-        if (donate_with_move) {
-          out.move_shared_buffer(a);
-        } else {
-          out.copy_shared_buffer(a);
-        }
+        out.copy_shared_buffer(a);
      } else if (b_donatable) {
-        if (donate_with_move) {
-          out.move_shared_buffer(b);
-        } else {
-          out.copy_shared_buffer(b);
-        }
+        out.copy_shared_buffer(b);
      } else {
        out.set_data(
-            allocator::malloc_or_wait(a.data_size() * out.itemsize()),
+            allocator::malloc(a.data_size() * out.itemsize()),
            a.data_size(),
            a.strides(),
            a.flags());
@@ -100,20 +83,12 @@ inline void set_binary_op_output_data(
      break;
    case BinaryOpType::General:
      if (a_donatable && a.flags().row_contiguous && a.size() == out.size()) {
-        if (donate_with_move) {
-          out.move_shared_buffer(a);
-        } else {
-          out.copy_shared_buffer(a);
-        }
+        out.copy_shared_buffer(a);
      } else if (
          b_donatable && b.flags().row_contiguous && b.size() == out.size()) {
-        if (donate_with_move) {
-          out.move_shared_buffer(b);
-        } else {
-          out.copy_shared_buffer(b);
-        }
+        out.copy_shared_buffer(b);
      } else {
-        out.set_data(allocator::malloc_or_wait(out.nbytes()));
+        out.set_data(allocator::malloc(out.nbytes()));
      }
      break;
  }
--- a/mlx/backend/common/broadcasting.cpp
+++ b/mlx/backend/common/broadcasting.cpp
@@ -0,0 +1,24 @@
+// Copyright © 2024 Apple Inc.
+
+#include "mlx/backend/common/utils.h"
+
+namespace mlx::core {
+
+void broadcast(const array& in, array& out) {
+  if (out.size() == 0) {
+    return out.set_data(nullptr); // empty output: no buffer to share
+  }
+  const int lead = out.ndim() - in.ndim(); // axes only `out` has keep stride 0
+  Strides out_strides(out.ndim(), 0);
+  for (int ax = 0; ax < static_cast<int>(in.ndim()); ++ax) {
+    const bool repeated = in.shape()[ax] == 1; // extent-1 axes get stride 0
+    out_strides[ax + lead] = repeated ? 0 : in.strides()[ax];
+  }
+  auto out_flags = in.flags();
+  if (in.size() < out.size()) {
+    out_flags.row_contiguous = out_flags.col_contiguous = false;
+  }
+  out.copy_shared_buffer(in, out_strides, out_flags, in.data_size());
+}
+
+} // namespace mlx::core
--- a/mlx/backend/common/broadcasting.h
+++ b/mlx/backend/common/broadcasting.h
@@ -1,10 +1,11 @@
 // Copyright © 2024 Apple Inc.
+
 #pragma once

+#include "mlx/array.h"
+
 namespace mlx::core {

-void encode_wait(Event e);
-
-void encode_signal(Event e);
+void broadcast(const array& in, array& out);

 } // namespace mlx::core
--- a/mlx/backend/common/buffer_cache.h
+++ b/mlx/backend/common/buffer_cache.h
@@ -0,0 +1,157 @@
+// Copyright © 2025 Apple Inc.
+
+#pragma once
+
+#include <cassert>
+#include <functional>
+#include <map>
+
+namespace mlx::core {
+
+template <typename T>
+class BufferCache { // Size-indexed cache of T* buffers; frees oldest-recycled first.
+ public:
+  BufferCache(
+      size_t page_size,
+      std::function<size_t(T*)> get_size,
+      std::function<void(T*)> free)
+      : page_size_(page_size),
+        get_size_(std::move(get_size)),
+        free_(std::move(free)) {}
+
+  ~BufferCache() {
+    clear();
+  }
+
+  BufferCache(const BufferCache&) = delete;
+  BufferCache& operator=(const BufferCache&) = delete;
+  // Take the smallest cached buffer >= size, or nullptr if it is too oversized.
+  T* reuse_from_cache(size_t size) {
+    // Find the closest buffer in pool.
+    auto it = buffer_pool_.lower_bound(size);
+    if (it == buffer_pool_.end() ||
+        it->first >= std::min(2 * size, size + 2 * page_size_)) {
+      return nullptr;
+    }
+
+    // Collect from the cache.
+    T* buf = it->second->buf;
+    pool_size_ -= it->first;
+
+    // Remove from record.
+    remove_from_list(it->second);
+    buffer_pool_.erase(it);
+    return buf;
+  }
+  // Insert buf at the head of the list and index it by get_size_(buf).
+  void recycle_to_cache(T* buf) {
+    assert(buf);
+    // Add to cache.
+    BufferHolder* bh = new BufferHolder(buf);
+    add_at_head(bh);
+    size_t size = get_size_(buf);
+    pool_size_ += size;
+    buffer_pool_.emplace(size, bh);
+  }
+  // Free from the tail until min_bytes_to_free is reached; returns the count.
+  int release_cached_buffers(size_t min_bytes_to_free) {
+    if (min_bytes_to_free >= 0.9 * pool_size_) {
+      return clear();
+    } else {
+      int n_release = 0;
+      size_t total_bytes_freed = 0;
+
+      while (tail_ && (total_bytes_freed < min_bytes_to_free)) {
+        // Release buffer.
+        size_t size = get_size_(tail_->buf);
+        total_bytes_freed += size;
+        free_(tail_->buf);
+        n_release++;
+
+        // Remove from record. A miss yields its.second, not necessarily end().
+        auto its = buffer_pool_.equal_range(size);
+        auto it = std::find_if(its.first, its.second, [this](const auto& el) {
+          return el.second == tail_;
+        });
+        assert(it != its.second);
+        buffer_pool_.erase(it);
+        remove_from_list(tail_);
+      }
+
+      pool_size_ -= total_bytes_freed;
+      return n_release;
+    }
+  }
+  // Free every cached buffer and reset the cache; returns how many were freed.
+  int clear() {
+    int n_release = 0;
+    for (auto& [size, holder] : buffer_pool_) {
+      free_(holder->buf);
+      n_release++;
+      delete holder;
+    }
+    buffer_pool_.clear();
+    pool_size_ = 0;
+    head_ = nullptr;
+    tail_ = nullptr;
+    return n_release;
+  }
+
+  size_t cache_size() const {
+    return pool_size_;
+  }
+
+  size_t page_size() const {
+    return page_size_;
+  }
+
+ private:
+  struct BufferHolder {
+   public:
+    explicit BufferHolder(T* buf_) : buf(buf_) {}
+
+    BufferHolder* prev{nullptr};
+    BufferHolder* next{nullptr};
+    T* buf;
+  };
+
+  void add_at_head(BufferHolder* to_add) {
+    if (!head_) {
+      head_ = to_add;
+      tail_ = to_add;
+    } else {
+      head_->prev = to_add;
+      to_add->next = head_;
+      head_ = to_add;
+    }
+  }
+  // Unlink to_remove from the list and delete the holder (not its buffer).
+  void remove_from_list(BufferHolder* to_remove) {
+    if (to_remove->prev && to_remove->next) { // if middle
+      to_remove->prev->next = to_remove->next;
+      to_remove->next->prev = to_remove->prev;
+    } else if (to_remove->prev && to_remove == tail_) { // if tail
+      tail_ = to_remove->prev;
+      tail_->next = nullptr;
+    } else if (to_remove == head_ && to_remove->next) { // if head
+      head_ = to_remove->next;
+      head_->prev = nullptr;
+    } else if (to_remove == head_ && to_remove == tail_) { // if only element
+      head_ = nullptr;
+      tail_ = nullptr;
+    }
+
+    delete to_remove;
+  }
+
+  std::multimap<size_t, BufferHolder*> buffer_pool_;
+  BufferHolder* head_{nullptr};
+  BufferHolder* tail_{nullptr};
+  size_t pool_size_{0};
+
+  const size_t page_size_;
+  std::function<size_t(T*)> get_size_;
+  std::function<void(T*)> free_;
+};
+
+} // namespace mlx::core
--- a/mlx/backend/common/common.cpp
+++ b/mlx/backend/common/common.cpp
@@ -1,6 +1,7 @@
 // Copyright © 2024 Apple Inc.
 #include <cassert>

+#include "mlx/backend/common/broadcasting.h"
 #include "mlx/backend/common/utils.h"
 #include "mlx/primitives.h"

@@ -39,24 +40,7 @@ void AsStrided::eval(const std::vector<array>& inputs, array& out) {
  // rely on data_size anyway.
  size_t data_size = out.size();

-  return move_or_copy(in, out, strides_, flags, data_size, offset_);
-}
-
-void broadcast(const array& in, array& out) {
-  if (out.size() == 0) {
-    out.set_data(nullptr);
-    return;
-  }
-  Strides strides(out.ndim(), 0);
-  int diff = out.ndim() - in.ndim();
-  for (int i = in.ndim() - 1; i >= 0; --i) {
-    strides[i + diff] = (in.shape()[i] == 1) ? 0 : in.strides()[i];
-  }
-  auto flags = in.flags();
-  if (out.size() > in.size()) {
-    flags.row_contiguous = flags.col_contiguous = false;
-  }
-  move_or_copy(in, out, strides, flags, in.data_size());
+  return out.copy_shared_buffer(in, strides_, flags, data_size, offset_);
 }

 void Broadcast::eval(const std::vector<array>& inputs, array& out) {
@@ -69,7 +53,7 @@ void BroadcastAxes::eval(const std::vector<array>& inputs, array& out) {

 void Copy::eval(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 1);
-  move_or_copy(inputs[0], out);
+  out.copy_shared_buffer(inputs[0]);
 }

 void CustomTransforms::eval(
@@ -78,7 +62,7 @@ void CustomTransforms::eval(
  assert(inputs.size() > outputs.size());
  for (int i = 0, j = inputs.size() - outputs.size(); i < outputs.size();
       i++, j++) {
-    move_or_copy(inputs[j], outputs[i]);
+    outputs[i].copy_shared_buffer(inputs[j]);
  }
 }

@@ -87,7 +71,7 @@ void Depends::eval(
    std::vector<array>& outputs) {
  assert(inputs.size() > outputs.size());
  for (int i = 0; i < outputs.size(); i++) {
-    move_or_copy(inputs[i], outputs[i]);
+    outputs[i].copy_shared_buffer(inputs[i]);
  }
 }

@@ -98,12 +82,12 @@ void ExpandDims::eval(const std::vector<array>& inputs, array& out) {
  for (auto ax : axes_) {
    strides.insert(strides.begin() + ax, 1);
  }
-  move_or_copy(in, out, strides, in.flags(), in.data_size());
+  out.copy_shared_buffer(in, strides, in.flags(), in.data_size());
 }

 void NumberOfElements::eval(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 1);
-  out.set_data(allocator::malloc_or_wait(out.nbytes()));
+  out.set_data(allocator::malloc(out.nbytes()));

  double numel = 1;
  for (auto ax : axes_) {
@@ -210,7 +194,7 @@ void shared_buffer_reshape(
    auto max_dim = std::max_element(out.shape().begin(), out.shape().end());
    flags.col_contiguous = out.size() <= 1 || out.size() == *max_dim;
  }
-  move_or_copy(in, out, out_strides, flags, in.data_size());
+  out.copy_shared_buffer(in, out_strides, flags, in.data_size());
 }

 void Split::eval(
@@ -276,12 +260,12 @@ void Squeeze::eval(const std::vector<array>& inputs, array& out) {
      strides.push_back(in.strides(i));
    }
  }
-  move_or_copy(in, out, strides, in.flags(), in.data_size());
+  out.copy_shared_buffer(in, strides, in.flags(), in.data_size());
 }

 void StopGradient::eval(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 1);
-  move_or_copy(inputs[0], out);
+  out.copy_shared_buffer(inputs[0]);
 }

 void Transpose::eval(const std::vector<array>& inputs, array& out) {
@@ -315,7 +299,7 @@ void Transpose::eval(const std::vector<array>& inputs, array& out) {
      b_stride *= out.shape(ri);
    }
  }
-  move_or_copy(in, out, out_strides, flags, in.data_size());
+  out.copy_shared_buffer(in, out_strides, flags, in.data_size());
 }

 } // namespace mlx::core
--- a/mlx/backend/common/compiled.cpp
+++ b/mlx/backend/common/compiled.cpp
@@ -1,8 +1,7 @@
 // Copyright © 2023-2024 Apple Inc.

 #include "mlx/backend/common/compiled.h"
-#include "mlx/graph_utils.h"
-#include "mlx/primitives.h"
+#include "mlx/backend/common/utils.h"
 #include "mlx/utils.h"

 namespace mlx::core {
@@ -15,6 +14,8 @@ void print_constant(std::ostream& os, const array& x) {
      return print_float_constant<float16_t>(os, x);
    case bfloat16:
      return print_float_constant<bfloat16_t>(os, x);
+    case float64:
+      return print_float_constant<double>(os, x);
    case complex64:
      return print_complex_constant<complex64_t>(os, x);
    case int8:
@@ -51,6 +52,8 @@ std::string get_type_string(Dtype d) {
      return "float16_t";
    case bfloat16:
      return "bfloat16_t";
+    case float64:
+      return "double";
    case complex64:
      return "complex64_t";
    case bool_:
@@ -79,55 +82,6 @@ std::string get_type_string(Dtype d) {
  }
 }

-std::string build_lib_name(
-    const std::vector<array>& inputs,
-    const std::vector<array>& outputs,
-    const std::vector<array>& tape,
-    const std::unordered_set<uintptr_t>& constant_ids) {
-  NodeNamer namer;
-  std::ostringstream os;
-  std::ostringstream constant_hasher;
-
-  // Fill the input names. This is not really necessary, I just like having A,
-  // B, C, ... as the inputs.
-  for (auto& x : inputs) {
-    namer.get_name(x);
-  }
-
-  // The primitives describing the tape. For unary and binary primitives this
-  // must be enough to describe the full computation.
-  for (auto& a : tape) {
-    // name and type of output
-    os << namer.get_name(a) << kindof(a.dtype()) << a.itemsize();
-    // computation performed
-    a.primitive().print(os);
-    // name of inputs to the function
-    for (auto& inp : a.inputs()) {
-      os << namer.get_name(inp);
-    }
-  }
-  os << "_";
-
-  for (auto& x : inputs) {
-    if (constant_ids.find(x.id()) != constant_ids.end()) {
-      os << "C";
-      print_constant(constant_hasher, x);
-    } else {
-      os << (is_scalar(x) ? "S" : "V");
-    }
-  }
-  os << "_";
-  for (auto& x : inputs) {
-    if (constant_ids.find(x.id()) != constant_ids.end()) {
-      continue;
-    }
-    os << kindof(x.dtype()) << x.itemsize();
-  }
-  os << "_" << std::hash<std::string>{}(constant_hasher.str());
-
-  return os.str();
-}
-
 bool compiled_check_contiguity(
    const std::vector<array>& inputs,
    const Shape& shape) {
@@ -159,10 +113,8 @@ bool compiled_check_contiguity(
 void compiled_allocate_outputs(
    const std::vector<array>& inputs,
    std::vector<array>& outputs,
-    const std::vector<array>& inputs_,
-    const std::unordered_set<uintptr_t>& constant_ids_,
-    bool contiguous,
-    bool move_buffers /* = false */) {
+    const std::function<bool(size_t)>& is_constant,
+    bool contiguous) {
  if (contiguous) {
    int o = 0;
    Strides strides;
@@ -176,13 +128,8 @@ void compiled_allocate_outputs(
      // - Donatable
      // - Not a constant
      if (in.itemsize() == outputs[o].itemsize() && !is_scalar(in) &&
-          in.is_donatable() &&
-          constant_ids_.find(inputs_[i].id()) == constant_ids_.end()) {
-        if (move_buffers) {
-          outputs[o++].move_shared_buffer(in);
-        } else {
-          outputs[o++].copy_shared_buffer(in);
-        }
+          in.is_donatable() && !is_constant(i)) {
+        outputs[o++].copy_shared_buffer(in);
      }
      // Get representative input flags to properly set non-donated outputs
      if (strides.empty() && in.size() == outputs[0].size()) {
@@ -193,7 +140,7 @@ void compiled_allocate_outputs(
    }
    for (; o < outputs.size(); ++o) {
      outputs[o].set_data(
-          allocator::malloc_or_wait(data_size * outputs[o].itemsize()),
+          allocator::malloc(data_size * outputs[o].itemsize()),
          data_size,
          strides,
          flags);
@@ -209,21 +156,86 @@ void compiled_allocate_outputs(
      // - Not a constant
      if (in.flags().row_contiguous && in.size() == outputs[o].size() &&
          in.itemsize() == outputs[o].itemsize() && in.is_donatable() &&
-          constant_ids_.find(inputs_[i].id()) == constant_ids_.end()) {
-        if (move_buffers) {
-          outputs[o].move_shared_buffer(
-              in, outputs[o].strides(), in.flags(), in.data_size());
-        } else {
-          outputs[o].copy_shared_buffer(
-              in, outputs[o].strides(), in.flags(), in.data_size());
-        }
+          !is_constant(i)) {
+        outputs[o].copy_shared_buffer(
+            in, outputs[o].strides(), in.flags(), in.data_size());
        o++;
      }
    }
    for (; o < outputs.size(); ++o) {
-      outputs[o].set_data(allocator::malloc_or_wait(outputs[o].nbytes()));
+      outputs[o].set_data(allocator::malloc(outputs[o].nbytes()));
    }
  }
 }

+// Collapse the contiguous dims of `out` together with those of every input
+// that is neither a constant (per `is_constant(i)`) nor a scalar.
+//
+// Returns {contiguous, shape, strides}. When compiled_check_contiguity()
+// holds for the inputs, the result is {true, out.shape(), {}}. Otherwise it
+// is {false, collapsed shape, collapsed strides}, where strides[0] belongs to
+// `out` and the remaining entries follow the order of the kept inputs.
+std::tuple<bool, Shape, std::vector<Strides>> compiled_collapse_contiguous_dims(
+    const std::vector<array>& inputs,
+    const array& out,
+    const std::function<bool(size_t)>& is_constant) {
+  const Shape& shape = out.shape();
+  // Contiguous inputs: hand back the shape untouched, with no strides.
+  if (compiled_check_contiguity(inputs, shape)) {
+    return {true, shape, {}};
+  }
+
+  // Stride an input uses along output axis `d` when it is broadcast there.
+  auto broadcast_stride = [&](size_t d) -> int64_t {
+    return shape[d] == 1 ? out.strides()[d] : 0;
+  };
+
+  std::vector<Strides> strides_vec{out.strides()};
+  for (size_t idx = 0; idx < inputs.size(); ++idx) {
+    // Constants and scalars take no part in the collapse.
+    const auto& x = inputs[idx];
+    if (is_constant(idx) || is_scalar(x)) {
+      continue;
+    }
+
+    // Broadcast the input to the output shape.
+    const size_t lead = shape.size() - x.ndim();
+    Strides xstrides;
+    for (size_t d = 0; d < lead; ++d) {
+      xstrides.push_back(broadcast_stride(d));
+    }
+    for (size_t ax = 0; ax < x.ndim(); ++ax) {
+      if (x.shape(ax) == 1) {
+        xstrides.push_back(broadcast_stride(lead + ax));
+      } else {
+        xstrides.push_back(x.strides()[ax]);
+      }
+    }
+    strides_vec.push_back(std::move(xstrides));
+  }
+
+  auto [collapsed_shape, collapsed_strides] =
+      collapse_contiguous_dims(shape, strides_vec, INT32_MAX);
+  return {false, std::move(collapsed_shape), std::move(collapsed_strides)};
+}
+
+// Return whether large indexing is required, i.e. whether the largest
+// relevant extent exceeds UINT32_MAX:
+//  - contiguous: the largest data_size() among the inputs;
+//  - otherwise: the largest size() among the outputs.
+// An empty list yields false.
+bool compiled_use_large_index(
+    const std::vector<array>& inputs,
+    const std::vector<array>& outputs,
+    bool contiguous) {
+  const auto& arrays = contiguous ? inputs : outputs;
+
+  size_t largest = 0;
+  for (const auto& arr : arrays) {
+    largest = std::max(largest, contiguous ? arr.data_size() : arr.size());
+  }
+
+  return largest > UINT32_MAX;
+}
+
 } // namespace mlx::core
--- a/mlx/backend/common/compiled.h
+++ b/mlx/backend/common/compiled.h
@@ -1,9 +1,8 @@
 // Copyright © 2023-2024 Apple Inc.
 #pragma once

+#include <functional>
 #include <iomanip>
-#include <sstream>
-#include <unordered_set>

 #include "mlx/array.h"
 #include "mlx/primitives.h"
@@ -14,19 +13,17 @@ inline bool is_static_cast(const Primitive& p) {
  return (typeid(p) == typeid(Broadcast) || typeid(p) == typeid(AsType));
 }

-std::string build_lib_name(
-    const std::vector<array>& inputs,
-    const std::vector<array>& outputs,
-    const std::vector<array>& tape,
-    const std::unordered_set<uintptr_t>& constant_ids);
-
 std::string get_type_string(Dtype d);

 template <typename T>
 void print_float_constant(std::ostream& os, const array& x) {
  auto old_precision = os.precision();
-  os << std::setprecision(std::numeric_limits<float>::digits10 + 1)
-     << x.item<T>() << std::setprecision(old_precision);
+  if constexpr (std::is_same_v<T, double>) {
+    os << std::setprecision(std::numeric_limits<double>::digits10 + 1);
+  } else {
+    os << std::setprecision(std::numeric_limits<float>::digits10 + 1);
+  }
+  os << x.item<T>() << std::setprecision(old_precision);
 }

 template <typename T>
@@ -60,9 +57,19 @@ bool compiled_check_contiguity(
 void compiled_allocate_outputs(
    const std::vector<array>& inputs,
    std::vector<array>& outputs,
-    const std::vector<array>& inputs_,
-    const std::unordered_set<uintptr_t>& constant_ids_,
-    bool contiguous,
-    bool move_buffers = false);
+    const std::function<bool(size_t)>& is_constant,
+    bool contiguous);
+
+// Collapse contiguous dims ignoring scalars and constants.
+std::tuple<bool, Shape, std::vector<Strides>> compiled_collapse_contiguous_dims(
+    const std::vector<array>& inputs,
+    const array& out,
+    const std::function<bool(size_t)>& is_constant);
+
+// Return whether the kernel should use large index.
+bool compiled_use_large_index(
+    const std::vector<array>& inputs,
+    const std::vector<array>& outputs,
+    bool contiguous);

 } // namespace mlx::core
--- a/mlx/backend/common/copy.h
+++ b/mlx/backend/common/copy.h
@@ -2,7 +2,7 @@

 #pragma once

-#include "mlx/array.h"
+#include "mlx/backend/common/utils.h"

 namespace mlx::core {

@@ -22,4 +22,25 @@ enum class CopyType {
  GeneralGeneral
 };

+// Give `out` its buffer for a copy of `in`. Returns true only when `out`
+// ends up sharing the buffer of `in`; otherwise a new buffer is allocated.
+inline bool set_copy_output_data(const array& in, array& out, CopyType ctype) {
+  if (ctype != CopyType::Vector) {
+    out.set_data(allocator::malloc(out.nbytes()));
+    return false;
+  }
+  // Vector copy: reuse the input buffer when is_donatable(in, out) allows.
+  if (is_donatable(in, out)) {
+    out.copy_shared_buffer(in);
+    return true;
+  }
+  // Otherwise allocate data_size() items and keep the input's layout.
+  out.set_data(
+      allocator::malloc(in.data_size() * out.itemsize()),
+      in.data_size(),
+      in.strides(),
+      in.flags());
+  return false;
+}
+
 } // namespace mlx::core
--- a/mlx/backend/common/hadamard.h
+++ b/mlx/backend/common/hadamard.h
@@ -99,7 +99,11 @@ inline std::pair<int, int> decompose_hadamard(int n) {
          "[hadamard] Only supports n = m*2^k where m in (1, 12, 20, 28).");
    }
  }
+  if (n > (1 << 26)) {
+    throw std::invalid_argument(
+        "[hadamard] Only supports n = m*2^k where k <= 26");
+  }
  return {n, m};
 }

-} // namespace mlx::core
+} // namespace mlx::core
--- a/mlx/backend/common/load.cpp
+++ b/mlx/backend/common/load.cpp
@@ -3,7 +3,8 @@
 #include <algorithm>
 #include <utility>

-#include "mlx/backend/common/load.h"
+#include "mlx/primitives.h"
+#include "mlx/scheduler.h"

 namespace {

@@ -26,26 +27,31 @@ void swap_endianness(uint8_t* data_bytes, size_t N) {

 namespace mlx::core {

-void load(
-    array& out,
-    size_t offset,
-    const std::shared_ptr<io::Reader>& reader,
-    bool swap_endianness_) {
-  reader->read(out.data<char>(), out.nbytes(), offset);
-
-  if (swap_endianness_) {
-    switch (out.itemsize()) {
-      case 2:
-        swap_endianness<2>(out.data<uint8_t>(), out.data_size());
-        break;
-      case 4:
-        swap_endianness<4>(out.data<uint8_t>(), out.data_size());
-        break;
-      case 8:
-        swap_endianness<8>(out.data<uint8_t>(), out.data_size());
-        break;
+void Load::eval_cpu(const std::vector<array>& inputs, array& out) {
+  out.set_data(allocator::malloc(out.nbytes()));
+  auto read_task = [out_ptr = out.data<char>(),
+                    size = out.size(),
+                    itemsize = out.itemsize(),
+                    offset = offset_,
+                    reader = reader_,
+                    swap_endianness_ = swap_endianness_]() mutable {
+    reader->read(out_ptr, size * itemsize, offset);
+    if (swap_endianness_) {
+      switch (itemsize) {
+        case 2:
+          swap_endianness<2>(reinterpret_cast<uint8_t*>(out_ptr), size);
+          break;
+        case 4:
+          swap_endianness<4>(reinterpret_cast<uint8_t*>(out_ptr), size);
+          break;
+        case 8:
+          swap_endianness<8>(reinterpret_cast<uint8_t*>(out_ptr), size);
+          break;
+      }
    }
-  }
+  };
+  auto fut = io::thread_pool().enqueue(std::move(read_task)).share();
+  scheduler::enqueue(stream(), [fut = std::move(fut)]() { fut.wait(); });
 }

 } // namespace mlx::core
--- a/mlx/backend/common/load.h
+++ b/mlx/backend/common/load.h
@@ -1,14 +0,0 @@
-// Copyright © 2024 Apple Inc.
-
-#include "mlx/array.h"
-#include "mlx/io/load.h"
-
-namespace mlx::core {
-
-void load(
-    array& out,
-    size_t offset,
-    const std::shared_ptr<io::Reader>& reader,
-    bool swap_endianess);
-
-} // namespace mlx::core
--- a/mlx/backend/common/matmul.h
+++ b/mlx/backend/common/matmul.h
@@ -0,0 +1,67 @@
+// Copyright © 2025 Apple Inc.
+
+#pragma once
+
+#include "mlx/backend/common/utils.h"
+#include "mlx/utils.h"
+
+#include <sstream>
+
+namespace mlx::core {
+
+// Splits off the batch (all but the last two) dimensions of `a` and `b` and
+// passes them through collapse_contiguous_dims. Returns the resulting batch
+// shape followed by the batch strides of `a` and of `b`.
+inline std::tuple<Shape, Strides, Strides> collapse_batches(
+    const array& a,
+    const array& b) {
+  // A 2-D input has no batch dimensions: report one batch with stride 0.
+  if (a.ndim() == 2) {
+    return {{1}, {0}, {0}};
+  }
+  Shape lead_shape{a.shape().begin(), a.shape().end() - 2};
+  Strides lead_a{a.strides().begin(), a.strides().end() - 2};
+  Strides lead_b{b.strides().begin(), b.strides().end() - 2};
+  auto [shape, strides] =
+      collapse_contiguous_dims(lead_shape, std::vector{lead_a, lead_b});
+  auto a_strides = strides[0];
+  auto b_strides = strides[1];
+  // Use the same placeholder when the collapsed batch shape is empty.
+  if (shape.empty()) {
+    shape.push_back(1);
+    a_strides.push_back(0);
+    b_strides.push_back(0);
+  }
+  return std::make_tuple(shape, a_strides, b_strides);
+}
+
+// Three-array form: collapses the batch (all but the last two) dimensions of
+// `a`, `b` and `c` together via collapse_contiguous_dims. Returns the batch
+// shape followed by the batch strides of `a`, `b` and `c`.
+inline std::tuple<Shape, Strides, Strides, Strides>
+collapse_batches(const array& a, const array& b, const array& c) {
+  // No batch dimensions on a 2-D input: report one batch with stride 0.
+  if (a.ndim() == 2) {
+    return {{1}, {0}, {0}, {0}};
+  }
+  Shape lead_shape{a.shape().begin(), a.shape().end() - 2};
+  Strides lead_a{a.strides().begin(), a.strides().end() - 2};
+  Strides lead_b{b.strides().begin(), b.strides().end() - 2};
+  Strides lead_c{c.strides().begin(), c.strides().end() - 2};
+
+  auto [shape, strides] = collapse_contiguous_dims(
+      lead_shape, std::vector{lead_a, lead_b, lead_c});
+  auto a_strides = strides[0];
+  auto b_strides = strides[1];
+  auto c_strides = strides[2];
+  // Fall back to the same placeholder if the collapsed batch shape is empty.
+  if (shape.empty()) {
+    shape.push_back(1);
+    a_strides.push_back(0);
+    b_strides.push_back(0);
+    c_strides.push_back(0);
+  }
+  return std::make_tuple(shape, a_strides, b_strides, c_strides);
+}
+
+} // namespace mlx::core
--- a/mlx/backend/common/reduce.cpp
+++ b/mlx/backend/common/reduce.cpp
@@ -5,11 +5,9 @@
 namespace mlx::core {

 std::pair<Shape, Strides> shapes_without_reduction_axes(
-    const array& x,
+    Shape shape,
+    Strides strides,
    const std::vector<int>& axes) {
-  auto shape = x.shape();
-  auto strides = x.strides();
-
  for (int i = axes.size() - 1; i >= 0; i--) {
    int a = axes[i];
    shape.erase(shape.begin() + a);
@@ -19,6 +17,15 @@ std::pair<Shape, Strides> shapes_without_reduction_axes(
  return std::make_pair(shape, strides);
 }

+// Convenience overload for arrays: hands copies of the shape and strides of
+// `x` to the (Shape, Strides, axes) overload, which takes both by value, and
+// returns its result unchanged.
+std::pair<Shape, Strides> shapes_without_reduction_axes(
+    const array& x,
+    const std::vector<int>& axes) {
+  return shapes_without_reduction_axes(x.shape(), x.strides(), axes);
+}
+
 ReductionPlan get_reduction_plan(const array& x, const std::vector<int>& axes) {
  // The data is all there and we are reducing over everything
  if (x.size() == x.data_size() && axes.size() == x.ndim() &&
--- a/mlx/backend/common/reduce.h
+++ b/mlx/backend/common/reduce.h
@@ -51,5 +51,9 @@ ReductionPlan get_reduction_plan(const array& x, const std::vector<int>& axes);
 std::pair<Shape, Strides> shapes_without_reduction_axes(
    const array& x,
    const std::vector<int>& axes);
+std::pair<Shape, Strides> shapes_without_reduction_axes(
+    Shape shape,
+    Strides strides,
+    const std::vector<int>& axes);

 } // namespace mlx::core
--- a/mlx/backend/common/slicing.cpp
+++ b/mlx/backend/common/slicing.cpp
@@ -36,7 +36,7 @@ void shared_buffer_slice(
  flags.col_contiguous = is_col_contiguous;
  flags.contiguous = (no_bsx_size == data_size);

-  move_or_copy(in, out, out_strides, flags, data_size, data_offset);
+  out.copy_shared_buffer(in, out_strides, flags, data_size, data_offset);
 }

 void slice(
--- a/mlx/backend/common/ternary.h
+++ b/mlx/backend/common/ternary.h
@@ -36,15 +36,10 @@ inline void set_ternary_op_output_data(
    const array& b,
    const array& c,
    array& out,
-    TernaryOpType topt,
-    bool donate_with_move = false) {
-  auto maybe_donate = [&out, donate_with_move](const array& x) {
+    TernaryOpType topt) {
+  auto maybe_donate = [&out](const array& x) {
    if (is_donatable(x, out)) {
-      if (donate_with_move) {
-        out.move_shared_buffer(x);
-      } else {
-        out.copy_shared_buffer(x);
-      }
+      out.copy_shared_buffer(x);
      return true;
    }
    return false;
@@ -53,12 +48,12 @@ inline void set_ternary_op_output_data(
  switch (topt) {
    case TernaryOpType::ScalarScalarScalar:
      out.set_data(
-          allocator::malloc_or_wait(out.itemsize()), 1, b.strides(), b.flags());
+          allocator::malloc(out.itemsize()), 1, b.strides(), b.flags());
      break;
    case TernaryOpType::VectorVectorVector:
      if (!(maybe_donate(a) || maybe_donate(b) || maybe_donate(c))) {
        out.set_data(
-            allocator::malloc_or_wait(out.itemsize() * b.data_size()),
+            allocator::malloc(out.itemsize() * b.data_size()),
            b.data_size(),
            b.strides(),
            b.flags());
@@ -69,7 +64,7 @@ inline void set_ternary_op_output_data(
      if (!((a.flags().row_contiguous && maybe_donate(a)) ||
            (b.flags().row_contiguous && maybe_donate(b)) ||
            (c.flags().row_contiguous && maybe_donate(c)))) {
-        out.set_data(allocator::malloc_or_wait(out.nbytes()));
+        out.set_data(allocator::malloc(out.nbytes()));
      }
      break;
  }
--- a/mlx/backend/common/unary.h
+++ b/mlx/backend/common/unary.h
@@ -0,0 +1,26 @@
+// Copyright © 2025 Apple Inc.
+
+#pragma once
+
+#include "mlx/allocator.h"
+#include "mlx/backend/common/utils.h"
+
+namespace mlx::core {
+
+// Sets up the storage of `out` for a unary op whose input is `in`.
+inline void set_unary_output_data(const array& in, array& out) {
+  if (!in.flags().contiguous) {
+    out.set_data(allocator::malloc(out.nbytes()));
+  } else if (is_donatable(in, out)) {
+    out.copy_shared_buffer(in);
+  } else {
+    // New buffer that keeps the input's data size, strides and flags.
+    out.set_data(
+        allocator::malloc(in.data_size() * out.itemsize()),
+        in.data_size(),
+        in.strides(),
+        in.flags());
+  }
+}
+
+} // namespace mlx::core
--- a/mlx/backend/common/utils.cpp
+++ b/mlx/backend/common/utils.cpp
@@ -1,29 +1,20 @@
 // Copyright © 2023-2024 Apple Inc.

+#include <dlfcn.h>
+
 #include "mlx/backend/common/utils.h"

 namespace mlx::core {

-void move_or_copy(const array& in, array& out) {
-  if (in.is_donatable()) {
-    out.move_shared_buffer(in);
-  } else {
-    out.copy_shared_buffer(in);
-  }
-}
-
-void move_or_copy(
-    const array& in,
-    array& out,
-    const Strides& strides,
-    array::Flags flags,
-    size_t data_size,
-    size_t offset /* = 0 */) {
-  if (in.is_donatable()) {
-    out.move_shared_buffer(in, strides, flags, data_size, offset);
-  } else {
-    out.copy_shared_buffer(in, strides, flags, data_size, offset);
-  }
+std::filesystem::path current_binary_dir() {
+  static const auto cached_dir = [] { // evaluated on first use, then cached
+    Dl_info image;
+    if (dladdr(reinterpret_cast<void*>(&current_binary_dir), &image) == 0) {
+      throw std::runtime_error("Unable to get current binary dir.");
+    }
+    return std::filesystem::path(image.dli_fname).parent_path();
+  }();
+  return cached_dir;
 }

 std::tuple<Shape, std::vector<Strides>> collapse_contiguous_dims(
@@ -123,4 +114,145 @@ std::pair<Shape, Strides> collapse_contiguous_dims(
  return collapse_contiguous_dims(a.shape(), a.strides(), size_cap);
 }

+// Chooses power-of-two block dimensions (bx, by, bz) for a problem of size
+// dim0 x dim1 x dim2. Exponents are handed out round-robin, one per
+// dimension per pass, and growth stops as soon as either
+//   - the exponents sum to pow2 (bx * by * bz == 2^pow2), or
+//   - a whole pass grows nothing, i.e. every block dimension is already the
+//     largest power of two not exceeding its input dimension.
+// The budget is re-checked after every single increment, so the block never
+// holds more than 2^pow2 threads for any pow2 (a pow2 <= 0 yields 1x1x1).
+// Input dimensions below 2 always map to a block dimension of 1.
+Dims get_block_dims_common(int dim0, int dim1, int dim2, int pow2 /* = 10 */) {
+  const int dims[3] = {dim0, dim1, dim2};
+  int pows[3] = {0, 0, 0};
+  int sum = 0;
+
+  // `grew` tells whether the previous pass changed any exponent.
+  bool grew = true;
+  while (grew && sum < pow2) {
+    grew = false;
+    for (int i = 0; i < 3 && sum < pow2; ++i) {
+      // 64-bit shift keeps the comparison well defined for any int input.
+      if (dims[i] >= (int64_t{1} << (pows[i] + 1))) {
+        pows[i]++;
+        sum++;
+        grew = true;
+      }
+    }
+  }
+
+  return std::make_tuple(1ul << pows[0], 1ul << pows[1], 1ul << pows[2]);
+}
+
+// Factors the non-broadcast dimensions of `shape` into a 2D grid (x, y, 1).
+// A dimension is treated as broadcast, and skipped, when its stride is 0.
+// Throws std::runtime_error if either factor ends up above UINT32_MAX.
+Dims get_2d_grid_dims_common(const Shape& shape, const Strides& strides) {
+  size_t gx = 1;
+  size_t gy = 1;
+  for (int d = 0; d < shape.size(); ++d) {
+    if (strides[d] != 0) {
+      // Grow x while the product stays below UINT32_MAX, otherwise grow y.
+      auto& factor = (gx * shape[d] < UINT32_MAX) ? gx : gy;
+      factor *= shape[d];
+    }
+  }
+
+  if (gx > UINT32_MAX || gy > UINT32_MAX) {
+    throw std::runtime_error("Unable to safely factor shape.");
+  }
+  // The larger factor is always returned first.
+  if (gx < gy) {
+    std::swap(gx, gy);
+  }
+  return std::make_tuple(
+      static_cast<uint32_t>(gx), static_cast<uint32_t>(gy), 1);
+}
+
+Dims get_2d_grid_dims_common(
+    const Shape& shape,
+    const Strides& strides,
+    size_t divisor) {
+  // Factor the non-broadcast dimensions into a 2D grid (x, y, 1) while
+  // dividing the total grid size by divisor along the way.
+  size_t grid_x = 1;
+  size_t grid_y = 1;
+  for (int i = 0; i < shape.size(); ++i) {
+    if (strides[i] == 0) {
+      continue;
+    }
+
+    // shape[i] divides the divisor: cancel it out instead of adding it.
+    if (divisor % shape[i] == 0) {
+      divisor /= shape[i];
+      continue;
+    }
+    // Otherwise fold it into x while x stays below UINT32_MAX, else into y.
+    if (grid_x * shape[i] < UINT32_MAX) {
+      grid_x *= shape[i];
+    } else {
+      grid_y *= shape[i];
+    }
+    // Cancel what is left of the divisor once it divides x or y exactly.
+    if (divisor > 1) {
+      if (grid_x % divisor == 0) {
+        grid_x /= divisor;
+        divisor = 1;
+      } else if (grid_y % divisor == 0) {
+        grid_y /= divisor;
+        divisor = 1;
+      }
+    }
+  }
+  if (grid_y > UINT32_MAX || grid_x > UINT32_MAX) {
+    throw std::runtime_error("Unable to safely factor shape.");
+  }
+  if (grid_y > grid_x) {
+    std::swap(grid_x, grid_y);
+  }
+  if (divisor > 1) { // leftover: round grid_x up to a multiple of divisor
+    grid_x = ((grid_x + divisor - 1) / divisor) * divisor;
+  }
+  return std::make_tuple(
+      static_cast<uint32_t>(grid_x), static_cast<uint32_t>(grid_y), 1);
+}
+
+// Returns {grid, block}. The block comes from get_block_dims_common (default
+// pow2); each grid entry is the dimension divided by the matching block
+// entry, rounded up, so the grid of blocks covers dim0 x dim1 x dim2.
+std::pair<Dims, Dims> get_grid_and_block_common(int dim0, int dim1, int dim2) {
+  auto [bx, by, bz] = get_block_dims_common(dim0, dim1, dim2);
+  auto blocks_for = [](int dim, uint32_t b) { return (dim + b - 1) / b; };
+  Dims grid{blocks_for(dim0, bx), blocks_for(dim1, by), blocks_for(dim2, bz)};
+  return std::make_pair(grid, std::make_tuple(bx, by, bz));
+}
+
+// Returns an array describing `x` with axis1 and axis2 exchanged. The output
+// is set up with copy_shared_buffer on `x`, using the swapped shape/strides
+// and freshly computed contiguity flags. Negative axes count from the end.
+// Axes are not range-checked here.
+array swapaxes_in_eval(const array& x, int axis1, int axis2) {
+  const int rank = x.ndim();
+  axis1 = (axis1 < 0) ? axis1 + rank : axis1;
+  axis2 = (axis2 < 0) ? axis2 + rank : axis2;
+
+  auto new_shape = x.shape();
+  auto new_strides = x.strides();
+  std::swap(new_shape[axis1], new_shape[axis2]);
+  std::swap(new_strides[axis1], new_strides[axis2]);
+
+  // Layout flags have to be re-derived for the permuted shape and strides.
+  auto [size, is_row, is_col] = check_contiguity(new_shape, new_strides);
+  const bool is_contiguous = (size == x.data_size());
+
+  array result(std::move(new_shape), x.dtype(), nullptr, {});
+  result.copy_shared_buffer(
+      x,
+      std::move(new_strides),
+      {is_contiguous, is_row, is_col},
+      x.data_size());
+  return result;
+}
+
 } // namespace mlx::core
--- a/mlx/backend/common/utils.h
+++ b/mlx/backend/common/utils.h
@@ -2,12 +2,17 @@

 #pragma once

+#include <filesystem>
+#include <tuple>
 #include <vector>

 #include "mlx/array.h"

 namespace mlx::core {

+// Return the directory that contains the current shared library.
+std::filesystem::path current_binary_dir();
+
 inline int64_t
 elem_to_loc(int elem, const Shape& shape, const Strides& strides) {
  int64_t loc = 0;
@@ -70,6 +75,31 @@ std::pair<Shape, Strides> collapse_contiguous_dims(
    const array& a,
    int64_t size_cap = std::numeric_limits<int32_t>::max());

+// Compute the thread block dimensions which fit the given
+// input dimensions.
+// - The thread block dimensions will be powers of two
+// - The thread block size will be at most 2^pow2
+using Dims = std::tuple<uint32_t, uint32_t, uint32_t>;
+Dims get_block_dims_common(int dim0, int dim1, int dim2, int pow2 = 10);
+
+// Computes a 2D grid where each element is < UINT_MAX
+// Assumes:
+// - overall size (product of non-broadcasted dimensions) is < UINT_MAX^2
+// - shape and strides correspond to a contiguous (no holes) but
+//   possibly broadcasted array
+Dims get_2d_grid_dims_common(const Shape& shape, const Strides& strides);
+
+// Same as above but we do an implicit division with divisor.
+// Basically, equivalent to factorizing
+//    Prod(shape[i] \forall i if strides[i] != 0) / divisor.
+Dims get_2d_grid_dims_common(
+    const Shape& shape,
+    const Strides& strides,
+    size_t divisor);
+
+// Get both the block and a grid of blocks that covers dim0, dim1 and dim2.
+std::pair<Dims, Dims> get_grid_and_block_common(int dim0, int dim1, int dim2);
+
 struct ContiguousIterator {
  inline void step() {
    int dims = shape_.size();
@@ -159,19 +189,20 @@ inline bool is_donatable(const array& in, const array& out) {
      in.buffer_size() <= out.nbytes() + donation_extra;
 }

-void move_or_copy(const array& in, array& out);
-void move_or_copy(
-    const array& in,
-    array& out,
-    const Strides& strides,
-    array::Flags flags,
-    size_t data_size,
-    size_t offset = 0);
-
 std::pair<bool, Strides> prepare_reshape(const array& in, const array& out);

 void shared_buffer_reshape(
    const array& in,
    const Strides& out_strides,
    array& out);
+
+// Like the swapaxes op but safe to call in eval_gpu.
+array swapaxes_in_eval(const array& x, int axis1, int axis2);
+
+template <typename T>
+inline SmallVector<T> remove_index(SmallVector<T> vec, size_t index) {
+  vec.erase(std::next(vec.begin(), index)); // edits the by-value copy only
+  return vec; // the caller's vector is left untouched
+}
+
 } // namespace mlx::core
--- a/mlx/backend/cpu/CMakeLists.txt
+++ b/mlx/backend/cpu/CMakeLists.txt
@@ -40,11 +40,15 @@ add_dependencies(mlx cpu_compiled_preamble)

 target_sources(
  mlx
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/arg_reduce.cpp
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/available.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/arg_reduce.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/binary.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/conv.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/copy.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/distributed.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/eig.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/eigh.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/encoder.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/fft.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/hadamard.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/matmul.cpp
@@ -56,6 +60,7 @@ target_sources(
          ${CMAKE_CURRENT_SOURCE_DIR}/scan.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/select.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/softmax.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/logsumexp.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/sort.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/threefry.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/indexing.cpp
@@ -65,13 +70,14 @@ target_sources(
          ${CMAKE_CURRENT_SOURCE_DIR}/inverse.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/cholesky.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/unary.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/eval.cpp
          ${CMAKE_CURRENT_BINARY_DIR}/compiled_preamble.cpp)

 if(MLX_BUILD_ACCELERATE)
  target_sources(mlx PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/gemms/bnns.cpp)
 else()
-  target_sources(mlx PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/gemms/no_fp16.cpp
-                             ${CMAKE_CURRENT_SOURCE_DIR}/gemms/no_bf16.cpp)
+  target_sources(mlx PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/gemms/simd_fp16.cpp
+                             ${CMAKE_CURRENT_SOURCE_DIR}/gemms/simd_bf16.cpp)
 endif()

 if(IOS)
--- a/mlx/backend/cpu/arange.h
+++ b/mlx/backend/cpu/arange.h
@@ -2,76 +2,27 @@

 #pragma once

-#include "mlx/allocator.h"
 #include "mlx/array.h"
+#include "mlx/backend/cpu/encoder.h"

 namespace mlx::core {

 namespace {

 template <typename T>
-void arange(T start, T next, array& out, size_t size) {
+void arange(T start, T next, array& out, size_t size, Stream stream) {
  auto ptr = out.data<T>();
  auto step_size = next - start;
-  for (int i = 0; i < size; ++i) {
-    ptr[i] = start;
-    start += step_size;
-  }
+  auto& encoder = cpu::get_command_encoder(stream);
+  encoder.set_output_array(out);
+  encoder.dispatch([ptr, start, step_size, size]() mutable {
+    for (size_t i = 0; i < size; ++i) { // index type matches `size`
+      ptr[i] = start; // element i holds start + i * step_size (accumulated)
+      start += step_size;
+    }
+  });
 }

 } // namespace

-void arange(
-    const std::vector<array>& inputs,
-    array& out,
-    double start,
-    double step) {
-  assert(inputs.size() == 0);
-  out.set_data(allocator::malloc_or_wait(out.nbytes()));
-  switch (out.dtype()) {
-    case bool_:
-      throw std::runtime_error("Bool type unsupported for arange.");
-      break;
-    case uint8:
-      arange<uint8_t>(start, start + step, out, out.size());
-      break;
-    case uint16:
-      arange<uint16_t>(start, start + step, out, out.size());
-      break;
-    case uint32:
-      arange<uint32_t>(start, start + step, out, out.size());
-      break;
-    case uint64:
-      arange<uint64_t>(start, start + step, out, out.size());
-      break;
-    case int8:
-      arange<int8_t>(start, start + step, out, out.size());
-      break;
-    case int16:
-      arange<int16_t>(start, start + step, out, out.size());
-      break;
-    case int32:
-      arange<int32_t>(start, start + step, out, out.size());
-      break;
-    case int64:
-      arange<int64_t>(start, start + step, out, out.size());
-      break;
-    case float16:
-      arange<float16_t>(start, start + step, out, out.size());
-      break;
-    case float32:
-      arange<float>(start, start + step, out, out.size());
-      break;
-    case float64:
-      arange<double>(start, start + step, out, out.size());
-      break;
-    case bfloat16:
-      arange<bfloat16_t>(start, start + step, out, out.size());
-      break;
-    case complex64:
-      arange<complex64_t>(start, start + step, out, out.size());
-      break;
-  }
-}
-
 } // namespace mlx::core
--- a/mlx/backend/cpu/arg_reduce.cpp
+++ b/mlx/backend/cpu/arg_reduce.cpp
@@ -3,6 +3,7 @@
 #include <cassert>

 #include "mlx/backend/common/utils.h"
+#include "mlx/backend/cpu/encoder.h"
 #include "mlx/primitives.h"

 namespace mlx::core {
@@ -13,19 +14,20 @@ template <typename InT, typename OpT>
 void arg_reduce(const array& in, array& out, const OpT& op, int axis) {
  auto axis_size = in.shape()[axis];
  auto axis_stride = in.strides()[axis];
-  Strides strides = in.strides();
-  Shape shape = in.shape();
-  strides.erase(strides.begin() + axis);
-  shape.erase(shape.begin() + axis);
+  Strides strides = remove_index(in.strides(), axis);
+  Shape shape = remove_index(in.shape(), axis);
+  auto in_ptr = in.data<InT>();
+  auto out_ptr = out.data<uint32_t>();
+
  for (uint32_t i = 0; i < out.size(); ++i) {
    auto loc = elem_to_loc(i, shape, strides);
-    auto in_ptr = in.data<InT>() + loc;
+    auto local_in_ptr = in_ptr + loc;
    uint32_t ind_v = 0;
-    InT v = (*in_ptr);
-    for (uint32_t j = 0; j < axis_size; ++j, in_ptr += axis_stride) {
-      op(j, (*in_ptr), &ind_v, &v);
+    InT v = (*local_in_ptr);
+    for (uint32_t j = 0; j < axis_size; ++j, local_in_ptr += axis_stride) {
+      op(j, (*local_in_ptr), &ind_v, &v);
    }
-    out.data<uint32_t>()[i] = ind_v;
+    out_ptr[i] = ind_v;
  }
 }

@@ -64,52 +66,59 @@ void arg_reduce_dispatch(
 void ArgReduce::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 1);
  auto& in = inputs[0];
-  out.set_data(allocator::malloc_or_wait(out.nbytes()));
-
-  switch (in.dtype()) {
-    case bool_:
-      arg_reduce_dispatch<bool>(in, out, reduce_type_, axis_);
-      break;
-    case uint8:
-      arg_reduce_dispatch<uint8_t>(in, out, reduce_type_, axis_);
-      break;
-    case uint16:
-      arg_reduce_dispatch<uint16_t>(in, out, reduce_type_, axis_);
-      break;
-    case uint32:
-      arg_reduce_dispatch<uint32_t>(in, out, reduce_type_, axis_);
-      break;
-    case uint64:
-      arg_reduce_dispatch<uint64_t>(in, out, reduce_type_, axis_);
-      break;
-    case int8:
-      arg_reduce_dispatch<int8_t>(in, out, reduce_type_, axis_);
-      break;
-    case int16:
-      arg_reduce_dispatch<int16_t>(in, out, reduce_type_, axis_);
-      break;
-    case int32:
-      arg_reduce_dispatch<int32_t>(in, out, reduce_type_, axis_);
-      break;
-    case int64:
-      arg_reduce_dispatch<int64_t>(in, out, reduce_type_, axis_);
-      break;
-    case float16:
-      arg_reduce_dispatch<float16_t>(in, out, reduce_type_, axis_);
-      break;
-    case float32:
-      arg_reduce_dispatch<float>(in, out, reduce_type_, axis_);
-      break;
-    case bfloat16:
-      arg_reduce_dispatch<bfloat16_t>(in, out, reduce_type_, axis_);
-      break;
-    case float64:
-      arg_reduce_dispatch<double>(in, out, reduce_type_, axis_);
-      break;
-    case complex64:
-      arg_reduce_dispatch<complex64_t>(in, out, reduce_type_, axis_);
-      break;
-  }
+  out.set_data(allocator::malloc(out.nbytes()));
+  auto& encoder = cpu::get_command_encoder(stream());
+  encoder.set_input_array(in);
+  encoder.set_output_array(out);
+  encoder.dispatch([in = array::unsafe_weak_copy(in),
+                    out = array::unsafe_weak_copy(out),
+                    reduce_type_ = reduce_type_,
+                    axis_ = axis_]() mutable {
+    switch (in.dtype()) {
+      case bool_:
+        arg_reduce_dispatch<bool>(in, out, reduce_type_, axis_);
+        break;
+      case uint8:
+        arg_reduce_dispatch<uint8_t>(in, out, reduce_type_, axis_);
+        break;
+      case uint16:
+        arg_reduce_dispatch<uint16_t>(in, out, reduce_type_, axis_);
+        break;
+      case uint32:
+        arg_reduce_dispatch<uint32_t>(in, out, reduce_type_, axis_);
+        break;
+      case uint64:
+        arg_reduce_dispatch<uint64_t>(in, out, reduce_type_, axis_);
+        break;
+      case int8:
+        arg_reduce_dispatch<int8_t>(in, out, reduce_type_, axis_);
+        break;
+      case int16:
+        arg_reduce_dispatch<int16_t>(in, out, reduce_type_, axis_);
+        break;
+      case int32:
+        arg_reduce_dispatch<int32_t>(in, out, reduce_type_, axis_);
+        break;
+      case int64:
+        arg_reduce_dispatch<int64_t>(in, out, reduce_type_, axis_);
+        break;
+      case float16:
+        arg_reduce_dispatch<float16_t>(in, out, reduce_type_, axis_);
+        break;
+      case float32:
+        arg_reduce_dispatch<float>(in, out, reduce_type_, axis_);
+        break;
+      case bfloat16:
+        arg_reduce_dispatch<bfloat16_t>(in, out, reduce_type_, axis_);
+        break;
+      case float64:
+        arg_reduce_dispatch<double>(in, out, reduce_type_, axis_);
+        break;
+      case complex64:
+        arg_reduce_dispatch<complex64_t>(in, out, reduce_type_, axis_);
+        break;
+    }
+  });
 }

 } // namespace mlx::core
--- a/mlx/backend/cpu/available.cpp
+++ b/mlx/backend/cpu/available.cpp
@@ -0,0 +1,11 @@
+// Copyright © 2025 Apple Inc.
+
+#include "mlx/backend/cpu/available.h"
+
+namespace mlx::core::cpu {
+
+bool is_available() {
+  return true; // unconditionally reports this backend as available
+}
+
+} // namespace mlx::core::cpu
--- a/mlx/backend/cpu/available.h
+++ b/mlx/backend/cpu/available.h
@@ -0,0 +1,9 @@
+// Copyright © 2025 Apple Inc.
+
+#pragma once
+
+namespace mlx::core::cpu {
+
+bool is_available();
+
+} // namespace mlx::core::cpu
--- a/mlx/backend/cpu/binary.cpp
+++ b/mlx/backend/cpu/binary.cpp
@@ -8,6 +8,7 @@
 #include "mlx/backend/cpu/binary.h"
 #include "mlx/backend/cpu/binary_ops.h"
 #include "mlx/backend/cpu/binary_two.h"
+#include "mlx/backend/cpu/encoder.h"
 #include "mlx/primitives.h"
 #include "mlx/utils.h"

@@ -16,51 +17,221 @@ namespace mlx::core {
 namespace {

 template <typename Op>
-void comparison_op(const array& a, const array& b, array& out, Op op) {
-  switch (a.dtype()) {
-    case bool_:
-      binary_op<bool, bool>(a, b, out, op);
-      break;
-    case uint8:
-      binary_op<uint8_t, bool>(a, b, out, op);
-      break;
-    case uint16:
-      binary_op<uint16_t, bool>(a, b, out, op);
-      break;
-    case uint32:
-      binary_op<uint32_t, bool>(a, b, out, op);
-      break;
-    case uint64:
-      binary_op<uint64_t, bool>(a, b, out, op);
-      break;
-    case int8:
-      binary_op<int8_t, bool>(a, b, out, op);
-      break;
-    case int16:
-      binary_op<int16_t, bool>(a, b, out, op);
-      break;
-    case int32:
-      binary_op<int32_t, bool>(a, b, out, op);
-      break;
-    case int64:
-      binary_op<int64_t, bool>(a, b, out, op);
-      break;
-    case float16:
-      binary_op<float16_t, bool>(a, b, out, op);
-      break;
-    case float32:
-      binary_op<float, bool>(a, b, out, op);
-      break;
-    case float64:
-      binary_op<double, bool>(a, b, out, op);
-      break;
-    case bfloat16:
-      binary_op<bfloat16_t, bool>(a, b, out, op);
-      break;
-    case complex64:
-      binary_op<complex64_t, bool>(a, b, out, op);
-      break;
-  }
+void binary(const array& a, const array& b, array& out, Op op, Stream stream) {
+  auto bopt = get_binary_op_type(a, b);
+  set_binary_op_output_data(a, b, out, bopt);
+  // Register a, b and out with the encoder, then dispatch Op for out's dtype.
+  auto& encoder = cpu::get_command_encoder(stream);
+  encoder.set_input_array(a);
+  encoder.set_input_array(b);
+  encoder.set_output_array(out);
+  encoder.dispatch([a = array::unsafe_weak_copy(a),
+                    b = array::unsafe_weak_copy(b),
+                    out = array::unsafe_weak_copy(out),
+                    bopt]() mutable {
+    switch (out.dtype()) { // element type is chosen from the output dtype
+      case bool_:
+        binary_op<bool, Op>(a, b, out, bopt);
+        break;
+      case uint8:
+        binary_op<uint8_t, Op>(a, b, out, bopt);
+        break;
+      case uint16:
+        binary_op<uint16_t, Op>(a, b, out, bopt);
+        break;
+      case uint32:
+        binary_op<uint32_t, Op>(a, b, out, bopt);
+        break;
+      case uint64:
+        binary_op<uint64_t, Op>(a, b, out, bopt);
+        break;
+      case int8:
+        binary_op<int8_t, Op>(a, b, out, bopt);
+        break;
+      case int16:
+        binary_op<int16_t, Op>(a, b, out, bopt);
+        break;
+      case int32:
+        binary_op<int32_t, Op>(a, b, out, bopt);
+        break;
+      case int64:
+        binary_op<int64_t, Op>(a, b, out, bopt);
+        break;
+      case float16:
+        binary_op<float16_t, Op>(a, b, out, bopt);
+        break;
+      case float32:
+        binary_op<float, Op>(a, b, out, bopt);
+        break;
+      case float64:
+        binary_op<double, Op>(a, b, out, bopt);
+        break;
+      case bfloat16:
+        binary_op<bfloat16_t, Op>(a, b, out, bopt);
+        break;
+      case complex64:
+        binary_op<complex64_t, Op>(a, b, out, bopt);
+        break;
+    }
+  });
+}
+
+template <typename Op>
+void comparison_op(
+    const array& a,
+    const array& b,
+    array& out,
+    Op op,
+    Stream stream) {
+  auto bopt = get_binary_op_type(a, b);
+  set_binary_op_output_data(a, b, out, bopt);
+  // Switch on a's dtype; binary_op gets bool as its second type argument.
+  auto& encoder = cpu::get_command_encoder(stream);
+  encoder.set_input_array(a);
+  encoder.set_input_array(b);
+  encoder.set_output_array(out);
+  encoder.dispatch([a = array::unsafe_weak_copy(a),
+                    b = array::unsafe_weak_copy(b),
+                    out = array::unsafe_weak_copy(out),
+                    bopt]() mutable {
+    switch (a.dtype()) { // input dtype, not out.dtype()
+      case bool_:
+        binary_op<bool, bool, Op>(a, b, out, bopt);
+        break;
+      case uint8:
+        binary_op<uint8_t, bool, Op>(a, b, out, bopt);
+        break;
+      case uint16:
+        binary_op<uint16_t, bool, Op>(a, b, out, bopt);
+        break;
+      case uint32:
+        binary_op<uint32_t, bool, Op>(a, b, out, bopt);
+        break;
+      case uint64:
+        binary_op<uint64_t, bool, Op>(a, b, out, bopt);
+        break;
+      case int8:
+        binary_op<int8_t, bool, Op>(a, b, out, bopt);
+        break;
+      case int16:
+        binary_op<int16_t, bool, Op>(a, b, out, bopt);
+        break;
+      case int32:
+        binary_op<int32_t, bool, Op>(a, b, out, bopt);
+        break;
+      case int64:
+        binary_op<int64_t, bool, Op>(a, b, out, bopt);
+        break;
+      case float16:
+        binary_op<float16_t, bool, Op>(a, b, out, bopt);
+        break;
+      case float32:
+        binary_op<float, bool, Op>(a, b, out, bopt);
+        break;
+      case float64:
+        binary_op<double, bool, Op>(a, b, out, bopt);
+        break;
+      case bfloat16:
+        binary_op<bfloat16_t, bool, Op>(a, b, out, bopt);
+        break;
+      case complex64:
+        binary_op<complex64_t, bool, Op>(a, b, out, bopt);
+        break;
+    }
+  });
+}
+
+template <typename Op>
+void binary_float(
+    const array& a,
+    const array& b,
+    array& out,
+    Op op,
+    Stream stream) {
+  auto bopt = get_binary_op_type(a, b);
+  set_binary_op_output_data(a, b, out, bopt);
+  // Floating point and complex64 output dtypes only; any other dtype throws.
+  auto& encoder = cpu::get_command_encoder(stream);
+  encoder.set_input_array(a);
+  encoder.set_input_array(b);
+  encoder.set_output_array(out);
+  encoder.dispatch([a = array::unsafe_weak_copy(a),
+                    b = array::unsafe_weak_copy(b),
+                    out = array::unsafe_weak_copy(out),
+                    bopt]() mutable {
+    switch (out.dtype()) { // element type is chosen from the output dtype
+      case float16:
+        binary_op<float16_t, Op>(a, b, out, bopt);
+        break;
+      case float32:
+        binary_op<float, Op>(a, b, out, bopt);
+        break;
+      case float64:
+        binary_op<double, Op>(a, b, out, bopt);
+        break;
+      case bfloat16:
+        binary_op<bfloat16_t, Op>(a, b, out, bopt);
+        break;
+      case complex64:
+        binary_op<complex64_t, Op>(a, b, out, bopt);
+        break;
+      default:
+        throw std::runtime_error(
+            "[binary_float] Only supports floating point types.");
+    }
+  });
+}
+
+// Dispatches Op over a and b for bool and (u)int8..64 outputs; others throw.
+template <typename Op>
+void binary_int(
+    const array& a,
+    const array& b,
+    array& out,
+    Op op,
+    Stream stream) {
+  auto bopt = get_binary_op_type(a, b);
+  set_binary_op_output_data(a, b, out, bopt);
+  auto& encoder = cpu::get_command_encoder(stream);
+  encoder.set_input_array(a);
+  encoder.set_input_array(b);
+  encoder.set_output_array(out);
+  encoder.dispatch([a = array::unsafe_weak_copy(a),
+                    b = array::unsafe_weak_copy(b),
+                    out = array::unsafe_weak_copy(out),
+                    bopt]() mutable {
+    switch (out.dtype()) { // element type is chosen from the output dtype
+      case bool_:
+        binary_op<bool, Op>(a, b, out, bopt);
+        break; // must not fall through into the uint8 case
+      case uint8:
+        binary_op<uint8_t, Op>(a, b, out, bopt);
+        break;
+      case uint16:
+        binary_op<uint16_t, Op>(a, b, out, bopt);
+        break;
+      case uint32:
+        binary_op<uint32_t, Op>(a, b, out, bopt);
+        break;
+      case uint64:
+        binary_op<uint64_t, Op>(a, b, out, bopt);
+        break;
+      case int8:
+        binary_op<int8_t, Op>(a, b, out, bopt);
+        break;
+      case int16:
+        binary_op<int16_t, Op>(a, b, out, bopt);
+        break;
+      case int32:
+        binary_op<int32_t, Op>(a, b, out, bopt);
+        break;
+      case int64:
+        binary_op<int64_t, Op>(a, b, out, bopt);
+        break;
+      default:
+        throw std::runtime_error("[binary_int] Type not supported");
+    }
+  });
 }

 } // namespace
@@ -69,7 +240,7 @@ void Add::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 2);
  auto& a = inputs[0];
  auto& b = inputs[1];
-  binary(a, b, out, detail::Add());
+  binary(a, b, out, detail::Add(), stream());
 }

 void DivMod::eval_cpu(
@@ -78,70 +249,89 @@ void DivMod::eval_cpu(
  assert(inputs.size() == 2);
  auto& a = inputs[0];
  auto& b = inputs[1];
-  auto integral_op = [](auto x, auto y) {
-    return std::make_pair(x / y, x % y);
-  };
-  auto float_op = [](auto x, auto y) {
-    return std::make_pair(std::trunc(x / y), std::fmod(x, y));
-  };
-  switch (outputs[0].dtype()) {
-    case bool_:
-      binary_op<bool>(a, b, outputs, integral_op);
-    case uint8:
-      binary_op<uint8_t>(a, b, outputs, integral_op);
-      break;
-    case uint16:
-      binary_op<uint16_t>(a, b, outputs, integral_op);
-      break;
-    case uint32:
-      binary_op<uint32_t>(a, b, outputs, integral_op);
-      break;
-    case uint64:
-      binary_op<uint64_t>(a, b, outputs, integral_op);
-      break;
-    case int8:
-      binary_op<int8_t>(a, b, outputs, integral_op);
-      break;
-    case int16:
-      binary_op<int16_t>(a, b, outputs, integral_op);
-      break;
-    case int32:
-      binary_op<int32_t>(a, b, outputs, integral_op);
-      break;
-    case int64:
-      binary_op<int64_t>(a, b, outputs, integral_op);
-      break;
-    case float16:
-      binary_op<float16_t>(a, b, outputs, float_op);
-      break;
-    case float32:
-      binary_op<float>(a, b, outputs, float_op);
-      break;
-    case float64:
-      binary_op<double>(a, b, outputs, float_op);
-      break;
-    case bfloat16:
-      binary_op<bfloat16_t>(a, b, outputs, float_op);
-      break;
-    case complex64:
-      // Should never get here
-      throw std::runtime_error("[DivMod] Complex type not supported");
-      break;
-  }
+  auto bopt = get_binary_op_type(a, b);
+  auto& out_a = outputs[0];
+  auto& out_b = outputs[1];
+  set_binary_op_output_data(a, b, out_a, bopt);
+  set_binary_op_output_data(a, b, out_b, bopt);
+
+  auto& encoder = cpu::get_command_encoder(stream());
+  encoder.set_input_array(a);
+  encoder.set_input_array(b);
+  encoder.set_output_array(out_a);
+  encoder.set_output_array(out_b);
+
+  encoder.dispatch([a = array::unsafe_weak_copy(a),
+                    b = array::unsafe_weak_copy(b),
+                    out_a = array::unsafe_weak_copy(out_a),
+                    out_b = array::unsafe_weak_copy(out_b),
+                    bopt]() mutable {
+    auto integral_op = [](auto x, auto y) {
+      return std::make_pair(x / y, x % y);
+    };
+    auto float_op = [](auto x, auto y) {
+      return std::make_pair(std::trunc(x / y), std::fmod(x, y));
+    };
+    // Integer dtypes use integral_op; floating dtypes use float_op.
+    switch (out_a.dtype()) {
+      case bool_:
+        binary_op<bool>(a, b, out_a, out_b, integral_op, bopt);
+        break; // Must not fall through and rerun the op as uint8
+      case uint8:
+        binary_op<uint8_t>(a, b, out_a, out_b, integral_op, bopt);
+        break;
+      case uint16:
+        binary_op<uint16_t>(a, b, out_a, out_b, integral_op, bopt);
+        break;
+      case uint32:
+        binary_op<uint32_t>(a, b, out_a, out_b, integral_op, bopt);
+        break;
+      case uint64:
+        binary_op<uint64_t>(a, b, out_a, out_b, integral_op, bopt);
+        break;
+      case int8:
+        binary_op<int8_t>(a, b, out_a, out_b, integral_op, bopt);
+        break;
+      case int16:
+        binary_op<int16_t>(a, b, out_a, out_b, integral_op, bopt);
+        break;
+      case int32:
+        binary_op<int32_t>(a, b, out_a, out_b, integral_op, bopt);
+        break;
+      case int64:
+        binary_op<int64_t>(a, b, out_a, out_b, integral_op, bopt);
+        break;
+      case float16:
+        binary_op<float16_t>(a, b, out_a, out_b, float_op, bopt);
+        break;
+      case float32:
+        binary_op<float>(a, b, out_a, out_b, float_op, bopt);
+        break;
+      case float64:
+        binary_op<double>(a, b, out_a, out_b, float_op, bopt);
+        break;
+      case bfloat16:
+        binary_op<bfloat16_t>(a, b, out_a, out_b, float_op, bopt);
+        break;
+      case complex64:
+        // Should never get here
+        throw std::runtime_error("[DivMod] Complex type not supported");
+    }
+  });
 }

 void Divide::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 2);
  auto& a = inputs[0];
  auto& b = inputs[1];
-  binary(a, b, out, detail::Divide());
+  binary(a, b, out, detail::Divide(), stream());
 }

 void Remainder::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 2);
  auto& a = inputs[0];
  auto& b = inputs[1];
-  binary(a, b, out, detail::Remainder());
+  binary(a, b, out, detail::Remainder(), stream());
 }

 void Equal::eval_cpu(const std::vector<array>& inputs, array& out) {
@@ -149,181 +339,143 @@ void Equal::eval_cpu(const std::vector<array>& inputs, array& out) {
  auto& a = inputs[0];
  auto& b = inputs[1];
  if (equal_nan_) {
-    switch (a.dtype()) {
-      case float16:
-        binary_op<float16_t, bool>(a, b, out, detail::NaNEqual());
-        break;
-      case float32:
-        binary_op<float, bool>(a, b, out, detail::NaNEqual());
-        break;
-      case float64:
-        binary_op<double, bool>(a, b, out, detail::NaNEqual());
-        break;
-      case bfloat16:
-        binary_op<bfloat16_t, bool>(a, b, out, detail::NaNEqual());
-        break;
-      case complex64:
-        binary_op<complex64_t, bool>(a, b, out, detail::NaNEqual());
-        break;
-      default:
-        throw std::runtime_error(
-            "[NanEqual::eval_cpu] Only for floating point types.");
-    }
+    auto bopt = get_binary_op_type(a, b);
+    set_binary_op_output_data(a, b, out, bopt);
+
+    auto& encoder = cpu::get_command_encoder(stream());
+    encoder.set_input_array(a);
+    encoder.set_input_array(b);
+    encoder.set_output_array(out);
+    encoder.dispatch([a = array::unsafe_weak_copy(a),
+                      b = array::unsafe_weak_copy(b),
+                      out = array::unsafe_weak_copy(out),
+                      bopt]() mutable {
+      switch (a.dtype()) { // Input dtype selects T; output type is bool
+        case float16:
+          binary_op<float16_t, bool, detail::NaNEqual>(a, b, out, bopt);
+          break;
+        case float32:
+          binary_op<float, bool, detail::NaNEqual>(a, b, out, bopt);
+          break;
+        case float64:
+          binary_op<double, bool, detail::NaNEqual>(a, b, out, bopt);
+          break;
+        case bfloat16:
+          binary_op<bfloat16_t, bool, detail::NaNEqual>(a, b, out, bopt);
+          break;
+        case complex64:
+          binary_op<complex64_t, bool, detail::NaNEqual>(a, b, out, bopt);
+          break;
+        default:
+          throw std::runtime_error(
+              "[NanEqual::eval_cpu] Only for floating point types.");
+      }
+    });
  } else {
-    comparison_op(a, b, out, detail::Equal());
+    comparison_op(a, b, out, detail::Equal(), stream());
  }
 }

 void Greater::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 2);
-  comparison_op(inputs[0], inputs[1], out, detail::Greater());
+  comparison_op(inputs[0], inputs[1], out, detail::Greater(), stream());
 }

 void GreaterEqual::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 2);
-  comparison_op(inputs[0], inputs[1], out, detail::GreaterEqual());
+  comparison_op(inputs[0], inputs[1], out, detail::GreaterEqual(), stream());
 }

 void Less::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 2);
-  comparison_op(inputs[0], inputs[1], out, detail::Less());
+  comparison_op(inputs[0], inputs[1], out, detail::Less(), stream());
 }

 void LessEqual::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 2);
-  comparison_op(inputs[0], inputs[1], out, detail::LessEqual());
+  comparison_op(inputs[0], inputs[1], out, detail::LessEqual(), stream());
 }

 void LogAddExp::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 2);
  auto& a = inputs[0];
  auto& b = inputs[1];
-  switch (out.dtype()) {
-    case float16:
-      binary_op<float16_t>(a, b, out, detail::LogAddExp());
-      break;
-    case float32:
-      binary_op<float>(a, b, out, detail::LogAddExp());
-      break;
-    case float64:
-      binary_op<double>(a, b, out, detail::LogAddExp());
-      break;
-    case bfloat16:
-      binary_op<bfloat16_t>(a, b, out, detail::LogAddExp());
-      break;
-    default:
-      throw std::runtime_error(
-          "[LogAddExp::eval_cpu] Only supports non-complex floating point types.");
-  }
+  binary_float(a, b, out, detail::LogAddExp(), stream());
 }

 void LogicalAnd::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 2); // LogicalAnd requires two input arrays
  auto& in1 = inputs[0];
  auto& in2 = inputs[1];
-  binary(in1, in2, out, detail::LogicalAnd());
+  binary(in1, in2, out, detail::LogicalAnd(), stream());
 }

 void LogicalOr::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 2); // LogicalOr requires two input arrays
  auto& in1 = inputs[0];
  auto& in2 = inputs[1];
-  binary(in1, in2, out, detail::LogicalOr());
+  binary(in1, in2, out, detail::LogicalOr(), stream());
 }

 void Maximum::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 2);
  auto& a = inputs[0];
  auto& b = inputs[1];
-  binary(a, b, out, detail::Maximum());
+  binary(a, b, out, detail::Maximum(), stream());
 }

 void Minimum::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 2);
  auto& a = inputs[0];
  auto& b = inputs[1];
-  binary(a, b, out, detail::Minimum());
+  binary(a, b, out, detail::Minimum(), stream());
 }

 void Multiply::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 2);
  auto& a = inputs[0];
  auto& b = inputs[1];
-  binary(a, b, out, detail::Multiply());
+  binary(a, b, out, detail::Multiply(), stream());
 }

 void NotEqual::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 2);
-  comparison_op(inputs[0], inputs[1], out, detail::NotEqual());
+  comparison_op(inputs[0], inputs[1], out, detail::NotEqual(), stream());
 }

 void Power::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 2);
  auto& a = inputs[0];
  auto& b = inputs[1];
-  binary(a, b, out, detail::Power());
+  binary(a, b, out, detail::Power(), stream());
 }

 void Subtract::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 2);
  auto& a = inputs[0];
  auto& b = inputs[1];
-  binary(a, b, out, detail::Subtract());
+  binary(a, b, out, detail::Subtract(), stream());
 }

 void BitwiseBinary::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 2);
  auto& a = inputs[0];
  auto& b = inputs[1];
-  auto dispatch_type = [&a, &b, &out](auto op) {
-    switch (out.dtype()) {
-      case bool_:
-        binary_op<bool>(a, b, out, op);
-      case uint8:
-        binary_op<uint8_t>(a, b, out, op);
-        break;
-      case uint16:
-        binary_op<uint16_t>(a, b, out, op);
-        break;
-      case uint32:
-        binary_op<uint32_t>(a, b, out, op);
-        break;
-      case uint64:
-        binary_op<uint64_t>(a, b, out, op);
-        break;
-      case int8:
-        binary_op<int8_t>(a, b, out, op);
-        break;
-      case int16:
-        binary_op<int16_t>(a, b, out, op);
-        break;
-      case int32:
-        binary_op<int32_t>(a, b, out, op);
-        break;
-      case int64:
-        binary_op<int64_t>(a, b, out, op);
-        break;
-      default:
-        throw std::runtime_error(
-            "[BitwiseBinary::eval_cpu] Type not supported");
-        break;
-    }
-  };
  switch (op_) {
    case BitwiseBinary::And:
-      dispatch_type(detail::BitwiseAnd());
+      binary_int(a, b, out, detail::BitwiseAnd(), stream());
      break;
    case BitwiseBinary::Or:
-      dispatch_type(detail::BitwiseOr());
+      binary_int(a, b, out, detail::BitwiseOr(), stream());
      break;
    case BitwiseBinary::Xor:
-      dispatch_type(detail::BitwiseXor());
+      binary_int(a, b, out, detail::BitwiseXor(), stream());
      break;
    case BitwiseBinary::LeftShift:
-      dispatch_type(detail::LeftShift());
+      binary_int(a, b, out, detail::LeftShift(), stream());
      break;
    case BitwiseBinary::RightShift:
-      dispatch_type(detail::RightShift());
+      binary_int(a, b, out, detail::RightShift(), stream());
      break;
  }
 }
@@ -332,23 +484,7 @@ void ArcTan2::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 2);
  const auto& a = inputs[0];
  const auto& b = inputs[1];
-  switch (out.dtype()) {
-    case float16:
-      binary_op<float16_t>(a, b, out, detail::ArcTan2());
-      break;
-    case float32:
-      binary_op<float>(a, b, out, detail::ArcTan2());
-      break;
-    case float64:
-      binary_op<double>(a, b, out, detail::ArcTan2());
-      break;
-    case bfloat16:
-      binary_op<bfloat16_t>(a, b, out, detail::ArcTan2());
-      break;
-    default:
-      throw std::runtime_error(
-          "[ArcTan2::eval_cpu] Only supports non-complex floating point types.");
-  }
+  binary_float(a, b, out, detail::ArcTan2(), stream());
 }

 } // namespace mlx::core
--- a/mlx/backend/cpu/binary.h
+++ b/mlx/backend/cpu/binary.h
@@ -3,7 +3,6 @@
 #pragma once
 #include <cassert>

-#include "mlx/allocator.h"
 #include "mlx/array.h"
 #include "mlx/backend/common/binary.h"
 #include "mlx/backend/common/utils.h"
@@ -14,22 +13,18 @@ namespace mlx::core {

 template <typename Op>
 struct VectorScalar {
-  Op op;
-
-  VectorScalar(Op op_) : op(op_) {}
-
  template <typename T, typename U>
  void operator()(const T* a, const T* b, U* dst, int size) {
    T scalar = *b;
    constexpr int N = simd::max_size<T>;
    while (size >= N) {
-      simd::store(dst, op(simd::load<T, N>(a), simd::Simd<T, N>(scalar)));
+      simd::store(dst, Op{}(simd::load<T, N>(a), simd::Simd<T, N>(scalar)));
      dst += N;
      a += N;
      size -= N;
    }
    while (size-- > 0) {
-      *dst = op(*a, scalar);
+      *dst = Op{}(*a, scalar);
      dst++;
      a++;
    }
@@ -38,22 +33,18 @@ struct VectorScalar {

 template <typename Op>
 struct ScalarVector {
-  Op op;
-
-  ScalarVector(Op op_) : op(op_) {}
-
  template <typename T, typename U>
  void operator()(const T* a, const T* b, U* dst, int size) {
    T scalar = *a;
    constexpr int N = simd::max_size<T>;
    while (size >= N) {
-      simd::store(dst, op(simd::Simd<T, N>(scalar), simd::load<T, N>(b)));
+      simd::store(dst, Op{}(simd::Simd<T, N>(scalar), simd::load<T, N>(b)));
      dst += N;
      b += N;
      size -= N;
    }
    while (size-- > 0) {
-      *dst = op(scalar, *b);
+      *dst = Op{}(scalar, *b);
      dst++;
      b++;
    }
@@ -62,22 +53,18 @@ struct ScalarVector {

 template <typename Op>
 struct VectorVector {
-  Op op;
-
-  VectorVector(Op op_) : op(op_) {}
-
  template <typename T, typename U>
  void operator()(const T* a, const T* b, U* dst, int size) {
    constexpr int N = simd::max_size<T>;
    while (size >= N) {
-      simd::store(dst, op(simd::load<T, N>(a), simd::load<T, N>(b)));
+      simd::store(dst, Op{}(simd::load<T, N>(a), simd::load<T, N>(b)));
      dst += N;
      a += N;
      b += N;
      size -= N;
    }
    while (size-- > 0) {
-      *dst = op(*a, *b);
+      *dst = Op{}(*a, *b);
      dst++;
      a++;
      b++;
@@ -90,7 +77,6 @@ void binary_op_dims(
    const T* a,
    const T* b,
    U* out,
-    Op op,
    const Shape& shape,
    const Strides& a_strides,
    const Strides& b_strides,
@@ -104,12 +90,12 @@ void binary_op_dims(
  for (int i = 0; i < N; i++) {
    if constexpr (D > 1) {
      binary_op_dims<T, U, Op, D - 1, Strided>(
-          a, b, out, op, shape, a_strides, b_strides, out_strides, axis + 1);
+          a, b, out, shape, a_strides, b_strides, out_strides, axis + 1);
    } else {
      if constexpr (Strided) {
-        op(a, b, out, stride_out);
+        Op{}(a, b, out, stride_out);
      } else {
-        *out = op(*a, *b);
+        *out = Op{}(*a, *b);
      }
    }
    out += stride_out;
@@ -120,66 +106,38 @@ void binary_op_dims(

 template <typename T, typename U, bool Strided, typename Op>
 void binary_op_dispatch_dims(
-    const array& a,
-    const array& b,
-    array& out,
-    Op op,
+    const T* a,
+    const T* b,
+    U* out,
    int dim,
+    int size,
    const Shape& shape,
    const Strides& a_strides,
    const Strides& b_strides,
    const Strides& out_strides) {
-  const T* a_ptr = a.data<T>();
-  const T* b_ptr = b.data<T>();
-  U* out_ptr = out.data<U>();
  switch (dim) {
    case 1:
      binary_op_dims<T, U, Op, 1, Strided>(
-          a_ptr,
-          b_ptr,
-          out_ptr,
-          op,
-          shape,
-          a_strides,
-          b_strides,
-          out_strides,
-          0);
+          a, b, out, shape, a_strides, b_strides, out_strides, 0);
      return;
    case 2:
      binary_op_dims<T, U, Op, 2, Strided>(
-          a_ptr,
-          b_ptr,
-          out_ptr,
-          op,
-          shape,
-          a_strides,
-          b_strides,
-          out_strides,
-          0);
+          a, b, out, shape, a_strides, b_strides, out_strides, 0);
      return;
    case 3:
      binary_op_dims<T, U, Op, 3, Strided>(
-          a_ptr,
-          b_ptr,
-          out_ptr,
-          op,
-          shape,
-          a_strides,
-          b_strides,
-          out_strides,
-          0);
+          a, b, out, shape, a_strides, b_strides, out_strides, 0);
      return;
  }

  ContiguousIterator a_it(shape, a_strides, dim - 3);
  ContiguousIterator b_it(shape, b_strides, dim - 3);
  auto stride = out_strides[dim - 4];
-  for (int64_t elem = 0; elem < a.size(); elem += stride) {
+  for (int64_t elem = 0; elem < size; elem += stride) {
    binary_op_dims<T, U, Op, 3, Strided>(
-        a_ptr + a_it.loc,
-        b_ptr + b_it.loc,
-        out_ptr + elem,
-        op,
+        a + a_it.loc,
+        b + b_it.loc,
+        out + elem,
        shape,
        a_strides,
        b_strides,
@@ -191,40 +149,41 @@ void binary_op_dispatch_dims(
 }

 template <typename T, typename U, typename Op>
-void binary_op(const array& a, const array& b, array& out, Op op) {
-  auto bopt = get_binary_op_type(a, b);
-  set_binary_op_output_data(a, b, out, bopt);
-
+void binary_op(const array& a, const array& b, array& out, BinaryOpType bopt) {
  // The full computation is scalar scalar so call the base op once
+  auto a_ptr = a.data<T>();
+  auto b_ptr = b.data<T>();
+
+  auto out_ptr = out.data<U>();
  if (bopt == BinaryOpType::ScalarScalar) {
-    *(out.data<U>()) = op(*a.data<T>(), *b.data<T>());
+    *out_ptr = Op{}(*a_ptr, *b_ptr);
    return;
  }

  // The full computation is scalar vector so delegate to the op
  if (bopt == BinaryOpType::ScalarVector) {
-    ScalarVector{op}(a.data<T>(), b.data<T>(), out.data<U>(), b.data_size());
+    ScalarVector<Op>{}(a_ptr, b_ptr, out_ptr, b.data_size());
    return;
  }

  // The full computation is vector scalar so delegate to the op
  if (bopt == BinaryOpType::VectorScalar) {
-    VectorScalar{op}(a.data<T>(), b.data<T>(), out.data<U>(), a.data_size());
+    VectorScalar<Op>{}(a_ptr, b_ptr, out_ptr, a.data_size());
    return;
  }

  // The full computation is vector vector so delegate to the op
  if (bopt == BinaryOpType::VectorVector) {
-    VectorVector{op}(a.data<T>(), b.data<T>(), out.data<U>(), out.size());
+    VectorVector<Op>{}(a_ptr, b_ptr, out_ptr, a.size());
    return;
  }

  // General computation so let's try to optimize
  auto [new_shape, new_strides] = collapse_contiguous_dims(
      a.shape(), {a.strides(), b.strides(), out.strides()});
-  const auto& a_strides = new_strides[0];
-  const auto& b_strides = new_strides[1];
-  const auto& strides = new_strides[2];
+  auto& a_strides = new_strides[0];
+  auto& b_strides = new_strides[1];
+  auto& strides = new_strides[2];

  // Get the left-most dim such that the array is row contiguous after
  auto leftmost_rc_dim = [&strides](const auto& arr_strides) {
@@ -248,7 +207,8 @@ void binary_op(const array& a, const array& b, array& out, Op op) {

  auto ndim = new_shape.size();

-  // Case 1: LxM and FxM where L and F are broadcastable and M is row contiguous
+  // Case 1: LxM and FxM where L and F are broadcastable and M is row
+  // contiguous
  int dim = ndim;
  if (int d = std::max(a_rc_dim, b_rc_dim); d < ndim) {
    bopt = BinaryOpType::VectorVector;
@@ -275,99 +235,59 @@ void binary_op(const array& a, const array& b, array& out, Op op) {

  switch (bopt) {
    case BinaryOpType::VectorVector:
-      binary_op_dispatch_dims<T, U, true>(
-          a,
-          b,
-          out,
-          VectorVector{op},
+      binary_op_dispatch_dims<T, U, true, VectorVector<Op>>(
+          a_ptr,
+          b_ptr,
+          out_ptr,
          dim,
+          a.size(),
          new_shape,
          a_strides,
          b_strides,
          strides);
      break;
    case BinaryOpType::VectorScalar:
-      binary_op_dispatch_dims<T, U, true>(
-          a,
-          b,
-          out,
-          VectorScalar{op},
+      binary_op_dispatch_dims<T, U, true, VectorScalar<Op>>(
+          a_ptr,
+          b_ptr,
+          out_ptr,
          dim,
+          a.size(),
          new_shape,
          a_strides,
          b_strides,
          strides);
      break;
    case BinaryOpType::ScalarVector:
-      binary_op_dispatch_dims<T, U, true>(
-          a,
-          b,
-          out,
-          ScalarVector{op},
+      binary_op_dispatch_dims<T, U, true, ScalarVector<Op>>(
+          a_ptr,
+          b_ptr,
+          out_ptr,
          dim,
+          a.size(),
          new_shape,
          a_strides,
          b_strides,
          strides);
      break;
    default:
-      binary_op_dispatch_dims<T, U, false>(
-          a, b, out, op, dim, new_shape, a_strides, b_strides, strides);
+      binary_op_dispatch_dims<T, U, false, Op>(
+          a_ptr,
+          b_ptr,
+          out_ptr,
+          dim,
+          a.size(),
+          new_shape,
+          a_strides,
+          b_strides,
+          strides);
      break;
  }
 }

 template <typename T, typename Op>
-void binary_op(const array& a, const array& b, array& out, Op op) {
-  binary_op<T, T>(a, b, out, op);
-}
-
-template <typename Op>
-void binary(const array& a, const array& b, array& out, Op op) {
-  switch (out.dtype()) {
-    case bool_:
-      binary_op<bool>(a, b, out, op);
-      break;
-    case uint8:
-      binary_op<uint8_t>(a, b, out, op);
-      break;
-    case uint16:
-      binary_op<uint16_t>(a, b, out, op);
-      break;
-    case uint32:
-      binary_op<uint32_t>(a, b, out, op);
-      break;
-    case uint64:
-      binary_op<uint64_t>(a, b, out, op);
-      break;
-    case int8:
-      binary_op<int8_t>(a, b, out, op);
-      break;
-    case int16:
-      binary_op<int16_t>(a, b, out, op);
-      break;
-    case int32:
-      binary_op<int32_t>(a, b, out, op);
-      break;
-    case int64:
-      binary_op<int64_t>(a, b, out, op);
-      break;
-    case float16:
-      binary_op<float16_t>(a, b, out, op);
-      break;
-    case float32:
-      binary_op<float>(a, b, out, op);
-      break;
-    case float64:
-      binary_op<double>(a, b, out, op);
-      break;
-    case bfloat16:
-      binary_op<bfloat16_t>(a, b, out, op);
-      break;
-    case complex64:
-      binary_op<complex64_t>(a, b, out, op);
-      break;
-  }
+void binary_op(const array& a, const array& b, array& out, BinaryOpType bopt) {
+  binary_op<T, T, Op>(a, b, out, bopt);
 }

 } // namespace mlx::core
--- a/mlx/backend/cpu/binary_two.h
+++ b/mlx/backend/cpu/binary_two.h
@@ -58,14 +58,14 @@ void binary_op_dispatch_dims(
    Op op) {
  auto [shape, strides] = collapse_contiguous_dims(
      a.shape(), {a.strides(), b.strides(), out_a.strides()});
-  const auto& a_strides = strides[0];
-  const auto& b_strides = strides[1];
-  const auto& out_strides = strides[2];
  const T* a_ptr = a.data<T>();
  const T* b_ptr = b.data<T>();
  U* out_a_ptr = out_a.data<U>();
  U* out_b_ptr = out_b.data<U>();

+  const auto& a_strides = strides[0];
+  const auto& b_strides = strides[1];
+  const auto& out_strides = strides[2];
  int ndim = shape.size();
  switch (ndim) {
    case 1:
@@ -120,14 +120,10 @@ template <typename T, typename U = T, typename Op>
 void binary_op(
    const array& a,
    const array& b,
-    std::vector<array>& outputs,
-    Op op) {
-  auto bopt = get_binary_op_type(a, b);
-  auto& out_a = outputs[0];
-  auto& out_b = outputs[1];
-  set_binary_op_output_data(a, b, out_a, bopt);
-  set_binary_op_output_data(a, b, out_b, bopt);
-
+    array& out_a,
+    array& out_b,
+    Op op,
+    BinaryOpType bopt) {
  // The full computation is scalar scalar so call the base op once
  if (bopt == BinaryOpType::General) {
    binary_op_dispatch_dims<T, U, Op>(a, b, out_a, out_b, op);
@@ -141,14 +137,14 @@ void binary_op(
  if (bopt == BinaryOpType::ScalarScalar) {
    std::tie(*out_a_ptr, *out_b_ptr) = op(*a_ptr, *b_ptr);
  } else if (bopt == BinaryOpType::ScalarVector) {
-    for (size_t i = 0; i < b.size(); ++i) {
+    for (size_t i = 0; i < b.data_size(); ++i) {
      std::tie(*out_a_ptr, *out_b_ptr) = op(*a_ptr, *b_ptr);
      out_a_ptr++;
      out_b_ptr++;
      b_ptr++;
    }
  } else if (bopt == BinaryOpType::VectorScalar) {
-    for (size_t i = 0; i < a.size(); ++i) {
+    for (size_t i = 0; i < a.data_size(); ++i) {
      std::tie(*out_a_ptr, *out_b_ptr) = op(*a_ptr, *b_ptr);
      out_a_ptr++;
      out_b_ptr++;
@@ -165,58 +161,6 @@ void binary_op(
  }
 }

-template <typename Op>
-void binary(
-    const array& a,
-    const array& b,
-    std::vector<array>& outputs,
-    Op op) {
-  switch (outputs[0].dtype()) {
-    case bool_:
-      binary_op<bool>(a, b, outputs, op);
-      break;
-    case uint8:
-      binary_op<uint8_t>(a, b, outputs, op);
-      break;
-    case uint16:
-      binary_op<uint16_t>(a, b, outputs, op);
-      break;
-    case uint32:
-      binary_op<uint32_t>(a, b, outputs, op);
-      break;
-    case uint64:
-      binary_op<uint64_t>(a, b, outputs, op);
-      break;
-    case int8:
-      binary_op<int8_t>(a, b, outputs, op);
-      break;
-    case int16:
-      binary_op<int16_t>(a, b, outputs, op);
-      break;
-    case int32:
-      binary_op<int32_t>(a, b, outputs, op);
-      break;
-    case int64:
-      binary_op<int64_t>(a, b, outputs, op);
-      break;
-    case float16:
-      binary_op<float16_t>(a, b, outputs, op);
-      break;
-    case float32:
-      binary_op<float>(a, b, outputs, op);
-      break;
-    case float64:
-      binary_op<double>(a, b, outputs, op);
-      break;
-    case bfloat16:
-      binary_op<bfloat16_t>(a, b, outputs, op);
-      break;
-    case complex64:
-      binary_op<complex64_t>(a, b, outputs, op);
-      break;
-  }
-}
-
 } // namespace

 } // namespace mlx::core
--- a/mlx/backend/cpu/cholesky.cpp
+++ b/mlx/backend/cpu/cholesky.cpp
@@ -2,6 +2,7 @@

 #include "mlx/allocator.h"
 #include "mlx/backend/cpu/copy.h"
+#include "mlx/backend/cpu/encoder.h"
 #include "mlx/backend/cpu/lapack.h"
 #include "mlx/linalg.h"
 #include "mlx/primitives.h"
@@ -9,7 +10,7 @@
 namespace mlx::core {

 template <typename T>
-void cholesky_impl(const array& a, array& factor, bool upper) {
+void cholesky_impl(const array& a, array& factor, bool upper, Stream stream) {
  // Lapack uses the column-major convention. We take advantage of the fact that
  // the matrix should be symmetric:
  //   (A)ᵀ = A
@@ -17,60 +18,63 @@ void cholesky_impl(const array& a, array& factor, bool upper) {
  // triangular matrix, so uplo is the opposite of what we would expect from
  // upper

-  char uplo = (upper) ? 'L' : 'U';
-
  // The decomposition is computed in place, so just copy the input to the
  // output.
-  copy(
+  copy_cpu(
      a,
      factor,
-      a.flags().row_contiguous ? CopyType::Vector : CopyType::General);
+      a.flags().row_contiguous ? CopyType::Vector : CopyType::General,
+      stream);

-  const int N = a.shape(-1);
-  const size_t num_matrices = a.size() / (N * N);
+  auto& encoder = cpu::get_command_encoder(stream);
+  encoder.set_output_array(factor);
+  encoder.dispatch([matrix = factor.data<T>(),
+                    upper,
+                    N = a.shape(-1),
+                    size = a.size()]() mutable {
+    char uplo = (upper) ? 'L' : 'U'; // Flipped: LAPACK is column-major
+    size_t num_matrices = (N > 0) ? size / (static_cast<size_t>(N) * N) : 0;
+    for (size_t i = 0; i < num_matrices; i++) {
+      // Compute Cholesky factorization.
+      int info;
+      potrf<T>(
+          /* uplo = */ &uplo,
+          /* n = */ &N,
+          /* a = */ matrix,
+          /* lda = */ &N,
+          /* info = */ &info);

-  T* matrix = factor.data<T>();
-
-  for (int i = 0; i < num_matrices; i++) {
-    // Compute Cholesky factorization.
-    int info;
-    potrf<T>(
-        /* uplo = */ &uplo,
-        /* n = */ &N,
-        /* a = */ matrix,
-        /* lda = */ &N,
-        /* info = */ &info);
-
-    // TODO: We do nothing when the matrix is not positive semi-definite
-    // because throwing an error would result in a crash. If we figure out how
-    // to catch errors from the implementation we should throw.
-    if (info < 0) {
-      std::stringstream msg;
-      msg << "[cholesky] Cholesky decomposition failed with error code "
-          << info;
-      throw std::runtime_error(msg.str());
-    }
-
-    // Zero out the upper/lower triangle while advancing the pointer to the
-    // next matrix at the same time.
-    for (int row = 0; row < N; row++) {
-      if (upper) {
-        std::fill(matrix, matrix + row, 0);
-      } else {
-        std::fill(matrix + row + 1, matrix + N, 0);
+      // TODO: We do nothing when the matrix is not positive semi-definite
+      // because throwing an error would result in a crash. If we figure out how
+      // to catch errors from the implementation we should throw.
+      if (info < 0) {
+        std::stringstream msg;
+        msg << "[Cholesky::eval_cpu] Cholesky decomposition failed with error code "
+            << info;
+        throw std::runtime_error(msg.str());
+      }
+
+      // Zero out the upper/lower triangle while advancing the pointer to the
+      // next matrix at the same time.
+      for (int row = 0; row < N; row++) {
+        if (upper) {
+          std::fill(matrix, matrix + row, 0);
+        } else {
+          std::fill(matrix + row + 1, matrix + N, 0);
+        }
+        matrix += N;
      }
-      matrix += N;
    }
-  }
+  });
 }

 void Cholesky::eval_cpu(const std::vector<array>& inputs, array& output) {
  switch (inputs[0].dtype()) {
    case float32:
-      cholesky_impl<float>(inputs[0], output, upper_);
+      cholesky_impl<float>(inputs[0], output, upper_, stream());
      break;
    case float64:
-      cholesky_impl<double>(inputs[0], output, upper_);
+      cholesky_impl<double>(inputs[0], output, upper_, stream());
      break;
    default:
      throw std::runtime_error(
--- a/mlx/backend/cpu/compiled.cpp
+++ b/mlx/backend/cpu/compiled.cpp
@@ -11,6 +11,7 @@

 #include "mlx/backend/common/compiled.h"
 #include "mlx/backend/cpu/compiled_preamble.h"
+#include "mlx/backend/cpu/encoder.h"
 #include "mlx/backend/cpu/jit_compiler.h"
 #include "mlx/device.h"
 #include "mlx/graph_utils.h"
@@ -39,7 +40,10 @@ struct CompilerCache {
  std::shared_mutex mtx;
 };

-static CompilerCache cache{};
+static CompilerCache& cache() {
+  static CompilerCache cache_; // Constructed on the first call
+  return cache_;
+}

 // GPU compile is always available if the GPU is available and since we are in
 // this file CPU compile is also available.
@@ -55,14 +59,16 @@ void* compile(
    const std::string& kernel_name,
    const std::function<std::string(void)>& source_builder) {
  {
-    std::shared_lock lock(cache.mtx);
-    if (auto it = cache.kernels.find(kernel_name); it != cache.kernels.end()) {
+    std::shared_lock lock(cache().mtx);
+    if (auto it = cache().kernels.find(kernel_name);
+        it != cache().kernels.end()) {
      return it->second;
    }
  }

-  std::unique_lock lock(cache.mtx);
-  if (auto it = cache.kernels.find(kernel_name); it != cache.kernels.end()) {
+  std::unique_lock lock(cache().mtx);
+  if (auto it = cache().kernels.find(kernel_name);
+      it != cache().kernels.end()) {
    return it->second;
  }
  std::string source_code = source_builder();
@@ -119,10 +125,10 @@ void* compile(
  }

  // load library
-  cache.libs.emplace_back(shared_lib_path);
+  cache().libs.emplace_back(shared_lib_path);

  // Load function
-  void* fun = dlsym(cache.libs.back().lib, kernel_name.c_str());
+  void* fun = dlsym(cache().libs.back().lib, kernel_name.c_str());
  if (!fun) {
    std::ostringstream msg;
    msg << "[Compile::eval_cpu] Failed to load compiled function "
@@ -130,7 +136,7 @@ void* compile(
        << dlerror();
    throw std::runtime_error(msg.str());
  }
-  cache.kernels.insert({kernel_name, fun});
+  cache().kernels.insert({kernel_name, fun});
  return fun;
 }

@@ -140,18 +146,9 @@ inline void build_kernel(
    const std::vector<array>& inputs,
    const std::vector<array>& outputs,
    const std::vector<array>& tape,
-    const std::unordered_set<uintptr_t>& constant_ids,
+    const std::function<bool(size_t)>& is_constant,
    bool contiguous,
    int ndim) {
-  // All outputs should have the exact same shape and will be row contiguous
-  auto output_shape = outputs[0].shape();
-  auto output_strides = outputs[0].strides();
-
-  // Constants are scalars that are captured by value and cannot change
-  auto is_constant = [&constant_ids](const array& x) {
-    return constant_ids.find(x.id()) != constant_ids.end();
-  };
-
  NodeNamer namer;

 #ifdef _MSC_VER
@@ -164,14 +161,15 @@ inline void build_kernel(

  // Add the input arguments
  int cnt = 0;
-  for (auto& x : inputs) {
-    auto& xname = namer.get_name(x);
-
+  for (size_t i = 0; i < inputs.size(); ++i) {
    // Skip constants from the input list
-    if (is_constant(x)) {
+    if (is_constant(i)) {
      continue;
    }

+    const auto& x = inputs[i];
+    auto& xname = namer.get_name(x);
+
    auto tstr = get_type_string(x.dtype());
    os << "  " << tstr << "* " << xname << " = (" << tstr << "*)args[" << cnt++
       << "];" << std::endl;
@@ -205,10 +203,11 @@ inline void build_kernel(
  }

  // Read the inputs in tmps
-  for (auto& x : inputs) {
+  for (size_t i = 0; i < inputs.size(); ++i) {
+    const auto& x = inputs[i];
    auto& xname = namer.get_name(x);

-    if (is_constant(x)) {
+    if (is_constant(i)) {
      os << "  " << get_type_string(x.dtype()) << " tmp_" << xname << " = ";
      print_constant(os, x);
      os << ";" << std::endl;
@@ -232,7 +231,7 @@ inline void build_kernel(
      os << "static_cast<" << get_type_string(x.dtype()) << ">(tmp_"
         << namer.get_name(x.inputs()[0]) << ");" << std::endl;
    } else {
-      x.primitive().print(os);
+      os << x.primitive().name();
      os << "()(";
      for (int i = 0; i < x.inputs().size() - 1; i++) {
        os << "tmp_" << namer.get_name(x.inputs()[i]) << ", ";
@@ -258,8 +257,9 @@ inline void build_kernel(
  } else {
    for (int d = ndim - 1; d >= 0; --d) {
      // Update pointers
-      for (auto& x : inputs) {
-        if (is_constant(x) || is_scalar(x)) {
+      for (size_t i = 0; i < inputs.size(); ++i) {
+        const auto& x = inputs[i];
+        if (is_constant(i) || is_scalar(x)) {
          continue;
        }
        auto& xname = namer.get_name(x);
@@ -281,63 +281,45 @@ inline void build_kernel(
 void Compiled::eval_cpu(
    const std::vector<array>& inputs,
    std::vector<array>& outputs) {
-  if (kernel_lib_.empty()) {
-    kernel_lib_ = build_lib_name(inputs_, outputs_, tape_, constant_ids_);
+  auto& encoder = cpu::get_command_encoder(stream());
+
+  // Collapse contiguous dims to route to a faster kernel if possible. Also
+  // handle all broadcasting.
+  auto [contiguous, shape, strides] =
+      compiled_collapse_contiguous_dims(inputs, outputs[0], is_constant_);
+
+  // Force allocating shape/strides on heap so we can take their data() first
+  // and then std::move them.
+  // TODO: Refactor code to avoid heap allocation.
+  shape.grow();
+  for (auto& s : strides) {
+    s.grow();
  }

-  // Figure out which kernel we are using
-  auto& shape = outputs[0].shape();
-  auto contiguous = compiled_check_contiguity(inputs, shape);
-
-  // Handle all broadcasting and collect function input arguments
+  // Collect function input arguments.
  std::vector<void*> args;
-  std::vector<std::vector<size_t>> strides;
-  for (int i = 0; i < inputs.size(); i++) {
-    // Skip constants.
-    if (constant_ids_.find(inputs_[i].id()) != constant_ids_.end()) {
+  int strides_index = 1;
+  for (size_t i = 0; i < inputs.size(); ++i) {
+    if (is_constant_(i)) {
      continue;
    }
-    auto& x = inputs[i];
+    const auto& x = inputs[i];
+    encoder.set_input_array(x);
    args.push_back((void*)x.data<void>());
-
-    if (contiguous || is_scalar(x)) {
-      continue;
+    if (!contiguous && !is_scalar(x)) {
+      args.push_back(strides[strides_index++].data());
    }
-
-    // Broadcast the input to the output shape.
-    std::vector<size_t> xstrides;
-    int j = 0;
-    for (; j < shape.size() - x.ndim(); j++) {
-      if (shape[j] == 1) {
-        xstrides.push_back(outputs[0].strides()[j]);
-      } else {
-        xstrides.push_back(0);
-      }
-    }
-    for (int i = 0; i < x.ndim(); i++, j++) {
-      if (x.shape(i) == 1) {
-        if (shape[j] == 1) {
-          xstrides.push_back(outputs[0].strides()[j]);
-        } else {
-          xstrides.push_back(0);
-        }
-      } else {
-        xstrides.push_back(x.strides()[i]);
-      }
-    }
-    strides.push_back(std::move(xstrides));
-    args.push_back(strides.back().data());
  }

  // Get the kernel name from the lib
  int ndim = shape.size();
  auto kernel_name = kernel_lib_ + (contiguous ? "_contiguous" : "_strided_");
  if (!contiguous) {
-    kernel_name += std::to_string(shape.size());
+    kernel_name += std::to_string(ndim);
  }

  // Get the function
-  auto fn_ptr = compile(kernel_name, [&]() {
+  auto fn_ptr = compile(kernel_name, [&, contiguous = contiguous]() {
    std::ostringstream kernel;
    kernel << get_kernel_preamble() << std::endl;
    kernel << "extern \"C\"  {" << std::endl;
@@ -347,7 +329,7 @@ void Compiled::eval_cpu(
        inputs_,
        outputs_,
        tape_,
-        constant_ids_,
+        is_constant_,
        contiguous,
        ndim);
    // Close extern "C"
@@ -355,19 +337,22 @@ void Compiled::eval_cpu(
    return kernel.str();
  });

-  compiled_allocate_outputs(
-      inputs, outputs, inputs_, constant_ids_, contiguous, false);
+  compiled_allocate_outputs(inputs, outputs, is_constant_, contiguous);

  for (auto& x : outputs) {
    args.push_back(x.data<void>());
+    encoder.set_output_array(x);
  }
  if (!contiguous) {
-    args.push_back((void*)outputs[0].shape().data());
+    args.push_back((void*)shape.data());
  } else {
    args.push_back((void*)outputs[0].data_size());
  }
  auto fun = (void (*)(void**))fn_ptr;
-  fun(args.data());
+  encoder.dispatch([fun,
+                    args = std::move(args),
+                    strides = std::move(strides),
+                    shape = std::move(shape)]() mutable { fun(args.data()); });
 }

 } // namespace mlx::core
--- a/mlx/backend/cpu/conv.cpp
+++ b/mlx/backend/cpu/conv.cpp
--- a/mlx/backend/cpu/copy.cpp
+++ b/mlx/backend/cpu/copy.cpp
@@ -5,6 +5,7 @@
 #include "mlx/allocator.h"
 #include "mlx/backend/common/utils.h"
 #include "mlx/backend/cpu/copy.h"
+#include "mlx/backend/cpu/encoder.h"
 #include "mlx/backend/cpu/simd/simd.h"

 namespace mlx::core {
@@ -13,19 +14,19 @@ namespace {

 template <typename SrcT, typename DstT>
 void copy_single(const array& src, array& dst) {
-  auto val = static_cast<DstT>(src.data<SrcT>()[0]);
+  auto src_ptr = src.data<SrcT>();
  auto dst_ptr = dst.data<DstT>();
-  for (int i = 0; i < dst.size(); ++i) {
-    dst_ptr[i] = val;
-  }
+  auto size = dst.size();
+  auto val = static_cast<DstT>(src_ptr[0]);
+  std::fill_n(dst_ptr, size, val);
 }

 template <typename SrcT, typename DstT>
 void copy_vector(const array& src, array& dst) {
  auto src_ptr = src.data<SrcT>();
  auto dst_ptr = dst.data<DstT>();
-  size_t size = src.data_size();
-  std::copy(src_ptr, src_ptr + src.data_size(), dst_ptr);
+  auto size = src.data_size();
+  std::copy(src_ptr, src_ptr + size, dst_ptr);
 }

 template <typename SrcT, typename DstT, int D>
@@ -60,36 +61,57 @@ void copy_general_general(
    const Strides& i_strides,
    const Strides& o_strides,
    int64_t i_offset,
-    int64_t o_offset) {
+    int64_t o_offset,
+    const std::optional<array>& dynamic_i_offset,
+    const std::optional<array>& dynamic_o_offset) {
+  auto src_ptr = src.data<SrcT>() + i_offset;
+  auto dst_ptr = dst.data<DstT>() + o_offset;
+  auto i_offset_ptr =
+      dynamic_i_offset ? dynamic_i_offset->data<int64_t>() : nullptr;
+  auto o_offset_ptr =
+      dynamic_o_offset ? dynamic_o_offset->data<int64_t>() : nullptr;
+  auto size = src.size();
  if (data_shape.empty()) {
-    auto val = static_cast<DstT>(*(src.data<SrcT>() + i_offset));
-    auto dst_ptr = dst.data<DstT>() + o_offset;
+    auto val = static_cast<DstT>(*src_ptr);
    *dst_ptr = val;
    return;
  }
  auto [shape, strides] =
      collapse_contiguous_dims(data_shape, {i_strides, o_strides});
-  auto src_ptr = src.data<SrcT>() + i_offset;
-  auto dst_ptr = dst.data<DstT>() + o_offset;
+
  int ndim = shape.size();
-  if (ndim == 1) {
-    copy_dims<SrcT, DstT, 1>(
-        src_ptr, dst_ptr, shape, strides[0], strides[1], 0);
-    return;
-  } else if (ndim == 2) {
-    copy_dims<SrcT, DstT, 2>(
-        src_ptr, dst_ptr, shape, strides[0], strides[1], 0);
-    return;
-  } else if (ndim == 3) {
-    copy_dims<SrcT, DstT, 3>(
-        src_ptr, dst_ptr, shape, strides[0], strides[1], 0);
+  if (ndim <= 3) {
+    if (i_offset_ptr) {
+      src_ptr += i_offset_ptr[0];
+    }
+    if (o_offset_ptr) {
+      dst_ptr += o_offset_ptr[0];
+    }
+    // Ranks 1-3 go straight to copy_dims; higher ranks use the iterators below.
+    if (ndim == 1) {
+      copy_dims<SrcT, DstT, 1>(
+          src_ptr, dst_ptr, shape, strides[0], strides[1], 0);
+    } else if (ndim == 2) {
+      copy_dims<SrcT, DstT, 2>(
+          src_ptr, dst_ptr, shape, strides[0], strides[1], 0);
+    } else if (ndim == 3) {
+      copy_dims<SrcT, DstT, 3>(
+          src_ptr, dst_ptr, shape, strides[0], strides[1], 0);
+    }
     return;
   }
+  if (i_offset_ptr) {
+    src_ptr += i_offset_ptr[0];
+  }
+  if (o_offset_ptr) {
+    dst_ptr += o_offset_ptr[0];
+  }
+
  ContiguousIterator in(shape, strides[0], ndim - 3);
  ContiguousIterator out(shape, strides[1], ndim - 3);
  auto stride = std::accumulate(
      shape.end() - 3, shape.end(), 1, std::multiplies<int64_t>());
-  for (int64_t elem = 0; elem < src.size(); elem += stride) {
+  for (int64_t elem = 0; elem < size; elem += stride) {
    copy_dims<SrcT, DstT, 3>(
        src_ptr + in.loc,
        dst_ptr + out.loc,
@@ -105,7 +127,15 @@ void copy_general_general(
 template <typename SrcT, typename DstT>
 inline void copy_general_general(const array& src, array& dst) {
  copy_general_general<SrcT, DstT>(
-      src, dst, src.shape(), src.strides(), dst.strides(), 0, 0);
+      src,
+      dst,
+      src.shape(),
+      src.strides(),
+      dst.strides(),
+      0,
+      0,
+      std::nullopt,
+      std::nullopt);
 }

 template <typename SrcT, typename DstT>
@@ -116,7 +146,9 @@ void copy_general(
    const Strides& i_strides,
    const Strides&,
    int64_t i_offset,
-    int64_t o_offset) {
+    int64_t o_offset,
+    const std::optional<array>& dynamic_i_offset,
+    const std::optional<array>& dynamic_o_offset) {
  copy_general_general<SrcT, DstT>(
      src,
      dst,
@@ -124,7 +156,9 @@ void copy_general(
      i_strides,
      make_contiguous_strides(data_shape),
      i_offset,
-      o_offset);
+      o_offset,
+      dynamic_i_offset,
+      dynamic_o_offset);
 }

 template <typename SrcT, typename DstT>
@@ -136,7 +170,9 @@ inline void copy_general(const array& src, array& dst) {
      src.strides(),
      make_contiguous_strides(src.shape()),
      0,
-      0);
+      0,
+      std::nullopt,
+      std::nullopt);
 }

 template <typename SrcT, typename DstT, typename... Args>
@@ -259,38 +295,34 @@ inline void copy_inplace_dispatch(

 } // namespace

-void copy_inplace(const array& src, array& dst, CopyType ctype) {
-  copy_inplace_dispatch(src, dst, ctype);
+void copy_cpu_inplace(
+    const array& src,
+    array& dst,
+    CopyType ctype,
+    Stream stream) {
+  auto& encoder = cpu::get_command_encoder(stream);
+  encoder.set_input_array(src);
+  encoder.set_output_array(dst);
+  encoder.dispatch(
+      [src = array::unsafe_weak_copy(src),
+       dst = array::unsafe_weak_copy(dst),
+       ctype]() mutable { copy_inplace_dispatch(src, dst, ctype); });
 }

-void copy(const array& src, array& dst, CopyType ctype) {
-  // Allocate the output
-  switch (ctype) {
-    case CopyType::Vector:
-      if (src.is_donatable() && src.itemsize() == dst.itemsize()) {
-        dst.copy_shared_buffer(src);
-      } else {
-        auto size = src.data_size();
-        dst.set_data(
-            allocator::malloc_or_wait(size * dst.itemsize()),
-            size,
-            src.strides(),
-            src.flags());
-      }
-      break;
-    case CopyType::Scalar:
-    case CopyType::General:
-    case CopyType::GeneralGeneral:
-      dst.set_data(allocator::malloc_or_wait(dst.nbytes()));
-      break;
+void copy_cpu(const array& src, array& dst, CopyType ctype, Stream stream) {
+  bool donated = set_copy_output_data(src, dst, ctype);
+  if (donated && src.dtype() == dst.dtype()) {
+    // If the output has the same type as the input then there is nothing to
+    // copy, just use the buffer.
+    return;
  }
  if (ctype == CopyType::GeneralGeneral) {
    ctype = CopyType::General;
  }
-  copy_inplace(src, dst, ctype);
+  copy_cpu_inplace(src, dst, ctype, stream);
 }

-void copy_inplace(
+void copy_cpu_inplace(
    const array& src,
    array& dst,
    const Shape& data_shape,
@@ -298,24 +330,57 @@ void copy_inplace(
    const Strides& o_strides,
    int64_t i_offset,
    int64_t o_offset,
-    CopyType ctype) {
-  switch (ctype) {
-    case CopyType::General:
-    case CopyType::GeneralGeneral:
-      copy_inplace_dispatch(
-          src,
-          dst,
-          ctype,
-          data_shape,
-          i_strides,
-          o_strides,
-          i_offset,
-          o_offset);
-      break;
-    case CopyType::Scalar:
-    case CopyType::Vector:
-      copy_inplace_dispatch(src, dst, ctype);
-  }
+    CopyType ctype,
+    Stream stream,
+    const std::optional<array>& dynamic_i_offset, /* = std::nullopt */
+    const std::optional<array>& dynamic_o_offset /* = std::nullopt */) {
+  auto& encoder = cpu::get_command_encoder(stream);
+  encoder.set_input_array(src);
+  encoder.set_output_array(dst);
+  auto weak_copy_if_set = [](auto x) -> std::optional<array> {
+    if (x) {
+      return array::unsafe_weak_copy(*x);
+    } else {
+      return std::nullopt;
+    }
+  };
+  encoder.dispatch(
+      [src = array::unsafe_weak_copy(src),
+       dst = array::unsafe_weak_copy(dst),
+       data_shape,
+       i_strides,
+       o_strides,
+       i_offset,
+       o_offset,
+       ctype,
+       dynamic_i_offset = weak_copy_if_set(dynamic_i_offset),
+       dynamic_o_offset = weak_copy_if_set(dynamic_o_offset)]() mutable {
+        switch (ctype) {
+          case CopyType::General:
+          case CopyType::GeneralGeneral:
+            copy_inplace_dispatch(
+                src,
+                dst,
+                ctype,
+                data_shape,
+                i_strides,
+                o_strides,
+                i_offset,
+                o_offset,
+                dynamic_i_offset,
+                dynamic_o_offset);
+            break;
+          case CopyType::Scalar:
+          case CopyType::Vector:
+            copy_inplace_dispatch(src, dst, ctype);
+        }
+      });
+}
+
+array contiguous_copy_cpu(const array& arr, Stream stream) {
+  array arr_copy(arr.shape(), arr.dtype(), nullptr, {});
+  copy_cpu(arr, arr_copy, CopyType::General, stream);
+  return arr_copy;
 }

 } // namespace mlx::core
--- a/mlx/backend/cpu/copy.h
+++ b/mlx/backend/cpu/copy.h
@@ -2,16 +2,22 @@

 #pragma once

+#include <optional>
+
 #include "mlx/array.h"
 #include "mlx/backend/common/copy.h"
 #include "mlx/backend/common/utils.h"

 namespace mlx::core {

-void copy(const array& src, array& dst, CopyType ctype);
-void copy_inplace(const array& src, array& dst, CopyType ctype);
+void copy_cpu(const array& src, array& dst, CopyType ctype, Stream stream);
+void copy_cpu_inplace(
+    const array& src,
+    array& dst,
+    CopyType ctype,
+    Stream stream);

-void copy_inplace(
+void copy_cpu_inplace(
    const array& src,
    array& dst,
    const Shape& data_shape,
@@ -19,6 +25,12 @@ void copy_inplace(
    const Strides& o_strides,
    int64_t i_offset,
    int64_t o_offset,
-    CopyType ctype);
+    CopyType ctype,
+    Stream stream,
+    const std::optional<array>& dynamic_i_offset = std::nullopt,
+    const std::optional<array>& dynamic_o_offset = std::nullopt);
+
+// Return a contiguous array with same shape that copies the data of |arr|.
+array contiguous_copy_cpu(const array& arr, Stream stream);

 } // namespace mlx::core
--- a/mlx/backend/cpu/distributed.cpp
+++ b/mlx/backend/cpu/distributed.cpp
@@ -0,0 +1,98 @@
+// Copyright © 2024 Apple Inc.
+
+#include <cassert>
+
+#include "mlx/allocator.h"
+#include "mlx/backend/cpu/copy.h"
+#include "mlx/backend/cpu/encoder.h"
+#include "mlx/distributed/primitives.h"
+
+namespace mlx::core::distributed {
+
+// Returns {array, copied}: `arr` itself if row contiguous, else a fresh copy.
+std::pair<array, bool> ensure_row_contiguous(const array& arr, Stream stream) {
+  if (arr.flags().row_contiguous) {
+    return {arr, false};
+  }
+  return {contiguous_copy_cpu(arr, stream), true};
+}
+
+void AllReduce::eval_cpu(
+    const std::vector<array>& inputs,
+    std::vector<array>& outputs) {
+  assert(inputs.size() == 1);
+  assert(outputs.size() == 1);
+  // Give `out` a buffer and return the row-contiguous array to reduce from.
+  auto donate_or_copy = [s = stream()](const array& in, array& out) {
+    if (in.flags().row_contiguous) {
+      if (in.is_donatable()) {
+        out.copy_shared_buffer(in);
+      } else {
+        out.set_data(allocator::malloc(out.nbytes()));
+      }
+      return in;
+    } else {
+      array arr_copy = contiguous_copy_cpu(in, s);
+      out.copy_shared_buffer(arr_copy);
+      return arr_copy;
+    }
+  };
+
+  auto in = donate_or_copy(inputs[0], outputs[0]);
+  switch (reduce_type_) {
+    case Sum:
+      distributed::detail::all_sum(group(), in, outputs[0], stream());
+      break;
+    case Max:
+      distributed::detail::all_max(group(), in, outputs[0], stream());
+      break;
+    case Min:
+      distributed::detail::all_min(group(), in, outputs[0], stream());
+      break;
+    default:
+      throw std::runtime_error(
+          "Only all reduce sum, min and max are supported for now");
+  }
+}
+
+void AllGather::eval_cpu(
+    const std::vector<array>& inputs,
+    std::vector<array>& outputs) {
+  assert(inputs.size() == 1);
+  assert(outputs.size() == 1);
+  // Gather from a row-contiguous input; a copy is registered as a temporary.
+  auto [in, copied] = ensure_row_contiguous(inputs[0], stream());
+  outputs[0].set_data(allocator::malloc(outputs[0].nbytes()));
+  distributed::detail::all_gather(group(), in, outputs[0], stream());
+  if (copied) {
+    auto& enc = cpu::get_command_encoder(stream());
+    enc.add_temporary(in);
+  }
+}
+
+void Send::eval_cpu(
+    const std::vector<array>& inputs,
+    std::vector<array>& outputs) {
+  assert(inputs.size() == 1);
+  assert(outputs.size() == 1);
+  // Send a row-contiguous input to dst_; the output shares the input's buffer.
+  auto [in, copied] = ensure_row_contiguous(inputs[0], stream());
+  distributed::detail::send(group(), in, dst_, stream());
+  outputs[0].copy_shared_buffer(inputs[0]);
+  if (copied) {
+    auto& enc = cpu::get_command_encoder(stream());
+    enc.add_temporary(in);
+  }
+}
+
+void Recv::eval_cpu(
+    const std::vector<array>& inputs,
+    std::vector<array>& outputs) {
+  assert(inputs.size() == 0);
+  assert(outputs.size() == 1);
+  // Allocate the output buffer, then receive into it from src_.
+  outputs[0].set_data(allocator::malloc(outputs[0].nbytes()));
+  distributed::detail::recv(group(), outputs[0], src_, stream());
+}
+
+} // namespace mlx::core::distributed
--- a/mlx/backend/cpu/eig.cpp
+++ b/mlx/backend/cpu/eig.cpp
@@ -0,0 +1,174 @@
+// Copyright © 2025 Apple Inc.
+
+#include "mlx/allocator.h"
+#include "mlx/array.h"
+#include "mlx/backend/cpu/copy.h"
+#include "mlx/backend/cpu/encoder.h"
+#include "mlx/backend/cpu/lapack.h"
+#include "mlx/linalg.h"
+#include "mlx/primitives.h"
+
+namespace mlx::core {
+
+namespace {
+
+// Encode a batched LAPACK geev eigendecomposition of `a` on `stream`.
+template <typename T>
+void eig_impl(
+    array& a,
+    array& vectors,
+    array& values,
+    bool compute_eigenvectors,
+    Stream stream) {
+  using OT = std::complex<T>;
+  auto a_ptr = a.data<T>();
+  auto eig_ptr = values.data<OT>();
+
+  auto& encoder = cpu::get_command_encoder(stream);
+  encoder.set_input_array(a);
+  encoder.set_output_array(values);
+  OT* vec_ptr = nullptr;
+  if (compute_eigenvectors) {
+    encoder.set_output_array(vectors);
+    vec_ptr = vectors.data<OT>();
+  }
+  encoder.dispatch([a_ptr,
+                    vec_ptr,
+                    eig_ptr,
+                    compute_eigenvectors,
+                    N = vectors.shape(-1),
+                    size = vectors.size()]() mutable {
+    // Work query: lwork == -1 makes geev report the workspace size in work
+    char jobr = 'N';
+    char jobl = compute_eigenvectors ? 'V' : 'N';
+    int n_vecs_r = 1;
+    int n_vecs_l = compute_eigenvectors ? N : 1;
+    int lwork = -1;
+    int info;
+    {
+      T work;
+      geev<T>(
+          &jobl,
+          &jobr,
+          &N,
+          nullptr,
+          &N,
+          nullptr,
+          nullptr,
+          nullptr,
+          &n_vecs_l,
+          nullptr,
+          &n_vecs_r,
+          &work,
+          &lwork,
+          &info);
+      lwork = static_cast<int>(work);
+    }
+    auto eig_tmp_data = array::Data{allocator::malloc(sizeof(T) * N * 2)};
+    auto vec_tmp_data =
+        array::Data{allocator::malloc(vec_ptr ? sizeof(T) * N * N * 2 : 0)};
+    auto eig_tmp = static_cast<T*>(eig_tmp_data.buffer.raw_ptr());
+    auto vec_tmp = static_cast<T*>(vec_tmp_data.buffer.raw_ptr());
+    auto work_buf = array::Data{allocator::malloc(sizeof(T) * lwork)};
+    for (size_t b = 0; b < size / (N * N); ++b) {
+      geev<T>(
+          &jobl,
+          &jobr,
+          &N,
+          a_ptr,
+          &N,
+          eig_tmp,
+          eig_tmp + N,
+          vec_tmp,
+          &n_vecs_l,
+          nullptr,
+          &n_vecs_r,
+          static_cast<T*>(work_buf.buffer.raw_ptr()),
+          &lwork,
+          &info);
+      // Stop before reading geev's outputs, which are not valid on failure.
+      if (info != 0) {
+        std::stringstream msg;
+        msg << "[Eig::eval_cpu] Eigenvalue decomposition failed with error code "
+            << info;
+        throw std::runtime_error(msg.str());
+      }
+      for (int i = 0; i < N; ++i) {
+        eig_ptr[i] = {eig_tmp[i], eig_tmp[N + i]};
+      }
+      if (vec_ptr) {
+        for (int i = 0; i < N; ++i) {
+          if (eig_ptr[i].imag() != 0) {
+            // This vector and the next are a pair
+            for (int j = 0; j < N; ++j) {
+              vec_ptr[i * N + j] = {
+                  vec_tmp[i * N + j], -vec_tmp[(i + 1) * N + j]};
+              vec_ptr[(i + 1) * N + j] = {
+                  vec_tmp[i * N + j], vec_tmp[(i + 1) * N + j]};
+            }
+            i += 1;
+          } else {
+            for (int j = 0; j < N; ++j) {
+              vec_ptr[i * N + j] = {vec_tmp[i * N + j], 0};
+            }
+          }
+        }
+        vec_ptr += N * N;
+      }
+      a_ptr += N * N;
+      eig_ptr += N;
+    }
+  });
+  encoder.add_temporary(a);
+}
+
+} // namespace
+
+void Eig::eval_cpu(
+    const std::vector<array>& inputs,
+    std::vector<array>& outputs) {
+  const auto& a = inputs[0];
+  auto& values = outputs[0];
+  // Without eigenvectors, `vectors` is a stand-in that is never given data.
+  auto vectors = compute_eigenvectors_
+      ? outputs[1]
+      : array(a.shape(), complex64, nullptr, {});
+  // Scratch copy of `a`; eig_impl passes it to geev and marks it temporary.
+  auto a_copy = array(a.shape(), a.dtype(), nullptr, {});
+  copy_cpu(
+      a,
+      a_copy,
+      a.flags().row_contiguous ? CopyType::Vector : CopyType::General,
+      stream());
+
+  values.set_data(allocator::malloc(values.nbytes()));
+
+  if (compute_eigenvectors_) {
+    // Set the strides and flags so the eigenvectors
+    // are in the columns of the output
+    auto flags = vectors.flags();
+    auto strides = vectors.strides();
+    auto ndim = a.ndim();
+    std::swap(strides[ndim - 1], strides[ndim - 2]);
+
+    if (a.size() > 1) {
+      flags.row_contiguous = false;
+      if (ndim > 2) {
+        flags.col_contiguous = false;
+      } else {
+        flags.col_contiguous = true;
+      }
+    }
+    vectors.set_data(
+        allocator::malloc(vectors.nbytes()), vectors.size(), strides, flags);
+  }
+  switch (a.dtype()) {
+    case float32:
+      eig_impl<float>(a_copy, vectors, values, compute_eigenvectors_, stream());
+      break;
+    default:
+      throw std::runtime_error("[Eig::eval_cpu] only supports float32.");
+  }
+}
+
+} // namespace mlx::core
--- a/mlx/backend/cpu/eigh.cpp
+++ b/mlx/backend/cpu/eigh.cpp
@@ -3,6 +3,7 @@
 #include "mlx/allocator.h"
 #include "mlx/array.h"
 #include "mlx/backend/cpu/copy.h"
+#include "mlx/backend/cpu/encoder.h"
 #include "mlx/backend/cpu/lapack.h"
 #include "mlx/linalg.h"
 #include "mlx/primitives.h"
@@ -11,28 +12,30 @@ namespace mlx::core {

 namespace {

+template <typename T, class Enable = void>
+struct EighWork {};
+
 template <typename T>
-void eigh_impl(
-    array& vectors,
-    array& values,
-    const std::string& uplo,
-    bool compute_eigenvectors) {
-  auto vec_ptr = vectors.data<T>();
-  auto eig_ptr = values.data<T>();
+struct EighWork<
+    T,
+    typename std::enable_if<std::is_floating_point<T>::value>::type> {
+  using R = T;

-  char jobz = compute_eigenvectors ? 'V' : 'N';
-  auto N = vectors.shape(-1);
-
-  // Work query
-  int lwork = -1;
-  int liwork = -1;
+  char jobz;
+  char uplo;
+  int N;
+  int lwork;
+  int liwork;
  int info;
-  {
+  std::vector<array::Data> buffers;
+
+  EighWork(char jobz_, char uplo_, int N_)
+      : jobz(jobz_), uplo(uplo_), N(N_), lwork(-1), liwork(-1) {
    T work;
    int iwork;
    syevd<T>(
        &jobz,
-        uplo.c_str(),
+        &uplo,
        &N,
        nullptr,
        &N,
@@ -44,32 +47,139 @@ void eigh_impl(
        &info);
    lwork = static_cast<int>(work);
    liwork = iwork;
+    buffers.emplace_back(allocator::malloc(sizeof(T) * lwork));
+    buffers.emplace_back(allocator::malloc(sizeof(int) * liwork));
  }

-  auto work_buf = array::Data{allocator::malloc_or_wait(sizeof(T) * lwork)};
-  auto iwork_buf = array::Data{allocator::malloc_or_wait(sizeof(int) * liwork)};
-  for (size_t i = 0; i < vectors.size() / (N * N); ++i) {
+  void run(T* vectors, T* values) {
    syevd<T>(
        &jobz,
-        uplo.c_str(),
+        &uplo,
        &N,
-        vec_ptr,
+        vectors,
        &N,
-        eig_ptr,
-        static_cast<T*>(work_buf.buffer.raw_ptr()),
+        values,
+        static_cast<T*>(buffers[0].buffer.raw_ptr()),
        &lwork,
-        static_cast<int*>(iwork_buf.buffer.raw_ptr()),
+        static_cast<int*>(buffers[1].buffer.raw_ptr()),
        &liwork,
        &info);
-    vec_ptr += N * N;
-    eig_ptr += N;
-    if (info != 0) {
-      std::stringstream msg;
-      msg << "[Eigh::eval_cpu] Eigenvalue decomposition failed with error code "
-          << info;
-      throw std::runtime_error(msg.str());
+  }
+};
+
+template <>
+struct EighWork<std::complex<float>> {
+  using T = std::complex<float>;
+  using R = float;
+
+  char jobz;
+  char uplo;
+  int N;
+  int lwork;
+  int lrwork;
+  int liwork;
+  int info;
+  std::vector<array::Data> buffers;
+
+  EighWork(char jobz_, char uplo_, int N_)
+      : jobz(jobz_), uplo(uplo_), N(N_), lwork(-1), lrwork(-1), liwork(-1) {
+    T work;
+    R rwork;
+    int iwork;
+    heevd<T>(
+        &jobz,
+        &uplo,
+        &N,
+        nullptr,
+        &N,
+        nullptr,
+        &work,
+        &lwork,
+        &rwork,
+        &lrwork,
+        &iwork,
+        &liwork,
+        &info);
+    lwork = static_cast<int>(work.real());
+    lrwork = static_cast<int>(rwork);
+    liwork = iwork;
+    buffers.emplace_back(allocator::malloc(sizeof(T) * lwork));
+    buffers.emplace_back(allocator::malloc(sizeof(R) * lrwork));
+    buffers.emplace_back(allocator::malloc(sizeof(int) * liwork));
+  }
+
+  void run(T* vectors, R* values) {
+    heevd<T>(
+        &jobz,
+        &uplo,
+        &N,
+        vectors,
+        &N,
+        values,
+        static_cast<T*>(buffers[0].buffer.raw_ptr()),
+        &lwork,
+        static_cast<R*>(buffers[1].buffer.raw_ptr()),
+        &lrwork,
+        static_cast<int*>(buffers[2].buffer.raw_ptr()),
+        &liwork,
+        &info);
+    if (jobz == 'V') {
+      // We have pre-transposed the vectors but we also must conjugate them
+      // when they are complex.
+      //
+      // We could vectorize this but it is so fast in comparison to heevd that
+      // it doesn't really matter.
+      for (int i = 0; i < N; i++) {
+        for (int j = 0; j < N; j++) {
+          *vectors = std::conj(*vectors);
+          vectors++;
+        }
+      }
    }
  }
+};
+
+template <typename T>
+void eigh_impl(
+    array& vectors,
+    array& values,
+    const std::string& uplo,
+    bool compute_eigenvectors,
+    Stream stream) {
+  using R = typename EighWork<T>::R;
+
+  auto vec_ptr = vectors.data<T>();
+  auto eig_ptr = values.data<R>();
+  char jobz = compute_eigenvectors ? 'V' : 'N';
+
+  auto& encoder = cpu::get_command_encoder(stream);
+  encoder.set_output_array(vectors);
+  encoder.set_output_array(values);
+  encoder.dispatch([vec_ptr,
+                    eig_ptr,
+                    jobz,
+                    uplo = uplo[0],
+                    N = vectors.shape(-1),
+                    size = vectors.size()]() mutable {
+    // Work query
+    EighWork<T> work(jobz, uplo, N);
+
+    // Work loop
+    for (size_t i = 0; i < size / (N * N); ++i) {
+      work.run(vec_ptr, eig_ptr);
+      vec_ptr += N * N;
+      eig_ptr += N;
+      if (work.info != 0) {
+        std::stringstream msg;
+        msg << "[Eigh::eval_cpu] Eigenvalue decomposition failed with error code "
+            << work.info;
+        throw std::runtime_error(msg.str());
+      }
+    }
+  });
+  if (!compute_eigenvectors) {
+    encoder.add_temporary(vectors);
+  }
 }

 } // namespace
@@ -84,12 +194,13 @@ void Eigh::eval_cpu(
      ? outputs[1]
      : array(a.shape(), a.dtype(), nullptr, {});

-  values.set_data(allocator::malloc_or_wait(values.nbytes()));
+  values.set_data(allocator::malloc(values.nbytes()));

-  copy(
+  copy_cpu(
      a,
      vectors,
-      a.flags().row_contiguous ? CopyType::Vector : CopyType::General);
+      a.flags().row_contiguous ? CopyType::Vector : CopyType::General,
+      stream());

  if (compute_eigenvectors_) {
    // Set the strides and flags so the eigenvectors
@@ -107,14 +218,19 @@ void Eigh::eval_cpu(
        flags.col_contiguous = true;
      }
    }
-    vectors.move_shared_buffer(vectors, strides, flags, vectors.data_size());
+    vectors.copy_shared_buffer(vectors, strides, flags, vectors.data_size());
  }
  switch (a.dtype()) {
    case float32:
-      eigh_impl<float>(vectors, values, uplo_, compute_eigenvectors_);
+      eigh_impl<float>(vectors, values, uplo_, compute_eigenvectors_, stream());
      break;
    case float64:
-      eigh_impl<double>(vectors, values, uplo_, compute_eigenvectors_);
+      eigh_impl<double>(
+          vectors, values, uplo_, compute_eigenvectors_, stream());
+      break;
+    case complex64:
+      eigh_impl<std::complex<float>>(
+          vectors, values, uplo_, compute_eigenvectors_, stream());
      break;
    default:
      throw std::runtime_error(
--- a/mlx/backend/cpu/encoder.cpp
+++ b/mlx/backend/cpu/encoder.cpp
@@ -0,0 +1,16 @@
+// Copyright © 2025 Apple Inc.
+
+#include "mlx/backend/cpu/encoder.h"
+
+namespace mlx::core::cpu {
+
+// Return the CPU command encoder for `stream`, creating it on first use.
+// Encoders live in a function-local static map keyed by `stream.index`, so
+// repeated calls with the same index return the same object.
+CommandEncoder& get_command_encoder(Stream stream) {
+  static std::unordered_map<int, CommandEncoder> encoders;
+  // try_emplace constructs a new encoder in place only when the key is
+  // absent; otherwise it yields the existing entry.
+  return encoders.try_emplace(stream.index, stream).first->second;
+}
+
+} // namespace mlx::core::cpu
--- a/mlx/backend/cpu/encoder.h
+++ b/mlx/backend/cpu/encoder.h
@@ -0,0 +1,67 @@
+// Copyright © 2025 Apple Inc.
+
+#pragma once
+
+#include <unordered_map>
+
+#include "mlx/array.h"
+#include "mlx/scheduler.h"
+
+namespace mlx::core::cpu {
+
+// Number of dispatches per scheduler task
+constexpr int DISPATCHES_PER_TASK = 10;
+
+// Per-stream helper used by the CPU backend to enqueue work on the
+// scheduler and to keep temporary arrays alive. Instances are neither
+// copyable nor movable.
+struct CommandEncoder {
+  CommandEncoder(Stream stream) : stream_(stream) {}
+
+  CommandEncoder(const CommandEncoder&) = delete;
+  CommandEncoder& operator=(const CommandEncoder&) = delete;
+  CommandEncoder(CommandEncoder&&) = delete;
+  CommandEncoder& operator=(CommandEncoder&&) = delete;
+
+  // Both setters currently have empty bodies: calling them has no effect.
+  void set_input_array(const array& a) {}
+  void set_output_array(array& a) {}
+
+  // Hold onto a temporary until any already scheduled tasks which use it as
+  // an input are complete.
+  void add_temporary(array arr) {
+    temporaries_.push_back(std::move(arr));
+  }
+
+  // Same as add_temporary, but moves every element of `arrays` onto the end
+  // of the temporaries list.
+  void add_temporaries(std::vector<array> arrays) {
+    temporaries_.insert(
+        temporaries_.end(),
+        std::make_move_iterator(arrays.begin()),
+        std::make_move_iterator(arrays.end()));
+  }
+
+  // Mutable access to the held temporaries; callers may move the vector out.
+  std::vector<array>& temporaries() {
+    return temporaries_;
+  }
+
+  // Bind `f` to `args` and enqueue the result on this encoder's stream.
+  //
+  // A counter is advanced modulo DISPATCHES_PER_TASK on every call. When it
+  // wraps to 0 the task is bracketed by scheduler::notify_new_task (called
+  // here, before enqueueing) and scheduler::notify_task_completion (called
+  // after the task body runs). All other dispatches are enqueued as-is.
+  template <class F, class... Args>
+  void dispatch(F&& f, Args&&... args) {
+    num_ops_ = (num_ops_ + 1) % DISPATCHES_PER_TASK;
+    auto task = std::bind(std::forward<F>(f), std::forward<Args>(args)...);
+    if (num_ops_ == 0) {
+      scheduler::notify_new_task(stream_);
+      auto task_wrap = [s = stream_, task = std::move(task)]() mutable {
+        task();
+        scheduler::notify_task_completion(s);
+      };
+      scheduler::enqueue(stream_, std::move(task_wrap));
+    } else {
+      scheduler::enqueue(stream_, std::move(task));
+    }
+  }
+
+ private:
+  // Stream that every dispatched task is enqueued on.
+  Stream stream_;
+  // Arrays kept alive on behalf of scheduled work.
+  std::vector<array> temporaries_;
+  // Dispatch counter, always in [0, DISPATCHES_PER_TASK).
+  int num_ops_{0};
+};
+
+CommandEncoder& get_command_encoder(Stream stream);
+
+} // namespace mlx::core::cpu
--- a/mlx/backend/cpu/eval.cpp
+++ b/mlx/backend/cpu/eval.cpp
@@ -0,0 +1,40 @@
+// Copyright © 2025 Apple Inc.
+#include "mlx/backend/cpu/eval.h"
+#include "mlx/backend/cpu/encoder.h"
+#include "mlx/primitives.h"
+#include "mlx/scheduler.h"
+#include "mlx/utils.h"
+
+namespace mlx::core::cpu {
+
+// Evaluate `arr` on the CPU.
+//
+// Runs the primitive's eval_cpu, then enqueues an empty task on the same
+// stream that owns the data buffers of the inputs and siblings together with
+// the encoder's current temporaries. Those objects are therefore released
+// only once that task is destroyed by the scheduler.
+void eval(array& arr) {
+  auto task_stream = arr.primitive().stream();
+
+  auto outputs = arr.outputs();
+  {
+    // If the array is a tracer hold a reference
+    // to its inputs so they don't get donated
+    std::vector<array> held_inputs;
+    if (arr.is_tracer()) {
+      held_inputs = arr.inputs();
+    }
+    arr.primitive().eval_cpu(arr.inputs(), outputs);
+  }
+
+  // Gather the data buffers of every input and sibling.
+  std::unordered_set<std::shared_ptr<array::Data>> buffers;
+  for (auto& input : arr.inputs()) {
+    buffers.insert(input.data_shared_ptr());
+  }
+  for (auto& sibling : arr.siblings()) {
+    buffers.insert(sibling.data_shared_ptr());
+  }
+  // Remove the output if it was donated to by an input
+  buffers.erase(arr.data_shared_ptr());
+
+  auto& encoder = cpu::get_command_encoder(task_stream);
+  encoder.dispatch([buffers = std::move(buffers),
+                    temps = std::move(encoder.temporaries())]() {});
+}
+
+} // namespace mlx::core::cpu
--- a/mlx/backend/cpu/eval.h
+++ b/mlx/backend/cpu/eval.h
@@ -0,0 +1,12 @@
+// Copyright © 2025 Apple Inc.
+
+#pragma once
+
+#include "mlx/array.h"
+#include "mlx/stream.h"
+
+namespace mlx::core::cpu {
+
+void eval(array& arr);
+
+} // namespace mlx::core::cpu
--- a/mlx/backend/cpu/fft.cpp
+++ b/mlx/backend/cpu/fft.cpp
@@ -4,6 +4,7 @@

 #include "mlx/3rdparty/pocketfft.h"
 #include "mlx/allocator.h"
+#include "mlx/backend/cpu/encoder.h"
 #include "mlx/primitives.h"

 namespace mlx::core {
@@ -21,7 +22,7 @@ void FFT::eval_cpu(const std::vector<array>& inputs, array& out) {
    s *= out.itemsize();
  }

-  out.set_data(allocator::malloc_or_wait(out.nbytes()));
+  out.set_data(allocator::malloc(out.nbytes()));

  std::vector<size_t> shape;
  if (out.dtype() == float32) {
@@ -38,46 +39,78 @@ void FFT::eval_cpu(const std::vector<array>& inputs, array& out) {
        });
    scale /= nelem;
  }
+
+  auto& encoder = cpu::get_command_encoder(stream());
+  encoder.set_input_array(in);
+  encoder.set_output_array(out);
+
  if (in.dtype() == complex64 && out.dtype() == complex64) {
    auto in_ptr =
        reinterpret_cast<const std::complex<float>*>(in.data<complex64_t>());
    auto out_ptr =
        reinterpret_cast<std::complex<float>*>(out.data<complex64_t>());
-    pocketfft::c2c(
-        shape,
-        strides_in,
-        strides_out,
-        axes_,
-        !inverse_,
-        in_ptr,
-        out_ptr,
-        scale);
+    encoder.dispatch([shape = std::move(shape),
+                      strides_in = std::move(strides_in),
+                      strides_out = std::move(strides_out),
+                      axes = axes_,
+                      inverse = inverse_,
+                      in_ptr,
+                      out_ptr,
+                      scale]() {
+      pocketfft::c2c(
+          shape,
+          strides_in,
+          strides_out,
+          axes,
+          !inverse,
+          in_ptr,
+          out_ptr,
+          scale);
+    });
  } else if (in.dtype() == float32 && out.dtype() == complex64) {
    auto in_ptr = in.data<float>();
    auto out_ptr =
        reinterpret_cast<std::complex<float>*>(out.data<complex64_t>());
-    pocketfft::r2c(
-        shape,
-        strides_in,
-        strides_out,
-        axes_,
-        !inverse_,
-        in_ptr,
-        out_ptr,
-        scale);
+    encoder.dispatch([shape = std::move(shape),
+                      strides_in = std::move(strides_in),
+                      strides_out = std::move(strides_out),
+                      axes = axes_,
+                      inverse = inverse_,
+                      in_ptr,
+                      out_ptr,
+                      scale]() {
+      pocketfft::r2c(
+          shape,
+          strides_in,
+          strides_out,
+          axes,
+          !inverse,
+          in_ptr,
+          out_ptr,
+          scale);
+    });
  } else if (in.dtype() == complex64 && out.dtype() == float32) {
    auto in_ptr =
        reinterpret_cast<const std::complex<float>*>(in.data<complex64_t>());
    auto out_ptr = out.data<float>();
-    pocketfft::c2r(
-        shape,
-        strides_in,
-        strides_out,
-        axes_,
-        !inverse_,
-        in_ptr,
-        out_ptr,
-        scale);
+    encoder.dispatch([shape = std::move(shape),
+                      strides_in = std::move(strides_in),
+                      strides_out = std::move(strides_out),
+                      axes = axes_,
+                      inverse = inverse_,
+                      in_ptr,
+                      out_ptr,
+                      scale]() {
+      pocketfft::c2r(
+          shape,
+          strides_in,
+          strides_out,
+          axes,
+          !inverse,
+          in_ptr,
+          out_ptr,
+          scale);
+    });
  } else {
    throw std::runtime_error(
        "[FFT] Received unexpected input and output type combination.");
--- a/mlx/backend/cpu/gemm.h
+++ b/mlx/backend/cpu/gemm.h
@@ -7,14 +7,20 @@ namespace mlx::core {

+// Batched matrix multiplication on raw pointers, specialized per element
+// type in the gemms/ sources.
+//
+// a, b / out: operand and result data. The specializations in this change
+//   read M and K from the last two entries of a_shape and N from the last
+//   entry of b_shape, and write batch i of the result at out + M * N * i.
+// a_transposed, b_transposed: whether each operand is stored transposed.
+// lda, ldb, ldc: leading dimensions (forwarded to cblas by the float and
+//   double specializations).
+// alpha, beta: scaling factors for the product and for the existing
+//   contents of out.
+// batch_size: number of matrix products to compute.
+// a_shape/a_strides, b_shape/b_strides: used with elem_to_loc to find the
+//   start of each batch element of a and b.
 template <typename T>
 void matmul(
-    const array& a,
-    const array& b,
-    array& out,
+    const T* a,
+    const T* b,
+    T* out,
     bool a_transposed,
     bool b_transposed,
     size_t lda,
     size_t ldb,
+    size_t ldc,
     float alpha,
-    float beta);
+    float beta,
+    size_t batch_size,
+    const Shape& a_shape,
+    const Strides& a_strides,
+    const Shape& b_shape,
+    const Strides& b_strides);

 } // namespace mlx::core
--- a/mlx/backend/cpu/gemms/bnns.cpp
+++ b/mlx/backend/cpu/gemms/bnns.cpp
@@ -9,39 +9,46 @@

 namespace mlx::core {

-BNNSDataType to_bnns_dtype(Dtype mlx_dtype) {
-  uint32_t size_bits = size_of(mlx_dtype) * 8;
-  switch (kindof(mlx_dtype)) {
-    case Dtype::Kind::b:
-      return BNNSDataTypeBoolean;
-    case Dtype::Kind::u:
-      return BNNSDataType(BNNSDataTypeUIntBit | size_bits);
-    case Dtype::Kind::i:
-      return BNNSDataType(BNNSDataTypeIntBit | size_bits);
-    case Dtype::Kind::f:
-      return BNNSDataType(BNNSDataTypeFloatBit | size_bits);
-    case Dtype::Kind::V:
-      return BNNSDataTypeBFloat16;
-    case Dtype::Kind::c:
-      throw std::invalid_argument("BNNS does not support complex types");
-  }
+// Compile-time mapping from an element type to its BNNSDataType. Only the
+// specializations below are defined (float, float16_t, bfloat16_t); the
+// primary template is declared but has no definition here.
+template <typename T>
+constexpr BNNSDataType to_bnns_dtype();
+
+// float: floating-point bit combined with a width of 32.
+template <>
+constexpr BNNSDataType to_bnns_dtype<float>() {
+  return BNNSDataType(BNNSDataTypeFloatBit | 32);
+}
+// float16_t: floating-point bit combined with a width of 16.
+template <>
+constexpr BNNSDataType to_bnns_dtype<float16_t>() {
+  return BNNSDataType(BNNSDataTypeFloatBit | 16);
 }

+// bfloat16_t: uses the dedicated BNNSDataTypeBFloat16 value.
+template <>
+constexpr BNNSDataType to_bnns_dtype<bfloat16_t>() {
+  return BNNSDataTypeBFloat16;
+}
+
+template <typename T>
 void matmul_bnns(
-    const array& a,
-    const array& b,
-    array& out,
+    const T* a,
+    const T* b,
+    T* out,
    bool a_transposed,
    bool b_transposed,
    size_t lda,
    size_t ldb,
+    size_t ldc,
    float alpha,
-    float beta) {
-  size_t M = a.shape(-2);
-  size_t N = b.shape(-1);
-  size_t K = a.shape(-1);
+    float beta,
+    size_t batch_size,
+    const Shape& a_shape,
+    const Strides& a_strides,
+    const Shape& b_shape,
+    const Strides& b_strides) {
+  auto ndim = a_shape.size();
+  size_t M = a_shape[ndim - 2];
+  size_t N = b_shape[ndim - 1];
+  size_t K = a_shape[ndim - 1];

-  BNNSDataType bnns_dtype = to_bnns_dtype(out.dtype());
+  BNNSDataType bnns_dtype = to_bnns_dtype<T>();

 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
@@ -115,14 +122,14 @@ void matmul_bnns(
  auto bnns_filter =
      BNNSFilterCreateLayerBroadcastMatMul(&gemm_params, nullptr);

-  for (int i = 0; i < (a.size() / (M * K)); ++i) {
+  for (int i = 0; i < batch_size; ++i) {
    BNNSFilterApplyTwoInput(
        bnns_filter,
-        a.data<uint8_t>() +
-            elem_to_loc(M * K * i, a.shape(), a.strides()) * a.itemsize(),
-        b.data<uint8_t>() +
-            elem_to_loc(K * N * i, b.shape(), b.strides()) * b.itemsize(),
-        out.data<uint8_t>() + M * N * i * out.itemsize());
+        reinterpret_cast<const uint8_t*>(
+            a + elem_to_loc(M * K * i, a_shape, a_strides)),
+        reinterpret_cast<const uint8_t*>(
+            b + elem_to_loc(K * N * i, b_shape, b_strides)),
+        reinterpret_cast<uint8_t*>(out + M * N * i));
  }

  BNNSFilterDestroy(bnns_filter);
@@ -131,30 +138,72 @@ void matmul_bnns(

+// float16 specialization of matmul: forwards every argument unchanged to
+// matmul_bnns.
 template <>
 void matmul<float16_t>(
-    const array& a,
-    const array& b,
-    array& out,
+    const float16_t* a,
+    const float16_t* b,
+    float16_t* out,
     bool a_transposed,
     bool b_transposed,
     size_t lda,
     size_t ldb,
+    size_t ldc,
     float alpha,
-    float beta) {
-  matmul_bnns(a, b, out, a_transposed, b_transposed, lda, ldb, alpha, beta);
+    float beta,
+    size_t batch_size,
+    const Shape& a_shape,
+    const Strides& a_strides,
+    const Shape& b_shape,
+    const Strides& b_strides) {
+  matmul_bnns(
+      a,
+      b,
+      out,
+      a_transposed,
+      b_transposed,
+      lda,
+      ldb,
+      ldc,
+      alpha,
+      beta,
+      batch_size,
+      a_shape,
+      a_strides,
+      b_shape,
+      b_strides);
 }

+// bfloat16 specialization of matmul: forwards every argument unchanged to
+// matmul_bnns.
 template <>
 void matmul<bfloat16_t>(
-    const array& a,
-    const array& b,
-    array& out,
+    const bfloat16_t* a,
+    const bfloat16_t* b,
+    bfloat16_t* out,
     bool a_transposed,
     bool b_transposed,
     size_t lda,
     size_t ldb,
+    size_t ldc,
     float alpha,
-    float beta) {
-  matmul_bnns(a, b, out, a_transposed, b_transposed, lda, ldb, alpha, beta);
+    float beta,
+    size_t batch_size,
+    const Shape& a_shape,
+    const Strides& a_strides,
+    const Shape& b_shape,
+    const Strides& b_strides) {
+  matmul_bnns(
+      a,
+      b,
+      out,
+      a_transposed,
+      b_transposed,
+      lda,
+      ldb,
+      ldc,
+      alpha,
+      beta,
+      batch_size,
+      a_shape,
+      a_strides,
+      b_shape,
+      b_strides);
 }

 } // namespace mlx::core
--- a/mlx/backend/cpu/gemms/cblas.cpp
+++ b/mlx/backend/cpu/gemms/cblas.cpp
@@ -8,20 +8,27 @@ namespace mlx::core {

 template <>
 void matmul<float>(
-    const array& a,
-    const array& b,
-    array& out,
+    const float* a,
+    const float* b,
+    float* out,
    bool a_transposed,
    bool b_transposed,
    size_t lda,
    size_t ldb,
+    size_t ldc,
    float alpha,
-    float beta) {
-  size_t M = a.shape(-2);
-  size_t N = b.shape(-1);
-  size_t K = a.shape(-1);
+    float beta,
+    size_t batch_size,
+    const Shape& a_shape,
+    const Strides& a_strides,
+    const Shape& b_shape,
+    const Strides& b_strides) {
+  auto ndim = a_shape.size();
+  size_t M = a_shape[ndim - 2];
+  size_t N = b_shape[ndim - 1];
+  size_t K = a_shape[ndim - 1];

-  for (int i = 0; i < (a.size() / (M * K)); ++i) {
+  for (int i = 0; i < batch_size; ++i) {
    cblas_sgemm(
        CblasRowMajor,
        a_transposed ? CblasTrans : CblasNoTrans, // transA
@@ -29,34 +36,40 @@ void matmul<float>(
        M,
        N,
        K,
-        alpha, // alpha
-        a.data<float>() + elem_to_loc(M * K * i, a.shape(), a.strides()),
+        alpha,
+        a + elem_to_loc(M * K * i, a_shape, a_strides),
        lda,
-        b.data<float>() + elem_to_loc(K * N * i, b.shape(), b.strides()),
+        b + elem_to_loc(K * N * i, b_shape, b_strides),
        ldb,
-        beta, // beta
-        out.data<float>() + M * N * i,
-        out.shape(-1) // ldc
-    );
+        beta,
+        out + M * N * i,
+        ldc);
  }
 }

 template <>
 void matmul<double>(
-    const array& a,
-    const array& b,
-    array& out,
+    const double* a,
+    const double* b,
+    double* out,
    bool a_transposed,
    bool b_transposed,
    size_t lda,
    size_t ldb,
+    size_t ldc,
    float alpha,
-    float beta) {
-  size_t M = a.shape(-2);
-  size_t N = b.shape(-1);
-  size_t K = a.shape(-1);
+    float beta,
+    size_t batch_size,
+    const Shape& a_shape,
+    const Strides& a_strides,
+    const Shape& b_shape,
+    const Strides& b_strides) {
+  auto ndim = a_shape.size();
+  size_t M = a_shape[ndim - 2];
+  size_t N = b_shape[ndim - 1];
+  size_t K = a_shape[ndim - 1];

-  for (int i = 0; i < (a.size() / (M * K)); ++i) {
+  for (int i = 0; i < batch_size; ++i) {
    cblas_dgemm(
        CblasRowMajor,
        a_transposed ? CblasTrans : CblasNoTrans, // transA
@@ -64,15 +77,14 @@ void matmul<double>(
        M,
        N,
        K,
-        alpha, // alpha
-        a.data<double>() + elem_to_loc(M * K * i, a.shape(), a.strides()),
+        alpha,
+        a + elem_to_loc(M * K * i, a_shape, a_strides),
        lda,
-        b.data<double>() + elem_to_loc(K * N * i, b.shape(), b.strides()),
+        b + elem_to_loc(K * N * i, b_shape, b_strides),
        ldb,
-        beta, // beta
-        out.data<double>() + M * N * i,
-        out.shape(-1) // ldc
-    );
+        beta,
+        out + M * N * i,
+        ldc);
  }
 }

--- a/mlx/backend/cpu/gemms/no_bf16.cpp
+++ b/mlx/backend/cpu/gemms/no_bf16.cpp
@@ -1,21 +0,0 @@
-// Copyright © 2025 Apple Inc.
-
-#include "mlx/backend/cpu/gemm.h"
-
-namespace mlx::core {
-
-template <>
-void matmul<bfloat16_t>(
-    const array&,
-    const array&,
-    array&,
-    bool,
-    bool,
-    size_t,
-    size_t,
-    float,
-    float) {
-  throw std::runtime_error("[Matmul::eval_cpu] bfloat16 not supported.");
-}
-
-} // namespace mlx::core
--- a/mlx/backend/cpu/gemms/no_fp16.cpp
+++ b/mlx/backend/cpu/gemms/no_fp16.cpp
@@ -1,21 +0,0 @@
-// Copyright © 2025 Apple Inc.
-
-#include "mlx/backend/cpu/gemm.h"
-
-namespace mlx::core {
-
-template <>
-void matmul<float16_t>(
-    const array&,
-    const array&,
-    array&,
-    bool,
-    bool,
-    size_t,
-    size_t,
-    float,
-    float) {
-  throw std::runtime_error("[Matmul::eval_cpu] float16 not supported.");
-}
-
-} // namespace mlx::core
--- a/mlx/backend/cpu/gemms/simd_bf16.cpp
+++ b/mlx/backend/cpu/gemms/simd_bf16.cpp
@@ -0,0 +1,45 @@
+// Copyright © 2025 Apple Inc.
+
+#include "mlx/backend/common/utils.h"
+#include "mlx/backend/cpu/gemm.h"
+#include "mlx/backend/cpu/gemms/simd_gemm.h"
+
+namespace mlx::core {
+
+// Batched bfloat16 matrix multiply built on simd_gemm with float
+// accumulation.
+//
+// M and K come from the last two entries of a_shape and N from the last
+// entry of b_shape (indexed with a_shape's rank). For each of the batch_size
+// products, the a and b operands are located with elem_to_loc using their
+// shape and strides, and the result goes to consecutive M * N slices of out.
+// lda, ldb and ldc belong to the common matmul signature but are not read
+// here.
+template <>
+void matmul<bfloat16_t>(
+    const bfloat16_t* a,
+    const bfloat16_t* b,
+    bfloat16_t* out,
+    bool a_transposed,
+    bool b_transposed,
+    size_t lda,
+    size_t ldb,
+    size_t ldc,
+    float alpha,
+    float beta,
+    size_t batch_size,
+    const Shape& a_shape,
+    const Strides& a_strides,
+    const Shape& b_shape,
+    const Strides& b_strides) {
+  const auto rank = a_shape.size();
+  const size_t M = a_shape[rank - 2];
+  const size_t N = b_shape[rank - 1];
+  const size_t K = a_shape[rank - 1];
+
+  // Element counts of one matrix of each operand.
+  const size_t a_matrix_size = M * K;
+  const size_t b_matrix_size = K * N;
+  const size_t out_matrix_size = M * N;
+
+  for (int batch = 0; batch < batch_size; ++batch) {
+    const bfloat16_t* a_matrix =
+        a + elem_to_loc(a_matrix_size * batch, a_shape, a_strides);
+    const bfloat16_t* b_matrix =
+        b + elem_to_loc(b_matrix_size * batch, b_shape, b_strides);
+    bfloat16_t* out_matrix = out + out_matrix_size * batch;
+    simd_gemm<bfloat16_t, float>(
+        a_matrix,
+        b_matrix,
+        out_matrix,
+        a_transposed,
+        b_transposed,
+        M,
+        N,
+        K,
+        alpha,
+        beta);
+  }
+}
+
+} // namespace mlx::core
--- a/mlx/backend/cpu/gemms/simd_fp16.cpp
+++ b/mlx/backend/cpu/gemms/simd_fp16.cpp
@@ -0,0 +1,45 @@
+// Copyright © 2025 Apple Inc.
+
+#include "mlx/backend/common/utils.h"
+#include "mlx/backend/cpu/gemm.h"
+#include "mlx/backend/cpu/gemms/simd_gemm.h"
+
+namespace mlx::core {
+
+// Batched float16 matrix multiply built on simd_gemm with float
+// accumulation.
+//
+// M and K come from the last two entries of a_shape and N from the last
+// entry of b_shape (indexed with a_shape's rank). For each of the batch_size
+// products, the a and b operands are located with elem_to_loc using their
+// shape and strides, and the result goes to consecutive M * N slices of out.
+// lda, ldb and ldc belong to the common matmul signature but are not read
+// here.
+template <>
+void matmul<float16_t>(
+    const float16_t* a,
+    const float16_t* b,
+    float16_t* out,
+    bool a_transposed,
+    bool b_transposed,
+    size_t lda,
+    size_t ldb,
+    size_t ldc,
+    float alpha,
+    float beta,
+    size_t batch_size,
+    const Shape& a_shape,
+    const Strides& a_strides,
+    const Shape& b_shape,
+    const Strides& b_strides) {
+  const auto rank = a_shape.size();
+  const size_t M = a_shape[rank - 2];
+  const size_t N = b_shape[rank - 1];
+  const size_t K = a_shape[rank - 1];
+
+  // Element counts of one matrix of each operand.
+  const size_t a_matrix_size = M * K;
+  const size_t b_matrix_size = K * N;
+  const size_t out_matrix_size = M * N;
+
+  for (int batch = 0; batch < batch_size; ++batch) {
+    const float16_t* a_matrix =
+        a + elem_to_loc(a_matrix_size * batch, a_shape, a_strides);
+    const float16_t* b_matrix =
+        b + elem_to_loc(b_matrix_size * batch, b_shape, b_strides);
+    float16_t* out_matrix = out + out_matrix_size * batch;
+    simd_gemm<float16_t, float>(
+        a_matrix,
+        b_matrix,
+        out_matrix,
+        a_transposed,
+        b_transposed,
+        M,
+        N,
+        K,
+        alpha,
+        beta);
+  }
+}
+
+} // namespace mlx::core
--- a/mlx/backend/cpu/gemms/simd_gemm.h
+++ b/mlx/backend/cpu/gemms/simd_gemm.h
@@ -0,0 +1,139 @@
+// Copyright © 2025 Apple Inc.
+#pragma once
+
+#include "mlx/backend/cpu/simd/simd.h"
+
+namespace mlx::core {
+
+// Integer division of a by b with the numerator biased by (b - 1), i.e. the
+// quotient rounded up for the non-negative a and positive b used here.
+inline int ceildiv(int a, int b) {
+  const int biased = a + b - 1;
+  return biased / b;
+}
+
+// Copy tile (i, j) of the row-major M x N matrix `in` into the
+// block_size x block_size buffer `out`, converting elements from T to AccT.
+//
+// Only elements that fall inside the matrix are written, so tiles that
+// overhang the right or bottom edge leave the remaining entries of `out`
+// untouched. With `transpose` set, source element (r, c) of the tile is
+// stored at out[c * block_size + r]; otherwise at out[r * block_size + c].
+template <int block_size, typename T, typename AccT>
+void load_block(
+    const T* in,
+    AccT* out,
+    int M,
+    int N,
+    int i,
+    int j,
+    bool transpose) {
+  // Coordinates of the tile's top-left element in the source matrix.
+  const int row0 = i * block_size;
+  const int col0 = j * block_size;
+
+  if (!transpose) {
+    for (int r = 0; r < block_size && row0 + r < M; ++r) {
+      const int row_offset = (row0 + r) * N;
+      for (int c = 0; c < block_size && col0 + c < N; ++c) {
+        out[r * block_size + c] = in[row_offset + col0 + c];
+      }
+    }
+    return;
+  }
+
+  for (int r = 0; r < block_size && row0 + r < M; ++r) {
+    const int row_offset = (row0 + r) * N;
+    for (int c = 0; c < block_size && col0 + c < N; ++c) {
+      out[c * block_size + r] = in[row_offset + col0 + c];
+    }
+  }
+}
+
+// Blocked GEMM: c = alpha * op(a) * op(b) + beta * c, where c is a dense
+// row-major M x N matrix and op() optionally transposes its operand
+// (a_trans / b_trans).
+//
+// The output is processed in 16 x 16 tiles. For each tile, matching tiles of
+// a and b are converted to AccT and staged in local buffers laid out so that
+// both are indexed [row-or-column][k]; products are then accumulated in AccT
+// with SIMD loads along k. A final partial k tile is handled with SIMD for
+// its largest multiple of the SIMD width and a scalar loop for the rest.
+// When beta is 0 the previous contents of c are not read.
+template <typename T, typename AccT>
+void simd_gemm(
+    const T* a,
+    const T* b,
+    T* c,
+    bool a_trans,
+    bool b_trans,
+    int M,
+    int N,
+    int K,
+    float alpha,
+    float beta) {
+  constexpr int block_size = 16;
+  constexpr int simd_size = simd::max_size<AccT>;
+  static_assert(
+      (block_size % simd_size) == 0,
+      "Block size must be divisible by SIMD size");
+
+  const int num_full_k_blocks = K / block_size;
+  // Length of the trailing partial k tile and its SIMD-aligned prefix.
+  const int tail_k = K - block_size * num_full_k_blocks;
+  const int tail_k_simd = (tail_k / simd_size) * simd_size;
+  const int num_row_blocks = ceildiv(M, block_size);
+  const int num_col_blocks = ceildiv(N, block_size);
+
+  for (int bi = 0; bi < num_row_blocks; bi++) {
+    const int row0 = bi * block_size;
+    for (int bj = 0; bj < num_col_blocks; bj++) {
+      const int col0 = bj * block_size;
+      AccT c_block[block_size * block_size] = {0.0};
+      AccT a_block[block_size * block_size];
+      AccT b_block[block_size * block_size];
+
+      // Stage the a and b tiles for k tile `bk`.
+      auto load_tiles = [&](int bk) {
+        if (a_trans) {
+          load_block<block_size>(a, a_block, K, M, bk, bi, true);
+        } else {
+          load_block<block_size>(a, a_block, M, K, bi, bk, false);
+        }
+        if (b_trans) {
+          load_block<block_size>(b, b_block, N, K, bj, bk, false);
+        } else {
+          load_block<block_size>(b, b_block, K, N, bk, bj, true);
+        }
+      };
+
+      // Full k tiles
+      int bk = 0;
+      for (; bk < num_full_k_blocks; bk++) {
+        load_tiles(bk);
+
+        for (int ii = 0; ii < block_size && row0 + ii < M; ++ii) {
+          const AccT* a_row = a_block + ii * block_size;
+          for (int jj = 0; jj < block_size && col0 + jj < N; ++jj) {
+            const AccT* b_row = b_block + jj * block_size;
+            AccT& acc = c_block[ii * block_size + jj];
+            for (int kk = 0; kk < block_size; kk += simd_size) {
+              auto av = simd::load<AccT, simd_size>(a_row + kk);
+              auto bv = simd::load<AccT, simd_size>(b_row + kk);
+              acc += simd::sum(av * bv);
+            }
+          }
+        }
+      }
+
+      // Trailing partial k tile
+      if (tail_k) {
+        load_tiles(bk);
+
+        for (int ii = 0; ii < block_size && row0 + ii < M; ++ii) {
+          const AccT* a_row = a_block + ii * block_size;
+          for (int jj = 0; jj < block_size && col0 + jj < N; ++jj) {
+            const AccT* b_row = b_block + jj * block_size;
+            AccT& acc = c_block[ii * block_size + jj];
+            int kk = 0;
+            for (; kk < tail_k_simd; kk += simd_size) {
+              auto av = simd::load<AccT, simd_size>(a_row + kk);
+              auto bv = simd::load<AccT, simd_size>(b_row + kk);
+              acc += simd::sum(av * bv);
+            }
+            for (; kk < tail_k; ++kk) {
+              acc += a_row[kk] * b_row[kk];
+            }
+          }
+        }
+      }
+
+      // Store
+      for (int ii = 0; ii < block_size && row0 + ii < M; ++ii) {
+        for (int jj = 0; jj < block_size && col0 + jj < N; ++jj) {
+          auto c_idx = (row0 + ii) * N + col0 + jj;
+          if (beta != 0) {
+            c[c_idx] = static_cast<T>(
+                alpha * c_block[ii * block_size + jj] + beta * c[c_idx]);
+          } else {
+            c[c_idx] = static_cast<T>(alpha * c_block[ii * block_size + jj]);
+          }
+        }
+      }
+    }
+  }
+}
+
+} // namespace mlx::core
--- a/mlx/backend/cpu/hadamard.cpp
+++ b/mlx/backend/cpu/hadamard.cpp
@@ -4,16 +4,17 @@

 #include "mlx/backend/common/hadamard.h"
 #include "mlx/backend/cpu/copy.h"
+#include "mlx/backend/cpu/encoder.h"
 #include "mlx/primitives.h"

 namespace mlx::core {

 // n = 2^k component
 template <typename T>
-void hadamard_n(array& out, int n, int m, float scale) {
-  for (int b = 0; b < out.size() / n; b++) {
+void hadamard_n(T* out, int n, int m, float scale, size_t size) {
+  for (int b = 0; b < size / n; b++) {
    size_t loc = b * n;
-    T* data_ptr = out.data<T>() + loc;
+    T* data_ptr = out + loc;
    int h = 1;
    int n_over_2 = n / 2;
    while (h < n) {
@@ -36,7 +37,7 @@ void hadamard_n(array& out, int n, int m, float scale) {

 // m component
 template <typename T>
-void hadamard_m(array& out, int n, int m, float scale) {
+void hadamard_m(T* out, int n, int m, float scale, size_t size) {
  auto h_matrices = hadamard_matrices();
  auto& matrix = h_matrices[m];
  auto start = 1;
@@ -51,9 +52,9 @@ void hadamard_m(array& out, int n, int m, float scale) {
    end = matrix.find('\n', start);
  }

-  for (int b = 0; b < out.size() / m / n; b++) {
+  for (int b = 0; b < size / m / n; b++) {
    size_t loc = b * n * m;
-    T* data_ptr = out.data<T>() + loc;
+    T* data_ptr = out + loc;
    for (int i = 0; i < n; i++) {
      std::vector<float> out(m);
      for (int j = 0; j < m; j++) {
@@ -74,12 +75,17 @@ void hadamard_m(array& out, int n, int m, float scale) {
 }

+// Enqueue the in-place Hadamard transform of `out` on `stream`.
+//
+// The data pointer and element count are captured and the work runs inside
+// the dispatched task: hadamard_n always runs, and hadamard_m runs
+// afterwards when m > 1. `scale` is passed to whichever of the two runs
+// last; when m > 1, hadamard_n receives 1.0 instead.
 template <typename T>
-void hadamard(array& out, int n, int m, float scale) {
-  float n_scale = m > 1 ? 1.0 : scale;
-  hadamard_n<T>(out, n, m, n_scale);
-  if (m > 1) {
-    hadamard_m<T>(out, n, m, scale);
-  }
+void hadamard(array& out, int n, int m, float scale, Stream stream) {
+  auto& encoder = cpu::get_command_encoder(stream);
+  encoder.set_output_array(out);
+  auto out_ptr = out.data<T>();
+  encoder.dispatch([out_ptr, size = out.size(), n, m, scale]() {
+    float n_scale = m > 1 ? 1.0 : scale;
+    hadamard_n<T>(out_ptr, n, m, n_scale, size);
+    if (m > 1) {
+      hadamard_m<T>(out_ptr, n, m, scale, size);
+    }
+  });
 }

 void Hadamard::eval_cpu(const std::vector<array>& inputs, array& out) {
@@ -87,18 +93,26 @@ void Hadamard::eval_cpu(const std::vector<array>& inputs, array& out) {
  auto& in = inputs[0];

  // Copy input to output
-  copy(in, out, CopyType::General);
+  if (in.flags().row_contiguous && in.is_donatable()) {
+    out.copy_shared_buffer(in);
+  } else {
+    copy_cpu(
+        in,
+        out,
+        in.flags().row_contiguous ? CopyType::Vector : CopyType::General,
+        stream());
+  }

  int axis = out.ndim() - 1;
  auto [n, m] = decompose_hadamard(out.shape(axis));

  switch (in.dtype()) {
    case float32:
-      return hadamard<float>(out, n, m, scale_);
+      return hadamard<float>(out, n, m, scale_, stream());
    case float16:
-      return hadamard<float16_t>(out, n, m, scale_);
+      return hadamard<float16_t>(out, n, m, scale_, stream());
    case bfloat16:
-      return hadamard<bfloat16_t>(out, n, m, scale_);
+      return hadamard<bfloat16_t>(out, n, m, scale_, stream());
    default:
      throw std::invalid_argument("[hadamard] Unsupported type.");
  }
--- a/mlx/backend/cpu/indexing.cpp
+++ b/mlx/backend/cpu/indexing.cpp
@@ -8,6 +8,7 @@

 #include "mlx/backend/common/utils.h"
 #include "mlx/backend/cpu/copy.h"
+#include "mlx/backend/cpu/encoder.h"

 namespace mlx::core {

@@ -21,6 +22,40 @@ inline size_t offset_neg_idx(uint32_t idx, size_t) {
  return idx;
 }

+// Update functor: overwrite *y with x.
+struct None {
+  template <typename T>
+  void operator()(T x, T* y) {
+    *y = x;
+  }
+};
+// Update functor: add x to *y in place.
+struct Sum {
+  template <typename T>
+  void operator()(T x, T* y) {
+    *y += x;
+  }
+};
+
+// Update functor: multiply *y by x in place.
+struct Prod {
+  template <typename T>
+  void operator()(T x, T* y) {
+    *y *= x;
+  }
+};
+
+// Update functor: keep *y when it compares greater than x, otherwise
+// replace it with x.
+struct Max {
+  template <typename T>
+  void operator()(T x, T* y) {
+    if (!(*y > x)) {
+      *y = x;
+    }
+  }
+};
+
+// Update functor: keep *y when it compares less than x, otherwise replace
+// it with x.
+struct Min {
+  template <typename T>
+  void operator()(T x, T* y) {
+    if (!(*y < x)) {
+      *y = x;
+    }
+  }
+};
+
 template <typename T, typename IdxT>
 void gather(
    const array& src,
@@ -73,13 +108,14 @@ void gather(
  size_t ind_size = slice_size == 0 ? 0 : out.size() / slice_size;
  const T* src_ptr = src.data<T>();
  T* dst_ptr = out.data<T>();
-  size_t out_idx = 0;

  std::vector<ContiguousIterator> its(inds.begin(), inds.end());
  ContiguousIterator src_it;
  if (!can_copy && src.ndim() > 0) {
    src_it = ContiguousIterator(slice_sizes, src.strides(), src.ndim());
  }
+
+  size_t out_idx = 0;
  for (int idx = 0; idx < ind_size; idx++) {
    size_t src_idx = 0;
    for (int ii = 0; ii < inds.size(); ++ii) {
@@ -161,46 +197,59 @@ void dispatch_gather(
 }

+// Allocate `out` and enqueue the gather on this primitive's CPU stream.
+// inputs[0] is the source array and every following input is an index
+// array. The index dtype switch runs inside the dispatched task.
 void Gather::eval_cpu(const std::vector<array>& inputs, array& out) {
-  out.set_data(allocator::malloc_or_wait(out.nbytes()));
+  out.set_data(allocator::malloc(out.nbytes()));

   auto& src = inputs[0];
-  std::vector<array> inds(inputs.begin() + 1, inputs.end());
-
-  if (inds.empty()) {
-    dispatch_gather<uint8_t>(src, inds, out, axes_, slice_sizes_);
-    return;
+  // The task captures array::unsafe_weak_copy copies of the arrays.
+  // NOTE(review): presumably these do not own the underlying data - confirm
+  // against array::unsafe_weak_copy.
+  std::vector<array> inds;
+  for (auto it = inputs.begin() + 1; it < inputs.end(); ++it) {
+    inds.push_back(array::unsafe_weak_copy(*it));
   }
-
-  switch (inds[0].dtype()) {
-    case uint8:
+  auto& encoder = cpu::get_command_encoder(stream());
+  for (auto& in : inputs) {
+    encoder.set_input_array(in);
+  }
+  encoder.set_output_array(out);
+  encoder.dispatch([axes_ = axes_,
+                    slice_sizes_ = slice_sizes_,
+                    src = array::unsafe_weak_copy(src),
+                    inds = std::move(inds),
+                    out = array::unsafe_weak_copy(out)]() mutable {
+    // Without index arrays the uint8_t instantiation is used.
+    if (inds.empty()) {
       dispatch_gather<uint8_t>(src, inds, out, axes_, slice_sizes_);
-      break;
-    case uint16:
-      dispatch_gather<uint16_t>(src, inds, out, axes_, slice_sizes_);
-      break;
-    case uint32:
-      dispatch_gather<uint32_t>(src, inds, out, axes_, slice_sizes_);
-      break;
-    case uint64:
-      dispatch_gather<uint64_t>(src, inds, out, axes_, slice_sizes_);
-      break;
-    case int8:
-      dispatch_gather<int8_t>(src, inds, out, axes_, slice_sizes_);
-      break;
-    case int16:
-      dispatch_gather<int16_t>(src, inds, out, axes_, slice_sizes_);
-      break;
-    case int32:
-      dispatch_gather<int32_t>(src, inds, out, axes_, slice_sizes_);
-      break;
-    case int64:
-      dispatch_gather<int64_t>(src, inds, out, axes_, slice_sizes_);
-      break;
-    default:
-      throw std::runtime_error(
-          "[Gather::eval_cpu] Cannot gather with indices type.");
-      break;
-  }
+      return;
+    }
+
+    // Select the index element type from the first index array.
+    switch (inds[0].dtype()) {
+      case uint8:
+        dispatch_gather<uint8_t>(src, inds, out, axes_, slice_sizes_);
+        break;
+      case uint16:
+        dispatch_gather<uint16_t>(src, inds, out, axes_, slice_sizes_);
+        break;
+      case uint32:
+        dispatch_gather<uint32_t>(src, inds, out, axes_, slice_sizes_);
+        break;
+      case uint64:
+        dispatch_gather<uint64_t>(src, inds, out, axes_, slice_sizes_);
+        break;
+      case int8:
+        dispatch_gather<int8_t>(src, inds, out, axes_, slice_sizes_);
+        break;
+      case int16:
+        dispatch_gather<int16_t>(src, inds, out, axes_, slice_sizes_);
+        break;
+      case int32:
+        dispatch_gather<int32_t>(src, inds, out, axes_, slice_sizes_);
+        break;
+      case int64:
+        dispatch_gather<int64_t>(src, inds, out, axes_, slice_sizes_);
+        break;
+      default:
+        throw std::runtime_error(
+            "[Gather::eval_cpu] Cannot gather with indices type.");
+        break;
+    }
+  });
 }
 template <typename T, typename IdxT>
 void gather_axis(
@@ -208,15 +257,11 @@ void gather_axis(
    const array& ind,
    array& out,
    const int axis) {
-  auto strides = ind.strides();
-  strides.erase(strides.begin() + axis);
-  auto shape = ind.shape();
-  shape.erase(shape.begin() + axis);
-  ContiguousIterator ind_it(shape, strides, src.ndim() - 1);
-
-  strides = src.strides();
-  strides.erase(strides.begin() + axis);
-  ContiguousIterator src_it(shape, strides, src.ndim() - 1);
+  auto shape = remove_index(ind.shape(), axis);
+  ContiguousIterator ind_it(
+      shape, remove_index(ind.strides(), axis), src.ndim() - 1);
+  ContiguousIterator src_it(
+      shape, remove_index(src.strides(), axis), src.ndim() - 1);

  auto ind_ptr = ind.data<IdxT>();
  auto src_ptr = src.data<T>();
@@ -235,6 +280,7 @@ void gather_axis(
  for (int i = axis + 1; i < ind.ndim(); ++i) {
    size_post *= ind.shape(i);
  }
+
  size_t stride_pre = size_post * ind_ax_size;
  for (size_t i = 0; i < size_pre; i++) {
    for (size_t k = 0; k < size_post; k++) {
@@ -304,39 +350,49 @@ void dispatch_gather_axis(
 }

 void GatherAxis::eval_cpu(const std::vector<array>& inputs, array& out) {
-  out.set_data(allocator::malloc_or_wait(out.nbytes()));
+  out.set_data(allocator::malloc(out.nbytes())); // output buffer
+  // Gather along axis_: inputs[0] is the source, inputs[1] the index array.
   auto& src = inputs[0];
   auto& inds = inputs[1];
-  switch (inds.dtype()) {
-    case uint8:
-      dispatch_gather_axis<uint8_t>(src, inds, out, axis_);
-      break;
-    case uint16:
-      dispatch_gather_axis<uint16_t>(src, inds, out, axis_);
-      break;
-    case uint32:
-      dispatch_gather_axis<uint32_t>(src, inds, out, axis_);
-      break;
-    case uint64:
-      dispatch_gather_axis<uint64_t>(src, inds, out, axis_);
-      break;
-    case int8:
-      dispatch_gather_axis<int8_t>(src, inds, out, axis_);
-      break;
-    case int16:
-      dispatch_gather_axis<int16_t>(src, inds, out, axis_);
-      break;
-    case int32:
-      dispatch_gather_axis<int32_t>(src, inds, out, axis_);
-      break;
-    case int64:
-      dispatch_gather_axis<int64_t>(src, inds, out, axis_);
-      break;
-    default:
-      throw std::runtime_error(
-          "[GatherAxis::eval_cpu] Cannot gather with indices type.");
-      break;
-  }
+  auto& encoder = cpu::get_command_encoder(stream()); // this op's stream
+  encoder.set_input_array(src); // register operands with the encoder
+  encoder.set_input_array(inds);
+  encoder.set_output_array(out);
+  encoder.dispatch([axis_ = axis_, // copies the member; no `this` capture
+                    src = array::unsafe_weak_copy(src),
+                    inds = array::unsafe_weak_copy(inds),
+                    out = array::unsafe_weak_copy(out)]() mutable {
+    switch (inds.dtype()) { // choose the index element type at run time
+      case uint8:
+        dispatch_gather_axis<uint8_t>(src, inds, out, axis_);
+        break;
+      case uint16:
+        dispatch_gather_axis<uint16_t>(src, inds, out, axis_);
+        break;
+      case uint32:
+        dispatch_gather_axis<uint32_t>(src, inds, out, axis_);
+        break;
+      case uint64:
+        dispatch_gather_axis<uint64_t>(src, inds, out, axis_);
+        break;
+      case int8:
+        dispatch_gather_axis<int8_t>(src, inds, out, axis_);
+        break;
+      case int16:
+        dispatch_gather_axis<int16_t>(src, inds, out, axis_);
+        break;
+      case int32:
+        dispatch_gather_axis<int32_t>(src, inds, out, axis_);
+        break;
+      case int64:
+        dispatch_gather_axis<int64_t>(src, inds, out, axis_);
+        break;
+      default: // every dtype not listed above
+        throw std::runtime_error(
+            "[GatherAxis::eval_cpu] Cannot gather with indices type.");
+        break;
+    }
+  });
 }

 template <typename InT, typename IdxT, typename OpT>
@@ -344,8 +400,7 @@ void scatter(
    const array& updates,
    array& out,
    const std::vector<array>& inds,
-    const std::vector<int>& axes,
-    const OpT& op) {
+    const std::vector<int>& axes) {
  int nind = inds.size();
  auto inds_ndim = updates.ndim() - out.ndim();
  size_t n_updates = nind ? inds[0].size() : 1;
@@ -361,9 +416,11 @@ void scatter(
  ContiguousIterator update_it(updates);
  ContiguousIterator out_it(update_shape, out.strides(), out.ndim());

+  auto out_ptr = out.data<InT>();
+  auto upd_ptr = updates.data<InT>();
  for (int i = 0; i < n_updates; ++i) {
    size_t out_offset = 0;
-    for (int j = 0; j < nind; ++j) {
+    for (int j = 0; j < inds.size(); ++j) {
      auto ax = axes[j];
      auto idx_loc = its[j].loc;
      its[j].step();
@@ -373,8 +430,7 @@ void scatter(
    }
    update_it.seek(i * update_size);
    for (int j = 0; j < update_size; ++j) {
-      op(updates.data<InT>()[update_it.loc],
-         out.data<InT>() + out_offset + out_it.loc);
+      OpT{}(upd_ptr[update_it.loc], out_ptr + out_offset + out_it.loc);
      update_it.step();
      out_it.step();
    }
@@ -392,26 +448,19 @@ void dispatch_scatter_inds(
    Scatter::ReduceType rtype) {
  switch (rtype) {
    case Scatter::None:
-      scatter<InT, IdxT>(
-          updates, out, indices, axes, [](auto x, auto* y) { (*y) = x; });
+      scatter<InT, IdxT, None>(updates, out, indices, axes);
      break;
    case Scatter::Sum:
-      scatter<InT, IdxT>(
-          updates, out, indices, axes, [](auto x, auto* y) { (*y) += x; });
+      scatter<InT, IdxT, Sum>(updates, out, indices, axes);
      break;
    case Scatter::Prod:
-      scatter<InT, IdxT>(
-          updates, out, indices, axes, [](auto x, auto* y) { (*y) *= x; });
+      scatter<InT, IdxT, Prod>(updates, out, indices, axes);
      break;
    case Scatter::Max:
-      scatter<InT, IdxT>(updates, out, indices, axes, [](auto x, auto* y) {
-        (*y) = (*y > x) ? *y : x;
-      });
+      scatter<InT, IdxT, Max>(updates, out, indices, axes);
      break;
    case Scatter::Min:
-      scatter<InT, IdxT>(updates, out, indices, axes, [](auto x, auto* y) {
-        (*y) = (*y < x) ? *y : x;
-      });
+      scatter<InT, IdxT, Min>(updates, out, indices, axes);
      break;
  }
 }
@@ -463,76 +512,80 @@ void Scatter::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() >= 2);

  auto& src = inputs[0];
-  std::vector<array> inds(inputs.begin() + 1, inputs.end() - 1);
  auto& updates = inputs.back();

  // Copy src into out (copy allocates memory for out)
  auto ctype =
      src.flags().row_contiguous ? CopyType::Vector : CopyType::General;
-  copy(src, out, ctype);
+  copy_cpu(src, out, ctype, stream());

-  switch (src.dtype()) {
-    case bool_:
-      dispatch_scatter<bool>(out, inds, updates, axes_, reduce_type_);
-      break;
-    case uint8:
-      dispatch_scatter<uint8_t>(out, inds, updates, axes_, reduce_type_);
-      break;
-    case uint16:
-      dispatch_scatter<uint16_t>(out, inds, updates, axes_, reduce_type_);
-      break;
-    case uint32:
-      dispatch_scatter<uint32_t>(out, inds, updates, axes_, reduce_type_);
-      break;
-    case uint64:
-      dispatch_scatter<uint64_t>(out, inds, updates, axes_, reduce_type_);
-      break;
-    case int8:
-      dispatch_scatter<int8_t>(out, inds, updates, axes_, reduce_type_);
-      break;
-    case int16:
-      dispatch_scatter<int16_t>(out, inds, updates, axes_, reduce_type_);
-      break;
-    case int32:
-      dispatch_scatter<int32_t>(out, inds, updates, axes_, reduce_type_);
-      break;
-    case int64:
-      dispatch_scatter<int64_t>(out, inds, updates, axes_, reduce_type_);
-      break;
-    case float16:
-      dispatch_scatter<float16_t>(out, inds, updates, axes_, reduce_type_);
-      break;
-    case float32:
-      dispatch_scatter<float>(out, inds, updates, axes_, reduce_type_);
-      break;
-    case float64:
-      dispatch_scatter<double>(out, inds, updates, axes_, reduce_type_);
-      break;
-    case bfloat16:
-      dispatch_scatter<bfloat16_t>(out, inds, updates, axes_, reduce_type_);
-      break;
-    case complex64:
-      dispatch_scatter<complex64_t>(out, inds, updates, axes_, reduce_type_);
-      break;
+  auto& encoder = cpu::get_command_encoder(stream());
+  std::vector<array> inds;
+  for (auto it = inputs.begin() + 1; it < inputs.end() - 1; ++it) {
+    encoder.set_input_array(*it);
+    inds.push_back(array::unsafe_weak_copy(*it));
  }
+  encoder.set_input_array(updates);
+  encoder.set_output_array(out);
+  encoder.dispatch([axes_ = axes_,
+                    reduce_type_ = reduce_type_,
+                    updates = array::unsafe_weak_copy(updates),
+                    inds = std::move(inds),
+                    out = array::unsafe_weak_copy(out)]() mutable {
+    switch (out.dtype()) {
+      case bool_:
+        dispatch_scatter<bool>(out, inds, updates, axes_, reduce_type_);
+        break;
+      case uint8:
+        dispatch_scatter<uint8_t>(out, inds, updates, axes_, reduce_type_);
+        break;
+      case uint16:
+        dispatch_scatter<uint16_t>(out, inds, updates, axes_, reduce_type_);
+        break;
+      case uint32:
+        dispatch_scatter<uint32_t>(out, inds, updates, axes_, reduce_type_);
+        break;
+      case uint64:
+        dispatch_scatter<uint64_t>(out, inds, updates, axes_, reduce_type_);
+        break;
+      case int8:
+        dispatch_scatter<int8_t>(out, inds, updates, axes_, reduce_type_);
+        break;
+      case int16:
+        dispatch_scatter<int16_t>(out, inds, updates, axes_, reduce_type_);
+        break;
+      case int32:
+        dispatch_scatter<int32_t>(out, inds, updates, axes_, reduce_type_);
+        break;
+      case int64:
+        dispatch_scatter<int64_t>(out, inds, updates, axes_, reduce_type_);
+        break;
+      case float16:
+        dispatch_scatter<float16_t>(out, inds, updates, axes_, reduce_type_);
+        break;
+      case float32:
+        dispatch_scatter<float>(out, inds, updates, axes_, reduce_type_);
+        break;
+      case float64:
+        dispatch_scatter<double>(out, inds, updates, axes_, reduce_type_);
+        break;
+      case bfloat16:
+        dispatch_scatter<bfloat16_t>(out, inds, updates, axes_, reduce_type_);
+        break;
+      case complex64:
+        dispatch_scatter<complex64_t>(out, inds, updates, axes_, reduce_type_);
+        break;
+    }
+  });
 }

 template <typename T, typename IdxT, typename OpT>
-void scatter_axis(
-    array& out,
-    const array idx,
-    const array& upd,
-    int axis,
-    const OpT& op) {
-  auto strides = idx.strides();
-  strides.erase(strides.begin() + axis);
-  auto shape = idx.shape();
-  shape.erase(shape.begin() + axis);
-  ContiguousIterator idx_it(shape, strides, upd.ndim() - 1);
-
-  strides = upd.strides();
-  strides.erase(strides.begin() + axis);
-  ContiguousIterator upd_it(shape, strides, upd.ndim() - 1);
+void scatter_axis(array& out, const array idx, const array& upd, int axis) {
+  auto shape = remove_index(idx.shape(), axis);
+  ContiguousIterator idx_it(
+      shape, remove_index(idx.strides(), axis), upd.ndim() - 1);
+  ContiguousIterator upd_it(
+      shape, remove_index(upd.strides(), axis), upd.ndim() - 1);

  auto idx_ptr = idx.data<IdxT>();
  auto upd_ptr = upd.data<T>();
@@ -557,8 +610,9 @@ void scatter_axis(
      for (int j = 0; j < idx_ax_size; ++j) {
        auto ind_val = offset_neg_idx(
            idx_ptr[idx_it.loc + j * idx_ax_stride], dst_ax_size);
-        op(upd_ptr[upd_it.loc + j * upd_ax_stride],
-           dst_ptr + k + ind_val * dst_ax_stride);
+        OpT{}(
+            upd_ptr[upd_it.loc + j * upd_ax_stride],
+            dst_ptr + k + ind_val * dst_ax_stride);
      }
      idx_it.step();
      upd_it.step();
@@ -576,12 +630,10 @@ void dispatch_scatter_axis_op(
    ScatterAxis::ReduceType rtype) {
  switch (rtype) {
    case ScatterAxis::None:
-      scatter_axis<InT, IdxT>(
-          out, idx, updates, axis, [](auto x, auto* y) { (*y) = x; });
+      scatter_axis<InT, IdxT, None>(out, idx, updates, axis);
      break;
    case ScatterAxis::Sum:
-      scatter_axis<InT, IdxT>(
-          out, idx, updates, axis, [](auto x, auto* y) { (*y) += x; });
+      scatter_axis<InT, IdxT, Sum>(out, idx, updates, axis);
      break;
  }
 }
@@ -634,53 +686,65 @@ void ScatterAxis::eval_cpu(const std::vector<array>& inputs, array& out) {
  // Copy src into out (copy allocates memory for out)
  auto ctype =
      src.flags().row_contiguous ? CopyType::Vector : CopyType::General;
-  copy(src, out, ctype);
+  copy_cpu(src, out, ctype, stream());

-  switch (src.dtype()) {
-    case bool_:
-      dispatch_scatter_axis<bool>(out, idx, updates, axis_, reduce_type_);
-      break;
-    case uint8:
-      dispatch_scatter_axis<uint8_t>(out, idx, updates, axis_, reduce_type_);
-      break;
-    case uint16:
-      dispatch_scatter_axis<uint16_t>(out, idx, updates, axis_, reduce_type_);
-      break;
-    case uint32:
-      dispatch_scatter_axis<uint32_t>(out, idx, updates, axis_, reduce_type_);
-      break;
-    case uint64:
-      dispatch_scatter_axis<uint64_t>(out, idx, updates, axis_, reduce_type_);
-      break;
-    case int8:
-      dispatch_scatter_axis<int8_t>(out, idx, updates, axis_, reduce_type_);
-      break;
-    case int16:
-      dispatch_scatter_axis<int16_t>(out, idx, updates, axis_, reduce_type_);
-      break;
-    case int32:
-      dispatch_scatter_axis<int32_t>(out, idx, updates, axis_, reduce_type_);
-      break;
-    case int64:
-      dispatch_scatter_axis<int64_t>(out, idx, updates, axis_, reduce_type_);
-      break;
-    case float16:
-      dispatch_scatter_axis<float16_t>(out, idx, updates, axis_, reduce_type_);
-      break;
-    case float32:
-      dispatch_scatter_axis<float>(out, idx, updates, axis_, reduce_type_);
-      break;
-    case float64:
-      dispatch_scatter_axis<double>(out, idx, updates, axis_, reduce_type_);
-      break;
-    case bfloat16:
-      dispatch_scatter_axis<bfloat16_t>(out, idx, updates, axis_, reduce_type_);
-      break;
-    case complex64:
-      dispatch_scatter_axis<complex64_t>(
-          out, idx, updates, axis_, reduce_type_);
-      break;
-  }
+  auto& encoder = cpu::get_command_encoder(stream());
+  encoder.set_input_array(idx);
+  encoder.set_input_array(updates);
+  encoder.set_output_array(out);
+  encoder.dispatch([axis_ = axis_,
+                    reduce_type_ = reduce_type_,
+                    idx = array::unsafe_weak_copy(idx),
+                    updates = array::unsafe_weak_copy(updates),
+                    out = array::unsafe_weak_copy(out)]() mutable {
+    switch (out.dtype()) {
+      case bool_:
+        dispatch_scatter_axis<bool>(out, idx, updates, axis_, reduce_type_);
+        break;
+      case uint8:
+        dispatch_scatter_axis<uint8_t>(out, idx, updates, axis_, reduce_type_);
+        break;
+      case uint16:
+        dispatch_scatter_axis<uint16_t>(out, idx, updates, axis_, reduce_type_);
+        break;
+      case uint32:
+        dispatch_scatter_axis<uint32_t>(out, idx, updates, axis_, reduce_type_);
+        break;
+      case uint64:
+        dispatch_scatter_axis<uint64_t>(out, idx, updates, axis_, reduce_type_);
+        break;
+      case int8:
+        dispatch_scatter_axis<int8_t>(out, idx, updates, axis_, reduce_type_);
+        break;
+      case int16:
+        dispatch_scatter_axis<int16_t>(out, idx, updates, axis_, reduce_type_);
+        break;
+      case int32:
+        dispatch_scatter_axis<int32_t>(out, idx, updates, axis_, reduce_type_);
+        break;
+      case int64:
+        dispatch_scatter_axis<int64_t>(out, idx, updates, axis_, reduce_type_);
+        break;
+      case float16:
+        dispatch_scatter_axis<float16_t>(
+            out, idx, updates, axis_, reduce_type_);
+        break;
+      case float32:
+        dispatch_scatter_axis<float>(out, idx, updates, axis_, reduce_type_);
+        break;
+      case float64:
+        dispatch_scatter_axis<double>(out, idx, updates, axis_, reduce_type_);
+        break;
+      case bfloat16:
+        dispatch_scatter_axis<bfloat16_t>(
+            out, idx, updates, axis_, reduce_type_);
+        break;
+      case complex64:
+        dispatch_scatter_axis<complex64_t>(
+            out, idx, updates, axis_, reduce_type_);
+        break;
+    }
+  });
 }

 } // namespace mlx::core
--- a/mlx/backend/cpu/inverse.cpp
+++ b/mlx/backend/cpu/inverse.cpp
@@ -2,20 +2,21 @@

 #include "mlx/allocator.h"
 #include "mlx/backend/cpu/copy.h"
+#include "mlx/backend/cpu/encoder.h"
 #include "mlx/backend/cpu/lapack.h"
 #include "mlx/primitives.h"

 namespace mlx::core {

 template <typename T>
-void general_inv(array& inv, int N, int i) {
+void general_inv(T* inv, int N) {
  int info;
-  auto ipiv = array::Data{allocator::malloc_or_wait(sizeof(int) * N)};
+  auto ipiv = array::Data{allocator::malloc(sizeof(int) * N)};
  // Compute LU factorization.
  getrf<T>(
      /* m = */ &N,
      /* n = */ &N,
-      /* a = */ inv.data<T>() + N * N * i,
+      /* a = */ inv,
      /* lda = */ &N,
      /* ipiv = */ static_cast<int*>(ipiv.buffer.raw_ptr()),
      /* info = */ &info);
@@ -48,12 +49,12 @@ void general_inv(array& inv, int N, int i) {
  }

  const int lwork = workspace_size;
-  auto scratch = array::Data{allocator::malloc_or_wait(sizeof(T) * lwork)};
+  auto scratch = array::Data{allocator::malloc(sizeof(T) * lwork)};

  // Compute inverse.
  getri<T>(
      /* m = */ &N,
-      /* a = */ inv.data<T>() + N * N * i,
+      /* a = */ inv,
      /* lda = */ &N,
      /* ipiv = */ static_cast<int*>(ipiv.buffer.raw_ptr()),
      /* work = */ static_cast<T*>(scratch.buffer.raw_ptr()),
@@ -68,29 +69,28 @@ void general_inv(array& inv, int N, int i) {
 }

 template <typename T>
-void tri_inv(array& inv, int N, int i, bool upper) {
+void tri_inv(T* inv, int N, bool upper) {
  const char uplo = upper ? 'L' : 'U';
  const char diag = 'N';
-  T* data = inv.data<T>() + N * N * i;
  int info;
  trtri<T>(
      /* uplo = */ &uplo,
      /* diag = */ &diag,
      /* N = */ &N,
-      /* a = */ data,
+      /* a = */ inv,
      /* lda = */ &N,
      /* info = */ &info);

  // zero out the other triangle
  if (upper) {
    for (int i = 0; i < N; i++) {
-      std::fill(data, data + i, 0.0f);
-      data += N;
+      std::fill(inv, inv + i, 0.0f);
+      inv += N;
    }
  } else {
    for (int i = 0; i < N; i++) {
-      std::fill(data + i + 1, data + N, 0.0f);
-      data += N;
+      std::fill(inv + i + 1, inv + N, 0.0f);
+      inv += N;
    }
  }

@@ -103,34 +103,53 @@ void tri_inv(array& inv, int N, int i, bool upper) {
 }

 template <typename T>
-void inverse_impl(const array& a, array& inv, bool tri, bool upper) {
+void inverse_impl(
+    const array& a,
+    array& inv,
+    bool tri,
+    bool upper,
+    Stream stream) {
   // Lapack uses the column-major convention. We take advantage of the following
   // identity to avoid transposing (see
   // https://math.stackexchange.com/a/340234):
   //   (A⁻¹)ᵀ = (Aᵀ)⁻¹

   // The inverse is computed in place, so just copy the input to the output.
-  copy(a, inv, a.flags().row_contiguous ? CopyType::Vector : CopyType::General);
+  copy_cpu(
+      a,
+      inv,
+      a.flags().row_contiguous ? CopyType::Vector : CopyType::General,
+      stream);

   const int N = a.shape(-1);
   const size_t num_matrices = a.size() / (N * N);

-  for (int i = 0; i < num_matrices; i++) {
-    if (tri) {
-      tri_inv<T>(inv, N, i, upper);
-    } else {
-      general_inv<T>(inv, N, i);
-    }
+  auto& encoder = cpu::get_command_encoder(stream);
+  encoder.set_output_array(inv);
+  // Matrix i starts at element offset i * N * N from inv's data pointer.
+  auto inv_ptr = inv.data<T>();
+  if (tri) {
+    encoder.dispatch([inv_ptr, N, num_matrices, upper]() {
+      for (size_t i = 0; i < num_matrices; i++) { // size_t: no int overflow
+        tri_inv<T>(inv_ptr + i * N * N, N, upper);
+      }
+    });
+  } else {
+    encoder.dispatch([inv_ptr, N, num_matrices]() {
+      for (size_t i = 0; i < num_matrices; i++) { // size_t: no int overflow
+        general_inv<T>(inv_ptr + i * N * N, N);
+      }
+    });
   }
 }

 void Inverse::eval_cpu(const std::vector<array>& inputs, array& output) {
  switch (inputs[0].dtype()) {
    case float32:
-      inverse_impl<float>(inputs[0], output, tri_, upper_);
+      inverse_impl<float>(inputs[0], output, tri_, upper_, stream());
      break;
    case float64:
-      inverse_impl<double>(inputs[0], output, tri_, upper_);
+      inverse_impl<double>(inputs[0], output, tri_, upper_, stream());
      break;
    default:
      throw std::runtime_error(
--- a/mlx/backend/cpu/jit_compiler.cpp
+++ b/mlx/backend/cpu/jit_compiler.cpp
@@ -2,6 +2,7 @@

 #include "mlx/backend/cpu/jit_compiler.h"

+#include <algorithm>
 #include <sstream>
 #include <vector>

--- a/mlx/backend/cpu/lapack.h
+++ b/mlx/backend/cpu/lapack.h
@@ -2,14 +2,14 @@

 #pragma once

-// Required for Visual Studio.
-// https://github.com/OpenMathLib/OpenBLAS/blob/develop/docs/install.md
-#ifdef _MSC_VER
 #include <complex>
 #define LAPACK_COMPLEX_CUSTOM
 #define lapack_complex_float std::complex<float>
 #define lapack_complex_double std::complex<double>
-#endif
+#define lapack_complex_float_real(z) ((z).real())
+#define lapack_complex_float_imag(z) ((z).imag())
+#define lapack_complex_double_real(z) ((z).real())
+#define lapack_complex_double_imag(z) ((z).imag())

 #ifdef MLX_USE_ACCELERATE
 #include <Accelerate/Accelerate.h>
@@ -32,7 +32,7 @@

 #endif

-#define INSTANTIATE_LAPACK_TYPES(FUNC)                       \
+#define INSTANTIATE_LAPACK_REAL(FUNC)                        \
  template <typename T, typename... Args>                    \
  void FUNC(Args... args) {                                  \
    if constexpr (std::is_same_v<T, float>) {                \
@@ -42,11 +42,24 @@
    }                                                        \
  }

-INSTANTIATE_LAPACK_TYPES(geqrf)
-INSTANTIATE_LAPACK_TYPES(orgqr)
-INSTANTIATE_LAPACK_TYPES(syevd)
-INSTANTIATE_LAPACK_TYPES(potrf)
-INSTANTIATE_LAPACK_TYPES(gesvdx)
-INSTANTIATE_LAPACK_TYPES(getrf)
-INSTANTIATE_LAPACK_TYPES(getri)
-INSTANTIATE_LAPACK_TYPES(trtri)
+INSTANTIATE_LAPACK_REAL(geqrf)
+INSTANTIATE_LAPACK_REAL(orgqr)
+INSTANTIATE_LAPACK_REAL(syevd)
+INSTANTIATE_LAPACK_REAL(geev)
+INSTANTIATE_LAPACK_REAL(potrf)
+INSTANTIATE_LAPACK_REAL(gesvdx)
+INSTANTIATE_LAPACK_REAL(getrf)
+INSTANTIATE_LAPACK_REAL(getri)
+INSTANTIATE_LAPACK_REAL(trtri)
+// Complex variant: c##FUNC for complex<float>, z##FUNC for complex<double>.
+#define INSTANTIATE_LAPACK_COMPLEX(FUNC)                            \
+  template <typename T, typename... Args>                           \
+  void FUNC(Args... args) {                                         \
+    if constexpr (std::is_same_v<T, std::complex<float>>) {         \
+      MLX_LAPACK_FUNC(c##FUNC)(std::forward<Args>(args)...);        \
+    } else if constexpr (std::is_same_v<T, std::complex<double>>) { \
+      MLX_LAPACK_FUNC(z##FUNC)(std::forward<Args>(args)...);        \
+    }                                                               \
+  }
+
+INSTANTIATE_LAPACK_COMPLEX(heevd)
--- a/mlx/backend/cpu/logsumexp.cpp
+++ b/mlx/backend/cpu/logsumexp.cpp
@@ -0,0 +1,139 @@
+// Copyright © 2023-2024 Apple Inc.
+
+#include <cassert>
+#include <cmath>
+
+#include "mlx/backend/cpu/copy.h"
+#include "mlx/backend/cpu/encoder.h"
+#include "mlx/backend/cpu/simd/simd.h"
+#include "mlx/primitives.h"
+#include "mlx/types/limits.h"
+
+namespace mlx::core {
+
+namespace {
+
+using namespace mlx::core::simd;
+
+template <typename T, typename AccT> // T: storage, AccT: accumulation
+void logsumexp(const array& in, array& out, Stream stream) {
+  auto& encoder = cpu::get_command_encoder(stream);
+  encoder.set_input_array(in);
+  encoder.set_output_array(out);
+
+  const T* in_ptr = in.data<T>();
+  T* out_ptr = out.data<T>();
+  // out[i] = log(sum(exp(row i))): L rows of M contiguous input elements.
+  int M = in.shape().back();
+  int L = in.data_size() / M;
+
+  encoder.dispatch([in_ptr, out_ptr, M, L]() mutable {
+    constexpr int N = std::min(max_size<AccT>, max_size<T>); // Simd width
+
+    const T* current_in_ptr;
+
+    for (int i = 0; i < L; i++, in_ptr += M, out_ptr += 1) { // one row
+      // Pass 1: row maximum, N elements at a time, then a scalar tail
+      current_in_ptr = in_ptr;
+      Simd<AccT, N> vmaximum(-numeric_limits<AccT>::infinity());
+      size_t s = M;
+      while (s >= N) { // full Simd chunks
+        Simd<AccT, N> vals = load<T, N>(current_in_ptr);
+        vmaximum = maximum(vals, vmaximum);
+        current_in_ptr += N;
+        s -= N;
+      }
+
+      AccT maximum = max(vmaximum);
+      while (s-- > 0) { // scalar tail: the last s < N elements
+        maximum = std::max(maximum, static_cast<AccT>(*current_in_ptr));
+        current_in_ptr++;
+      }
+
+      // Pass 2: normalizer = sum(exp(x - maximum)) over the same row
+      Simd<AccT, N> vnormalizer(0.0);
+      current_in_ptr = in_ptr;
+      s = M;
+      while (s >= N) { // full Simd chunks
+        Simd<AccT, N> vexp = load<T, N>(current_in_ptr);
+        vexp = exp(vexp - maximum);
+        vnormalizer = vnormalizer + vexp;
+        current_in_ptr += N;
+        s -= N;
+      }
+      AccT normalizer = sum(vnormalizer);
+      while (s-- > 0) { // scalar tail: the last s < N elements
+        AccT _exp = std::exp(*current_in_ptr - maximum);
+        normalizer += _exp;
+        current_in_ptr++;
+      }
+      // log(normalizer) + maximum; an infinite maximum is passed through
+      *out_ptr = std::isinf(maximum)
+          ? static_cast<T>(maximum)
+          : static_cast<T>(std::log(normalizer) + maximum);
+    }
+  });
+}
+
+} // namespace
+
+void LogSumExp::eval_cpu(const std::vector<array>& inputs, array& out) {
+  assert(inputs.size() == 1);
+
+  // Use the input as is when its last axis has stride 1, else make a copy.
+  auto s = stream();
+  auto& encoder = cpu::get_command_encoder(s);
+  auto ensure_contiguous = [&s, &encoder](const array& x) {
+    if (x.flags().contiguous && x.strides()[x.ndim() - 1] == 1) {
+      return x;
+    } else {
+      array x_copy = contiguous_copy_cpu(x, s);
+      encoder.add_temporary(x_copy); // hand the copy to the encoder
+      return x_copy;
+    }
+  };
+
+  auto in = ensure_contiguous(inputs[0]);
+  if (in.flags().row_contiguous) {
+    out.set_data(allocator::malloc(out.nbytes()));
+  } else {
+    auto n = in.shape(-1); // length of the reduced (last) axis
+    auto flags = in.flags();
+    auto strides = in.strides();
+    for (auto& stride : strides) { // out reuses in's strides, divided by n
+      stride /= n;
+    }
+    bool col_contig = strides[0] == 1; // recompute col_contiguous for out
+    for (int i = 1; col_contig && i < strides.size(); ++i) {
+      col_contig &=
+          (out.shape(i) == 1 || strides[i - 1] == out.shape(i) * strides[i]);
+    }
+    flags.col_contiguous = col_contig;
+    out.set_data(
+        allocator::malloc(in.nbytes() / n),
+        in.data_size() / n,
+        std::move(strides),
+        flags);
+  }
+
+  switch (in.dtype()) { // template args: <storage type, accumulator type>
+    case float32:
+      logsumexp<float, float>(in, out, s);
+      break;
+    case float16:
+      logsumexp<float16_t, float>(in, out, s);
+      break;
+    case bfloat16:
+      logsumexp<bfloat16_t, float>(in, out, s);
+      break;
+    case float64:
+      logsumexp<double, double>(in, out, s);
+      break;
+    default:
+      throw std::runtime_error(
+          "[logsumexp] only supports floating point types");
+      break;
+  }
+}
+
+} // namespace mlx::core
--- a/mlx/backend/cpu/luf.cpp
+++ b/mlx/backend/cpu/luf.cpp
@@ -4,15 +4,22 @@

 #include "mlx/allocator.h"
 #include "mlx/backend/cpu/copy.h"
+#include "mlx/backend/cpu/encoder.h"
 #include "mlx/backend/cpu/lapack.h"
 #include "mlx/primitives.h"

 namespace mlx::core {

 template <typename T>
-void luf_impl(const array& a, array& lu, array& pivots, array& row_indices) {
+void luf_impl(
+    const array& a,
+    array& lu,
+    array& pivots,
+    array& row_indices,
+    Stream stream) {
  int M = a.shape(-2);
  int N = a.shape(-1);
+  int K = std::min(M, N);

  // Copy a into lu and make it col contiguous
  auto ndim = lu.ndim();
@@ -23,60 +30,74 @@ void luf_impl(const array& a, array& lu, array& pivots, array& row_indices) {
  auto strides = lu.strides();
  strides[ndim - 1] = M;
  strides[ndim - 2] = 1;
-  lu.set_data(
-      allocator::malloc_or_wait(lu.nbytes()), lu.nbytes(), strides, flags);
-  copy_inplace(
-      a, lu, a.shape(), a.strides(), strides, 0, 0, CopyType::GeneralGeneral);
+  lu.set_data(allocator::malloc(lu.nbytes()), lu.nbytes(), strides, flags);
+  copy_cpu_inplace(
+      a,
+      lu,
+      a.shape(),
+      a.strides(),
+      strides,
+      0,
+      0,
+      CopyType::GeneralGeneral,
+      stream);

  auto a_ptr = lu.data<T>();
-
-  pivots.set_data(allocator::malloc_or_wait(pivots.nbytes()));
-  row_indices.set_data(allocator::malloc_or_wait(row_indices.nbytes()));
+  pivots.set_data(allocator::malloc(pivots.nbytes()));
+  row_indices.set_data(allocator::malloc(row_indices.nbytes()));
  auto pivots_ptr = pivots.data<uint32_t>();
  auto row_indices_ptr = row_indices.data<uint32_t>();
-
-  int info;
  size_t num_matrices = a.size() / (M * N);
-  for (size_t i = 0; i < num_matrices; ++i) {
-    // Compute LU factorization of A
-    getrf<T>(
-        /* m */ &M,
-        /* n */ &N,
-        /* a */ a_ptr,
-        /* lda */ &M,
-        /* ipiv */ reinterpret_cast<int*>(pivots_ptr),
-        /* info */ &info);
+  auto& encoder = cpu::get_command_encoder(stream);
+  encoder.set_input_array(a);
+  encoder.set_output_array(lu);
+  encoder.set_output_array(pivots);
+  encoder.set_output_array(row_indices);

-    if (info != 0) {
-      std::stringstream ss;
-      ss << "[LUF::eval_cpu] sgetrf_ failed with code " << info
-         << ((info > 0) ? " because matrix is singular"
-                        : " because argument had an illegal value");
-      throw std::runtime_error(ss.str());
-    }
+  encoder.dispatch(
+      [a_ptr, pivots_ptr, row_indices_ptr, num_matrices, M, N, K]() mutable {
+        int info;
+        for (size_t i = 0; i < num_matrices; ++i) {
+          // Compute LU factorization of A
+          getrf<T>(
+              /* m */ &M,
+              /* n */ &N,
+              /* a */ a_ptr,
+              /* lda */ &M,
+              /* ipiv */ reinterpret_cast<int*>(pivots_ptr),
+              /* info */ &info);

-    // Subtract 1 to get 0-based index
-    int j = 0;
-    for (; j < pivots.shape(-1); ++j) {
-      pivots_ptr[j]--;
-      row_indices_ptr[j] = j;
-    }
-    for (; j < row_indices.shape(-1); ++j) {
-      row_indices_ptr[j] = j;
-    }
-    for (int j = pivots.shape(-1) - 1; j >= 0; --j) {
-      auto piv = pivots_ptr[j];
-      auto t1 = row_indices_ptr[piv];
-      auto t2 = row_indices_ptr[j];
-      row_indices_ptr[j] = t1;
-      row_indices_ptr[piv] = t2;
-    }
+          if (info != 0) {
+            std::stringstream ss;
+            ss << "[LUF::eval_cpu] sgetrf_ failed with code " << info
+               << ((info > 0) ? " because matrix is singular"
+                              : " because argument had an illegal value");
+            throw std::runtime_error(ss.str());
+          }

-    // Advance pointers to the next matrix
-    a_ptr += M * N;
-    pivots_ptr += pivots.shape(-1);
-    row_indices_ptr += pivots.shape(-1);
-  }
+          // Subtract 1 to get 0-based index
+          int j = 0;
+          for (; j < K; ++j) {
+            pivots_ptr[j]--;
+            row_indices_ptr[j] = j;
+          }
+          for (; j < M; ++j) {
+            row_indices_ptr[j] = j;
+          }
+          for (int j = K - 1; j >= 0; --j) {
+            auto piv = pivots_ptr[j];
+            auto t1 = row_indices_ptr[piv];
+            auto t2 = row_indices_ptr[j];
+            row_indices_ptr[j] = t1;
+            row_indices_ptr[piv] = t2;
+          }
+
+          // Advance pointers to the next matrix
+          a_ptr += M * N;
+          pivots_ptr += K;
+          row_indices_ptr += M;
+        }
+      });
 }

 void LUF::eval_cpu(
@@ -85,10 +106,10 @@ void LUF::eval_cpu(
  assert(inputs.size() == 1);
  switch (inputs[0].dtype()) {
    case float32:
-      luf_impl<float>(inputs[0], outputs[0], outputs[1], outputs[2]);
+      luf_impl<float>(inputs[0], outputs[0], outputs[1], outputs[2], stream());
      break;
    case float64:
-      luf_impl<double>(inputs[0], outputs[0], outputs[1], outputs[2]);
+      luf_impl<double>(inputs[0], outputs[0], outputs[1], outputs[2], stream());
      break;
    default:
      throw std::runtime_error(
--- a/Show More
+++ b/Show More