CMakeLists.txt update

WIP (python)
WIP
2025-12-16 01:49:05 +08:00 · 2025-10-31 16:55:04 -07:00 · 2025-10-31 16:24:51 -07:00 · 2025-10-31 16:24:35 -07:00 · 2025-10-31 16:24:21 -07:00 · 2025-10-31 16:24:09 -07:00
518 changed files with 38350 additions and 8327 deletions
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -7,15 +7,9 @@ parameters:
  nightly_build:
    type: boolean
    default: false
-  weekly_build:
-    type: boolean
-    default: false
  test_release:
    type: boolean
    default: false
-  linux_release:
-    type: boolean
-    default: false

 jobs:
  build_documentation:
@@ -24,21 +18,22 @@ jobs:
        type: boolean
        default: false
    macos:
-      xcode: "16.2.0"
-    resource_class: m2pro.medium
+      xcode: "26.0.0"
+    resource_class: m4pro.medium
    steps:
      - checkout
      - run:
          name: Install
          command: |
-            brew install python@3.9
+            xcodebuild -downloadComponent MetalToolchain
+            brew install python@3.10
            brew install doxygen
-            python3.9 -m venv env
+            python3.10 -m venv env
            source env/bin/activate
            pip install --upgrade pip
            pip install --upgrade cmake
            pip install -r docs/requirements.txt
-            CMAKE_BUILD_PARALLEL_LEVEL=`sysctl -n hw.ncpu` pip install . -v
+            pip install . -v
      - when:
          condition:
            not: << parameters.upload-docs >>
@@ -70,9 +65,9 @@ jobs:
                 git push -f origin gh-pages

  linux_build_and_test:
-    docker:
-      - image: cimg/python:3.9
-
+    machine:
+      image: ubuntu-2204:current
+      resource_class: large
    steps:
      - checkout
      - run:
@@ -84,37 +79,37 @@ jobs:
      - run:
          name: Install dependencies
          command: |
-            pip install --upgrade cmake
-            pip install nanobind==2.4.0
-            pip install numpy
+            export DEBIAN_FRONTEND=noninteractive
+            export NEEDRESTART_MODE=a
            sudo apt-get update
-            sudo apt-get install libblas-dev liblapack-dev liblapacke-dev
+            sudo apt-get install -y libblas-dev liblapack-dev liblapacke-dev
            sudo apt-get install openmpi-bin openmpi-common libopenmpi-dev
+            curl -LsSf https://astral.sh/uv/install.sh | sh
      - run:
          name: Install Python package
          command: |
-            CMAKE_ARGS="-DMLX_BUILD_METAL=OFF" \
-              CMAKE_BUILD_PARALLEL_LEVEL=`nproc` \
-              python3 setup.py build_ext --inplace
-            CMAKE_ARGS="-DMLX_BUILD_METAL=OFF" \
-              CMAKE_BUILD_PARALLEL_LEVEL=`nproc` \
-              python3 setup.py develop
+            uv venv
+            uv pip install cmake
+            DEBUG=1 CMAKE_ARGS="-DCMAKE_COMPILE_WARNING_AS_ERROR=ON" \
+              uv pip install -e ".[dev]" -v
      - run:
          name: Generate package stubs
          command: |
-            echo "stubs"
-            pip install typing_extensions
-            python setup.py generate_stubs 
+            uv pip install typing_extensions
+            uv run --no-project setup.py generate_stubs
      - run:
          name: Run Python tests
          command: |
-            python3 -m unittest discover python/tests -v
+            source .venv/bin/activate
+            python -m unittest discover python/tests -v
            mpirun --bind-to none -host localhost:8 -np 8 python python/tests/mpi_test_distributed.py
-            mlx.launch --verbose -n 8 python/tests/ring_test_distributed.py
+            mlx.launch --verbose -n 8 python/tests/ring_test_distributed.py -v 2> >(tee -a stderr.log >&2)
+            if grep -q "\[WARN\]" stderr.log; then echo "Distributed ring test failed"; exit 1; fi
      - run:
          name: Build CPP only
          command: |
-            mkdir -p build && cd build 
+            source .venv/bin/activate
+            mkdir -p build && cd build
            cmake .. -DMLX_BUILD_METAL=OFF -DCMAKE_BUILD_TYPE=DEBUG
            make -j `nproc`
      - run:
@@ -125,7 +120,7 @@ jobs:
    parameters:
      xcode_version:
        type: string
-        default: "16.2.0"
+        default: "26.0.0"
      macosx_deployment_target:
        type: string
        default: ""
@@ -133,57 +128,56 @@ jobs:
      xcode: << parameters.xcode_version >>
    environment:
      MACOSX_DEPLOYMENT_TARGET: << parameters.macosx_deployment_target >>
-    resource_class: m2pro.medium
+    resource_class: m4pro.medium
    steps:
      - checkout
      - run:
          name: Install dependencies
          command: |
-            brew install python@3.9
-            brew install openmpi
-            python3.9 -m venv env
-            source env/bin/activate
-            pip install --upgrade pip
-            pip install --upgrade cmake
-            pip install nanobind==2.4.0
-            pip install numpy
-            pip install torch
-            pip install tensorflow
-            pip install unittest-xml-reporting
+            xcodebuild -downloadComponent MetalToolchain
+            HOMEBREW_NO_AUTO_UPDATE=1 HOMEBREW_NO_INSTALL_CLEANUP=1 \
+              brew install openmpi uv
      - run:
          name: Install Python package
          command: |
-            source env/bin/activate
-            DEBUG=1 CMAKE_BUILD_PARALLEL_LEVEL=`sysctl -n hw.ncpu` \
-            CMAKE_ARGS="-DCMAKE_COMPILE_WARNING_AS_ERROR=ON" \
-              pip install -e . -v
+            uv venv --python 3.10
+            uv pip install \
+              nanobind==2.4.0 \
+              cmake \
+              numpy \
+              torch \
+              tensorflow \
+              unittest-xml-reporting
+            DEBUG=1 CMAKE_ARGS="-DCMAKE_COMPILE_WARNING_AS_ERROR=ON" \
+              uv pip install -e . -v
      - run:
          name: Generate package stubs
          command: |
-            source env/bin/activate
-            pip install typing_extensions
-            python setup.py generate_stubs 
+            uv pip install typing_extensions
+            uv run --no-project setup.py generate_stubs
      - run:
          name: Run Python tests
          command: |
-            source env/bin/activate
+            source .venv/bin/activate
            LOW_MEMORY=1 DEVICE=cpu python -m xmlrunner discover -v python/tests -o test-results/cpu
            LOW_MEMORY=1 DEVICE=gpu METAL_DEVICE_WRAPPER_TYPE=1 METAL_DEBUG_ERROR_MODE=0 python -m xmlrunner discover -v python/tests -o test-results/gpu
            mpirun --bind-to none -host localhost:8 -np 8 -x DYLD_LIBRARY_PATH=/opt/homebrew/lib/ python python/tests/mpi_test_distributed.py
-            mlx.launch --verbose -n 8 python/tests/ring_test_distributed.py
+            mlx.launch --verbose -n 8 python/tests/ring_test_distributed.py -v 2> >(tee -a stderr.log >&2)
+            if grep -q "\[WARN\]" stderr.log; then echo "Distributed ring test failed"; exit 1; fi
      - run:
          name: Build example extension
          command: |
-            source env/bin/activate
+            source .venv/bin/activate
            cd examples/extensions
-            pip install -r requirements.txt
-            python setup.py build_ext -j8
+            uv pip install -r requirements.txt
+            uv run --no-project setup.py build_ext --inplace
+            uv run --no-project python test.py
      - store_test_results:
          path: test-results
      - run:
          name: Build CPP only
          command: |
-            source env/bin/activate
+            source .venv/bin/activate
            mkdir -p build && cd build && cmake .. && make -j `sysctl -n hw.ncpu`
      - run:
          name: Run CPP tests
@@ -192,7 +186,7 @@ jobs:
      - run:
          name: Build small binary
          command: |
-            source env/bin/activate
+            source .venv/bin/activate
            cd build/
            cmake .. -DCMAKE_BUILD_TYPE=MinSizeRel \
              -DBUILD_SHARED_LIBS=ON \
@@ -204,22 +198,85 @@ jobs:
      - run:
          name: Run Python tests with JIT
          command: |
-            source env/bin/activate
-            CMAKE_BUILD_PARALLEL_LEVEL=`sysctl -n hw.ncpu` \
-              CMAKE_ARGS="-DMLX_METAL_JIT=ON" \
-              pip install -e . -v
+            CMAKE_ARGS="-DMLX_METAL_JIT=ON" \
+              uv pip install -e . -v
            LOW_MEMORY=1 DEVICE=gpu METAL_DEVICE_WRAPPER_TYPE=1 \
              METAL_DEBUG_ERROR_MODE=0 \
-              python -m xmlrunner discover -v python/tests -o test-results/gpu_jit
+              uv run --no-project python -m xmlrunner discover \
+                -v python/tests \
+                -o test-results/gpu_jit
+
+  cuda_build_and_test:
+    parameters:
+      image_date:
+        type: string
+        default: "2023.11.1"
+    machine:
+      image: "linux-cuda-12:<< parameters.image_date >>"
+      resource_class: gpu.nvidia.small.gen2
+    steps:
+      - checkout
+      - restore_cache:
+          keys:
+            - cuda-<< parameters.image_date >>-{{ arch }}-
+      - run:
+          name: Install dependencies
+          command: |
+            sudo apt-get update
+            sudo apt-get install -y libcudnn9-dev-cuda-12
+            sudo apt-get install -y libblas-dev liblapack-dev liblapacke-dev
+            sudo apt-get install -y libnccl2 libnccl-dev
+            curl -sL https://github.com/ccache/ccache/releases/download/v4.11.3/ccache-4.11.3-linux-x86_64.tar.xz | tar xJf -
+            sudo mv ccache-4.11.3-linux-x86_64/ccache /usr/bin/ccache
+            rm -rf ccache-4.11.3-linux-x86_64
+            curl -LsSf https://astral.sh/uv/install.sh | sh
+      - run:
+          name: Set CCache size
+          command: ccache --max-size 1G
+      - run:
+          name: Install Python package
+          command: |
+            uv venv
+            uv pip install cmake
+            DEBUG=1 CMAKE_ARGS="-DMLX_BUILD_CUDA=ON -DCMAKE_COMPILE_WARNING_AS_ERROR=ON -DCMAKE_CUDA_COMPILER=`which nvcc`" \
+              uv pip install -e ".[dev]" -v
+      - run:
+          name: Run Python tests
+          command: |
+            source .venv/bin/activate
+            LOW_MEMORY=1 DEVICE=cpu python -m unittest discover python/tests -v
+            LOW_MEMORY=1 DEVICE=gpu python -m tests discover python/tests -v
+      - run:
+          name: Build CPP only
+          command: |
+            source .venv/bin/activate
+            cmake . -B build \
+              -DMLX_BUILD_CUDA=ON \
+              -DCMAKE_CUDA_COMPILER=`which nvcc` \
+              -DCMAKE_BUILD_TYPE=DEBUG
+            cmake --build build -j `nproc`
+      - run:
+          name: Run CPP tests
+          command: ./build/tests/tests -sfe="*fft_tests.cpp,*linalg_tests.cpp"
+      - run:
+          name: CCache report
+          command: |
+            ccache --show-stats
+            ccache --zero-stats
+            ccache --cleanup
+      - save_cache:
+          key: cuda-<< parameters.image_date >>-{{ arch }}-{{ epoch }}
+          paths:
+            - /home/circleci/.cache/ccache

  build_release:
    parameters:
      python_version:
        type: string
-        default: "3.9"
+        default: "3.10"
      xcode_version:
        type: string
-        default: "16.2.0"
+        default: "26.0.0"
      build_env:
        type: string
        default: ""
@@ -228,7 +285,7 @@ jobs:
        default: ""
    macos:
      xcode: << parameters.xcode_version >>
-    resource_class: m2pro.medium
+    resource_class: m4pro.medium
    environment:
      MACOSX_DEPLOYMENT_TARGET: << parameters.macosx_deployment_target >>
    steps:
@@ -236,11 +293,15 @@ jobs:
      - run:
          name: Install dependencies
          command: |
-            brew install python@<< parameters.python_version >>
-            brew install openmpi
-            python<< parameters.python_version >> -m venv env
-            source env/bin/activate
-            pip install --upgrade pip
+            xcodebuild -downloadComponent MetalToolchain
+            mkdir -p ~/miniconda3
+            curl https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-arm64.sh -o ~/miniconda3/miniconda.sh
+            bash ~/miniconda3/miniconda.sh -b -u -p ~/miniconda3
+            rm ~/miniconda3/miniconda.sh
+            source ~/miniconda3/bin/activate
+            conda init --all
+            conda create -n env python=<< parameters.python_version >> -y
+            conda activate env
            pip install --upgrade cmake
            pip install nanobind==2.4.0
            pip install --upgrade setuptools
@@ -250,30 +311,38 @@ jobs:
      - run:
          name: Install Python package
          command: |
-            source env/bin/activate
+            conda activate env
            env -u MACOSX_DEPLOYMENT_TARGET DEV_RELEASE=1 \
-              CMAKE_BUILD_PARALLEL_LEVEL=`sysctl -n hw.ncpu` \
              pip install . -v
      - run:
          name: Generate package stubs
          command: |
-            source env/bin/activate
+            conda activate env
            pip install typing_extensions
-            python setup.py generate_stubs 
+            python setup.py generate_stubs
      - run:
          name: Build Python package
          command: |
-            source env/bin/activate
-            << parameters.build_env >> \
-              CMAKE_BUILD_PARALLEL_LEVEL=`sysctl -n hw.ncpu` \
-              python -m build -w
+            conda activate env
+            python setup.py clean --all
+            << parameters.build_env >> MLX_BUILD_STAGE=1 python -m build -w
+      - when:
+          condition:
+            equal: ["3.10", << parameters.python_version >>]
+          steps:
+            - run:
+                name: Build common package
+                command: |
+                  conda activate env
+                  python setup.py clean --all
+                  << parameters.build_env >> MLX_BUILD_STAGE=2 python -m build -w
      - when:
          condition: << parameters.build_env >>
          steps:
            - run:
                name: Upload package
                command: |
-                  source env/bin/activate
+                  conda activate env
                  twine upload dist/*
      - store_artifacts:
          path: dist/
@@ -282,53 +351,101 @@ jobs:
    parameters:
      python_version:
        type: string
-        default: "3.9"
-      extra_env:
+        default: "3.10"
+      build_env:
        type: string
-        default: "DEV_RELEASE=1"
-    docker:
-      - image: ubuntu:20.04
+        default: ""
+    machine:
+      image: ubuntu-2204:current
+      resource_class: large
    steps:
      - checkout
      - run:
          name: Build wheel
          command: |
            PYTHON=python<< parameters.python_version >>
-            apt-get update
-            apt-get upgrade -y
-            DEBIAN_FRONTEND=noninteractive TZ=Etc/UTC apt-get -y install tzdata
-            apt-get install -y apt-utils
-            apt-get install -y software-properties-common
-            add-apt-repository -y ppa:deadsnakes/ppa
-            apt-get install -y $PYTHON $PYTHON-dev $PYTHON-full
-            apt-get install -y libblas-dev liblapack-dev liblapacke-dev
-            apt-get install -y build-essential git
+            export DEBIAN_FRONTEND=noninteractive
+            export NEEDRESTART_MODE=a
+            sudo apt-get update
+            TZ=Etc/UTC sudo apt-get -y install tzdata
+            sudo add-apt-repository -y ppa:deadsnakes/ppa
+            sudo apt-get install -y $PYTHON $PYTHON-dev $PYTHON-full
+            sudo apt-get install -y libblas-dev liblapack-dev liblapacke-dev
            $PYTHON -m venv env
            source env/bin/activate
            pip install --upgrade pip
            pip install --upgrade cmake
-            pip install nanobind==2.4.0
-            pip install --upgrade setuptools
-            pip install numpy
            pip install auditwheel
            pip install patchelf
            pip install build
            pip install twine
-            << parameters.extra_env >> \
-              CMAKE_BUILD_PARALLEL_LEVEL=`nproc` \
-              pip install . -v
+            << parameters.build_env >> pip install ".[dev]" -v
            pip install typing_extensions
-            python setup.py generate_stubs 
-            << parameters.extra_env >> \
-              CMAKE_BUILD_PARALLEL_LEVEL=`nproc` \
-              python -m build --wheel
-            auditwheel show dist/*
-            auditwheel repair dist/* --plat manylinux_2_31_x86_64
+            python setup.py generate_stubs
+            python setup.py clean --all
+            MLX_BUILD_STAGE=1 << parameters.build_env >> python -m build -w
+            bash python/scripts/repair_linux.sh
+      - when:
+          condition:
+            equal: ["3.10", << parameters.python_version >>]
+          steps:
+            - run:
+                name: Build common package
+                command: |
+                  source env/bin/activate
+                  python setup.py clean --all
+                  << parameters.build_env >> MLX_BUILD_STAGE=2 \
+                    python -m build -w
+                  auditwheel repair dist/mlx_cpu*.whl --plat manylinux_2_35_x86_64
+      - when:
+          condition: << parameters.build_env >>
+          steps:
+            - run:
+                name: Upload packages
+                command: |
+                  source env/bin/activate
+                  twine upload wheelhouse/*.whl
+      - store_artifacts:
+          path: wheelhouse/
+
+  build_cuda_release:
+    parameters:
+      build_env:
+        type: string
+        default: ""
+    machine:
+      image: ubuntu-2204:current
+      resource_class: xlarge
+    steps:
+      - checkout
      - run:
-          name: Upload package
+          name: Build wheel
          command: |
-            source env/bin/activate
-            twine upload wheelhouse/*
+            export DEBIAN_FRONTEND=noninteractive
+            export NEEDRESTART_MODE=a
+            wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/x86_64/cuda-keyring_1.1-1_all.deb
+            sudo dpkg -i cuda-keyring_1.1-1_all.deb
+            sudo apt-get update
+            sudo apt-get install -y cuda-toolkit-12-9 libcudnn9-dev-cuda-12
+            sudo apt-get install -y libblas-dev liblapack-dev liblapacke-dev
+            sudo apt-get install -y zip
+            pip install auditwheel
+            pip install patchelf
+            pip install build
+            pip install twine
+            export PATH=/usr/local/cuda/bin${PATH:+:${PATH}}
+            export LD_LIBRARY_PATH=/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
+            << parameters.build_env >> MLX_BUILD_STAGE=2 \
+              CMAKE_ARGS="-DMLX_BUILD_CUDA=ON -DCMAKE_CUDA_COMPILER=`which nvcc`" \
+              python -m build -w
+            bash python/scripts/repair_cuda.sh
+      - when:
+          condition: << parameters.build_env >>
+          steps:
+            - run:
+                name: Upload package
+                command: |
+                  twine upload wheelhouse/*.whl
      - store_artifacts:
          path: wheelhouse/

@@ -340,21 +457,23 @@ workflows:
            pattern: "^(?!pull/)[-\\w]+$"
            value: << pipeline.git.branch >>
        - not: << pipeline.parameters.nightly_build >>
-        - not: << pipeline.parameters.weekly_build >>
        - not: << pipeline.parameters.test_release >>
    jobs:
      - mac_build_and_test:
          matrix:
            parameters:
-              macosx_deployment_target: ["13.5", "14.0"]
+              macosx_deployment_target: ["13.5", "15.0"]
      - linux_build_and_test
+      - cuda_build_and_test:
+          matrix:
+            parameters:
+              image_date: ["2023.11.1", "2025.05.1"]
      - build_documentation 

  build_pypi_release:
    when:
      and:
        - not: << pipeline.parameters.nightly_build >>
-        - not: << pipeline.parameters.weekly_build >>
        - not: << pipeline.parameters.test_release >>
    jobs:
      - build_release:
@@ -365,71 +484,10 @@ workflows:
              ignore: /.*/
          matrix:
            parameters:
-              python_version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
+              python_version: ["3.10", "3.11", "3.12", "3.13", "3.14"]
              macosx_deployment_target: ["13.5", "14.0", "15.0"]
              build_env: ["PYPI_RELEASE=1"]
-              xcode_version: ["16.2.0", "15.0.0"]
-            exclude:
-              - macosx_deployment_target: "13.5"
-                xcode_version: "16.2.0"
-                python_version: "3.9"
-                build_env: "PYPI_RELEASE=1"
-              - macosx_deployment_target: "13.5"
-                xcode_version: "16.2.0"
-                python_version: "3.10"
-                build_env: "PYPI_RELEASE=1"
-              - macosx_deployment_target: "13.5"
-                xcode_version: "16.2.0"
-                python_version: "3.11"
-                build_env: "PYPI_RELEASE=1"
-              - macosx_deployment_target: "13.5"
-                xcode_version: "16.2.0"
-                python_version: "3.12"
-                build_env: "PYPI_RELEASE=1"
-              - macosx_deployment_target: "13.5"
-                xcode_version: "16.2.0"
-                python_version: "3.13"
-                build_env: "PYPI_RELEASE=1"
-              - macosx_deployment_target: "14.0"
-                xcode_version: "15.0.0"
-                python_version: "3.9"
-                build_env: "PYPI_RELEASE=1"
-              - macosx_deployment_target: "14.0"
-                xcode_version: "15.0.0"
-                python_version: "3.10"
-                build_env: "PYPI_RELEASE=1"
-              - macosx_deployment_target: "14.0"
-                xcode_version: "15.0.0"
-                python_version: "3.11"
-                build_env: "PYPI_RELEASE=1"
-              - macosx_deployment_target: "14.0"
-                xcode_version: "15.0.0"
-                python_version: "3.12"
-                build_env: "PYPI_RELEASE=1"
-              - macosx_deployment_target: "14.0"
-                xcode_version: "15.0.0"
-                python_version: "3.13"
-                build_env: "PYPI_RELEASE=1"
-              - macosx_deployment_target: "15.0"
-                xcode_version: "15.0.0"
-                python_version: "3.9"
-                build_env: "PYPI_RELEASE=1"
-              - macosx_deployment_target: "15.0"
-                xcode_version: "15.0.0"
-                python_version: "3.10"
-                build_env: "PYPI_RELEASE=1"
-              - macosx_deployment_target: "15.0"
-                xcode_version: "15.0.0"
-                python_version: "3.11"
-                build_env: "PYPI_RELEASE=1"
-              - macosx_deployment_target: "15.0"
-                xcode_version: "15.0.0"
-                python_version: "3.12"
-                build_env: "PYPI_RELEASE=1"
-              - macosx_deployment_target: "15.0"
-                xcode_version: "15.0.0"
-                python_version: "3.13"
-                build_env: "PYPI_RELEASE=1"
+              xcode_version: ["26.0.0"]
      - build_documentation:
          filters:
            tags:
@@ -437,6 +495,25 @@ workflows:
            branches:
              ignore: /.*/
          upload-docs: true
+      - build_linux_release:
+          filters:
+            tags:
+              only: /^v.*/
+            branches:
+              ignore: /.*/
+          matrix:
+            parameters:
+              python_version: ["3.10", "3.11", "3.12", "3.13", "3.14"]
+              build_env: ["PYPI_RELEASE=1"]
+      - build_cuda_release:
+          filters:
+            tags:
+              only: /^v.*/
+            branches:
+              ignore: /.*/
+          matrix:
+            parameters:
+              build_env: ["PYPI_RELEASE=1"]

  prb:
    when:
@@ -452,9 +529,14 @@ workflows:
          requires: [ hold ]
          matrix:
            parameters:
-              macosx_deployment_target: ["13.5", "14.0"]
+              macosx_deployment_target: ["13.5", "15.0"]
      - linux_build_and_test:
          requires: [ hold ]
+      - cuda_build_and_test:
+          requires: [ hold ]
+          matrix:
+            parameters:
+              image_date: ["2023.11.1", "2025.05.1"]
  nightly_build:
    when:
      and:
@@ -464,137 +546,34 @@ workflows:
      - build_release:
          matrix:
            parameters:
-              python_version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
+              python_version: ["3.10", "3.11", "3.12", "3.13", "3.14"]
              macosx_deployment_target: ["13.5", "14.0", "15.0"]
-              xcode_version: ["16.2.0", "15.0.0"]
-            exclude:
-              - macosx_deployment_target: "13.5"
-                xcode_version: "16.2.0"
-                python_version: "3.9"
-              - macosx_deployment_target: "13.5"
-                xcode_version: "16.2.0"
-                python_version: "3.10"
-              - macosx_deployment_target: "13.5"
-                xcode_version: "16.2.0"
-                python_version: "3.11"
-              - macosx_deployment_target: "13.5"
-                xcode_version: "16.2.0"
-                python_version: "3.12"
-              - macosx_deployment_target: "13.5"
-                xcode_version: "16.2.0"
-                python_version: "3.13"
-              - macosx_deployment_target: "14.0"
-                xcode_version: "15.0.0"
-                python_version: "3.9"
-              - macosx_deployment_target: "14.0"
-                xcode_version: "15.0.0"
-                python_version: "3.10"
-              - macosx_deployment_target: "14.0"
-                xcode_version: "15.0.0"
-                python_version: "3.11"
-              - macosx_deployment_target: "14.0"
-                xcode_version: "15.0.0"
-                python_version: "3.12"
-              - macosx_deployment_target: "14.0"
-                xcode_version: "15.0.0"
-                python_version: "3.13"
-              - macosx_deployment_target: "15.0"
-                xcode_version: "15.0.0"
-                python_version: "3.9"
-              - macosx_deployment_target: "15.0"
-                xcode_version: "15.0.0"
-                python_version: "3.10"
-              - macosx_deployment_target: "15.0"
-                xcode_version: "15.0.0"
-                python_version: "3.11"
-              - macosx_deployment_target: "15.0"
-                xcode_version: "15.0.0"
-                python_version: "3.12"
-              - macosx_deployment_target: "15.0"
-                xcode_version: "15.0.0"
-                python_version: "3.13"
-  weekly_build:
+              xcode_version: ["26.0.0"]
+      - build_linux_release:
+          matrix:
+            parameters:
+              python_version: ["3.10", "3.11", "3.12", "3.13", "3.14"]
+      - build_cuda_release
+
+  build_dev_release:
    when:
      and:
        - equal: [ main, << pipeline.git.branch >> ]
-        - << pipeline.parameters.weekly_build >>
+        - << pipeline.parameters.test_release >>
    jobs:
      - build_release:
          matrix:
            parameters:
-              python_version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
+              python_version: ["3.10", "3.11", "3.12", "3.13", "3.14"]
              macosx_deployment_target: ["13.5", "14.0", "15.0"]
              build_env: ["DEV_RELEASE=1"]
-              xcode_version: ["16.2.0", "15.0.0"]
-            exclude:
-              - macosx_deployment_target: "13.5"
-                xcode_version: "16.2.0"
-                python_version: "3.9"
-                build_env: "DEV_RELEASE=1"
-              - macosx_deployment_target: "13.5"
-                xcode_version: "16.2.0"
-                python_version: "3.10"
-                build_env: "DEV_RELEASE=1"
-              - macosx_deployment_target: "13.5"
-                xcode_version: "16.2.0"
-                python_version: "3.11"
-                build_env: "DEV_RELEASE=1"
-              - macosx_deployment_target: "13.5"
-                xcode_version: "16.2.0"
-                python_version: "3.12"
-                build_env: "DEV_RELEASE=1"
-              - macosx_deployment_target: "13.5"
-                xcode_version: "16.2.0"
-                python_version: "3.13"
-                build_env: "DEV_RELEASE=1"
-              - macosx_deployment_target: "14.0"
-                xcode_version: "15.0.0"
-                python_version: "3.9"
-                build_env: "DEV_RELEASE=1"
-              - macosx_deployment_target: "14.0"
-                xcode_version: "15.0.0"
-                python_version: "3.10"
-                build_env: "DEV_RELEASE=1"
-              - macosx_deployment_target: "14.0"
-                xcode_version: "15.0.0"
-                python_version: "3.11"
-                build_env: "DEV_RELEASE=1"
-              - macosx_deployment_target: "14.0"
-                xcode_version: "15.0.0"
-                python_version: "3.12"
-                build_env: "DEV_RELEASE=1"
-              - macosx_deployment_target: "14.0"
-                xcode_version: "15.0.0"
-                python_version: "3.13"
-                build_env: "DEV_RELEASE=1"
-              - macosx_deployment_target: "15.0"
-                xcode_version: "15.0.0"
-                python_version: "3.9"
-                build_env: "DEV_RELEASE=1"
-              - macosx_deployment_target: "15.0"
-                xcode_version: "15.0.0"
-                python_version: "3.10"
-                build_env: "DEV_RELEASE=1"
-              - macosx_deployment_target: "15.0"
-                xcode_version: "15.0.0"
-                python_version: "3.11"
-                build_env: "DEV_RELEASE=1"
-              - macosx_deployment_target: "15.0"
-                xcode_version: "15.0.0"
-                python_version: "3.12"
-                build_env: "DEV_RELEASE=1"
-              - macosx_deployment_target: "15.0"
-                xcode_version: "15.0.0"
-                python_version: "3.13"
-                build_env: "DEV_RELEASE=1"
-  linux_test_release:
-    when:
-      and:
-        - equal: [ main, << pipeline.git.branch >> ]
-        - << pipeline.parameters.linux_release >>
-    jobs:
+              xcode_version: ["26.0.0"]
      - build_linux_release:
          matrix:
            parameters:
-              python_version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
-              extra_env: ["PYPI_RELEASE=1"]
+              python_version: ["3.10", "3.11", "3.12", "3.13", "3.14"]
+              build_env: ["DEV_RELEASE=1"]
+      - build_cuda_release:
+          matrix:
+            parameters:
+              build_env: ["DEV_RELEASE=1"]
--- a/.gitignore
+++ b/.gitignore
@@ -36,6 +36,7 @@ share/python-wheels/
 .installed.cfg
 *.egg
 MANIFEST
+uv.lock

 # vim
 *.swp
--- a/ACKNOWLEDGMENTS.md
+++ b/ACKNOWLEDGMENTS.md
@@ -19,11 +19,17 @@ MLX was developed with contributions from the following individuals:
 - Gleb Pobudzey: Added the `where` primitive, and groups in 1D and 2D convolutions.
 - Paul Paczuski: Improved stability of BCE loss calculation
 - Max-Heinrich Laves: Added `conv_transpose1d`, `conv_transpose2d`, and `conv_transpose3d` ops.
+- Gökdeniz Gülmez: Added the `Muon (MomentUm Orthogonalized by Newton-schulz)` optimizer, and the `ReLU²` activation function.

 <a href="https://github.com/ml-explore/mlx/graphs/contributors">
  <img class="dark-light" src="https://contrib.rocks/image?repo=ml-explore/mlx&anon=0&columns=20&max=100&r=true" />
 </a>

+# Organizations
+
+MLX has received contributions from the following companies:
+- NVIDIA Corporation & Affiliates
+
 # Third-Party Software

 MLX leverages several third-party software, listed here together with
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -20,12 +20,17 @@ project(
  LANGUAGES C CXX
  VERSION ${MLX_PROJECT_VERSION})

+if(CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang")
+  add_compile_options(-Wall -Wextra)
+endif()
+
 # ----------------------------- Setup -----------------------------
 set(CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake")
-set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD 20)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 set(CMAKE_INSTALL_MESSAGE NEVER)
+set(CMAKE_EXPORT_COMPILE_COMMANDS ON)

 # ----------------------------- Configuration -----------------------------
 option(MLX_BUILD_TESTS "Build tests for mlx" ON)
@@ -34,13 +39,16 @@ option(MLX_BUILD_BENCHMARKS "Build benchmarks for mlx" OFF)
 option(MLX_BUILD_PYTHON_BINDINGS "Build python bindings for mlx" OFF)
 option(MLX_BUILD_METAL "Build metal backend" ON)
 option(MLX_BUILD_CPU "Build cpu backend" ON)
+option(MLX_BUILD_CUDA "Build cuda backend" OFF)
 option(MLX_METAL_DEBUG "Enhance metal debug workflow" OFF)
 option(MLX_ENABLE_X64_MAC "Enable building for x64 macOS" OFF)
 option(MLX_BUILD_GGUF "Include support for GGUF format" ON)
 option(MLX_BUILD_SAFETENSORS "Include support for safetensors format" ON)
 option(MLX_BUILD_BLAS_FROM_SOURCE "Build OpenBLAS from source code" OFF)
 option(MLX_METAL_JIT "Use JIT compilation for Metal kernels" OFF)
+option(MLX_USE_CCACHE "Use CCache for compilation cache when available" ON)
 option(BUILD_SHARED_LIBS "Build mlx as a shared library" OFF)
+option(USE_SYSTEM_FMT "Use system's provided fmt library" OFF)

 # --------------------- Processor tests -------------------------
 message(
@@ -63,10 +71,17 @@ if(${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
      message(WARNING "Building for x86_64 arch is not officially supported.")
    endif()
  endif()
-
 else()
  set(MLX_BUILD_METAL OFF)
-  message(WARNING "MLX is prioritised for Apple silicon systems using macOS.")
+endif()
+
+if(MLX_USE_CCACHE)
+  find_program(CCACHE_PROGRAM ccache)
+  if(CCACHE_PROGRAM)
+    set(CMAKE_C_COMPILER_LAUNCHER "${CCACHE_PROGRAM}")
+    set(CMAKE_CXX_COMPILER_LAUNCHER "${CCACHE_PROGRAM}")
+    set(CMAKE_CUDA_COMPILER_LAUNCHER "${CCACHE_PROGRAM}")
+  endif()
 endif()

 # ----------------------------- Lib -----------------------------
@@ -77,18 +92,21 @@ cmake_policy(SET CMP0135 NEW)

 add_library(mlx)

-if(MLX_BUILD_METAL)
-  set(METAL_LIB "-framework Metal")
-  set(FOUNDATION_LIB "-framework Foundation")
-  set(QUARTZ_LIB "-framework QuartzCore")
+if(MLX_BUILD_CUDA)
+  enable_language(CUDA)
 endif()

-if(MLX_BUILD_METAL AND NOT METAL_LIB)
-  message(STATUS "Metal not found. Unable to build GPU")
-  set(MLX_BUILD_METAL OFF)
-  set(MLX_METAL_DEBUG OFF)
-elseif(MLX_BUILD_METAL)
-  message(STATUS "Building METAL sources")
+if(MLX_BUILD_METAL)
+  find_library(METAL_LIB Metal)
+  find_library(FOUNDATION_LIB Foundation)
+  find_library(QUARTZ_LIB QuartzCore)
+  if(METAL_LIB)
+    message(STATUS "Metal found ${METAL_LIB}")
+  else()
+    message(
+      FATAL_ERROR
+        "Metal not found. Set MLX_BUILD_METAL=OFF to build without GPU")
+  endif()

  if(MLX_METAL_DEBUG)
    add_compile_definitions(MLX_METAL_DEBUG)
@@ -97,7 +115,8 @@ elseif(MLX_BUILD_METAL)
  # Throw an error if xcrun not found
  execute_process(
    COMMAND zsh "-c" "/usr/bin/xcrun -sdk macosx --show-sdk-version"
-    OUTPUT_VARIABLE MACOS_SDK_VERSION COMMAND_ERROR_IS_FATAL ANY)
+    OUTPUT_VARIABLE MACOS_SDK_VERSION
+    OUTPUT_STRIP_TRAILING_WHITESPACE COMMAND_ERROR_IS_FATAL ANY)

  if(${MACOS_SDK_VERSION} LESS 14.0)
    message(
@@ -126,6 +145,12 @@ elseif(MLX_BUILD_METAL)
  target_link_libraries(mlx PUBLIC ${METAL_LIB} ${FOUNDATION_LIB} ${QUARTZ_LIB})
 endif()

+if(CMAKE_SYSTEM_NAME STREQUAL "Linux")
+  # With newer clang/gcc versions following libs are implicitly linked, but when
+  # building on old distributions they need to be explicitly listed.
+  target_link_libraries(mlx PRIVATE dl pthread)
+endif()
+
 if(WIN32)
  if(MSVC)
    # GGUF does not build with MSVC.
@@ -153,7 +178,7 @@ if(MLX_BUILD_CPU)
    message(STATUS "Accelerate found ${ACCELERATE_LIBRARY}")
    set(MLX_BUILD_ACCELERATE ON)
  else()
-    message(STATUS "Accelerate or arm neon not found, using default backend.")
+    message(STATUS "Accelerate not found, using default backend.")
    set(MLX_BUILD_ACCELERATE OFF)
  endif()

@@ -226,12 +251,19 @@ target_include_directories(
  mlx PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_LIST_DIR}>
             $<INSTALL_INTERFACE:include>)

-FetchContent_Declare(
-  fmt
-  GIT_REPOSITORY https://github.com/fmtlib/fmt.git
-  GIT_TAG 10.2.1
-  EXCLUDE_FROM_ALL)
-FetchContent_MakeAvailable(fmt)
+# Do not add mlx_EXPORTS define for shared library.
+set_target_properties(mlx PROPERTIES DEFINE_SYMBOL "")
+
+if(USE_SYSTEM_FMT)
+  find_package(fmt REQUIRED)
+else()
+  FetchContent_Declare(
+    fmt
+    GIT_REPOSITORY https://github.com/fmtlib/fmt.git
+    GIT_TAG 10.2.1
+    EXCLUDE_FROM_ALL)
+  FetchContent_MakeAvailable(fmt)
+endif()
 target_link_libraries(mlx PRIVATE $<BUILD_INTERFACE:fmt::fmt-header-only>)

 if(MLX_BUILD_PYTHON_BINDINGS)
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1,4 +1,6 @@
 include CMakeLists.txt
+include mlx.pc.in
 recursive-include mlx/ *
+include cmake/*
 include python/src/*
 include python/mlx/py.typed # support type hinting as in PEP-561
--- a/README.md
+++ b/README.md
@@ -2,7 +2,7 @@

 [**Quickstart**](#quickstart) | [**Installation**](#installation) |
 [**Documentation**](https://ml-explore.github.io/mlx/build/html/index.html) |
-[**Examples**](#examples) 
+[**Examples**](#examples)

 [![CircleCI](https://circleci.com/gh/ml-explore/mlx.svg?style=svg)](https://circleci.com/gh/ml-explore/mlx)

@@ -11,37 +11,37 @@ brought to you by Apple machine learning research.

 Some key features of MLX include:

- - **Familiar APIs**: MLX has a Python API that closely follows NumPy.  MLX
+- **Familiar APIs**: MLX has a Python API that closely follows NumPy. MLX
   also has fully featured C++, [C](https://github.com/ml-explore/mlx-c), and
   [Swift](https://github.com/ml-explore/mlx-swift/) APIs, which closely mirror
-   the Python API.  MLX has higher-level packages like `mlx.nn` and
+   the Python API. MLX has higher-level packages like `mlx.nn` and
   `mlx.optimizers` with APIs that closely follow PyTorch to simplify building
   more complex models.

- - **Composable function transformations**: MLX supports composable function
-   transformations for automatic differentiation, automatic vectorization,
-   and computation graph optimization.
+- **Composable function transformations**: MLX supports composable function
+  transformations for automatic differentiation, automatic vectorization,
+  and computation graph optimization.

- - **Lazy computation**: Computations in MLX are lazy. Arrays are only
-   materialized when needed.
+- **Lazy computation**: Computations in MLX are lazy. Arrays are only
+  materialized when needed.

- - **Dynamic graph construction**: Computation graphs in MLX are constructed
-   dynamically. Changing the shapes of function arguments does not trigger
-   slow compilations, and debugging is simple and intuitive.
+- **Dynamic graph construction**: Computation graphs in MLX are constructed
+  dynamically. Changing the shapes of function arguments does not trigger
+  slow compilations, and debugging is simple and intuitive.

- - **Multi-device**: Operations can run on any of the supported devices
-   (currently the CPU and the GPU).
+- **Multi-device**: Operations can run on any of the supported devices
+  (currently the CPU and the GPU).

- - **Unified memory**: A notable difference from MLX and other frameworks
-   is the *unified memory model*. Arrays in MLX live in shared memory.
-   Operations on MLX arrays can be performed on any of the supported
-   device types without transferring data.
+- **Unified memory**: A notable difference from MLX and other frameworks
+  is the *unified memory model*. Arrays in MLX live in shared memory.
+  Operations on MLX arrays can be performed on any of the supported
+  device types without transferring data.

 MLX is designed by machine learning researchers for machine learning
 researchers. The framework is intended to be user-friendly, but still efficient
 to train and deploy models. The design of the framework itself is also
 conceptually simple. We intend to make it easy for researchers to extend and
-improve MLX with the goal of quickly exploring new ideas. 
+improve MLX with the goal of quickly exploring new ideas.

 The design of MLX is inspired by frameworks like
 [NumPy](https://numpy.org/doc/stable/index.html),
@@ -68,25 +68,30 @@ in the documentation.

 ## Installation

-MLX is available on [PyPI](https://pypi.org/project/mlx/). To install the Python API, run:
+MLX is available on [PyPI](https://pypi.org/project/mlx/). To install MLX on
+macOS, run:

-**With `pip`**:
-
-```
+```bash
 pip install mlx
 ```

-**With `conda`**:
+To install the CUDA backend on Linux, run:

+```bash
+pip install "mlx[cuda]"
 ```
-conda install -c conda-forge mlx
+
+To install a CPU-only Linux package, run:
+
+```bash
+pip install "mlx[cpu]"
 ```

 Checkout the
 [documentation](https://ml-explore.github.io/mlx/build/html/install.html#)
 for more information on building the C++ and Python APIs from source.

-## Contributing 
+## Contributing

 Check out the [contribution guidelines](https://github.com/ml-explore/mlx/tree/main/CONTRIBUTING.md) for more information
 on contributing to MLX. See the
@@ -105,7 +110,7 @@ Hannun, Jagrit Digani, Angelos Katharopoulos, and Ronan Collobert. If you find
 MLX useful in your research and wish to cite it, please use the following
 BibTex entry:

-```
+```text
@software{mlx2023,
  author = {Awni Hannun and Jagrit Digani and Angelos Katharopoulos and Ronan Collobert},
  title = {{MLX}: Efficient and flexible machine learning on Apple silicon},
--- a/benchmarks/cpp/irregular_strides.cpp
+++ b/benchmarks/cpp/irregular_strides.cpp
@@ -1,5 +1,6 @@
 // Copyright © 2023 Apple Inc.

+#include <cstring>
 #include <iostream>
 #include <sstream>

--- a/benchmarks/cpp/single_ops.cpp
+++ b/benchmarks/cpp/single_ops.cpp
@@ -192,6 +192,22 @@ void time_reductions() {

  auto argmin_along_1 = [&a]() { return mx::argmin(a, 1, false); };
  TIME(argmin_along_1);
+
+  auto indices = mx::array({1});
+  auto updates = mx::reshape(mx::array({NAN}), {1, 1, 1});
+  std::vector<int> axes{0};
+  auto b = scatter(a, {indices}, updates, axes);
+  mx::eval(b);
+
+  auto max_along_0 = [&b]() { return mx::max(b, 0, false); };
+  TIME(max_along_0);
+  auto max_along_1 = [&b]() { return mx::max(b, 1, false); };
+  TIME(max_along_1);
+
+  auto min_along_0 = [&b]() { return mx::min(b, 0, false); };
+  TIME(min_along_0);
+  auto min_along_1 = [&b]() { return mx::min(b, 1, false); };
+  TIME(min_along_1);
 }

 void time_gather_scatter() {
--- a/benchmarks/python/blas/bench_gemm.py
+++ b/benchmarks/python/blas/bench_gemm.py
@@ -142,9 +142,7 @@ def bench_shape(B, M, N, K, np_dtype, transpose="nn"):
    t_b = (0, 1, 2) if transpose[1] == "n" else (0, 2, 1)

    c_mlx = a_mx.transpose(t_a) @ b_mx.transpose(t_b)
-    c_npy = a_np.transpose(t_a).astype(np.float32) @ b_np.transpose(t_b).astype(
-        np.float32
-    )
+    c_npy = a_np.transpose(t_a).astype(np_dtype) @ b_np.transpose(t_b).astype(np_dtype)

    atol = 1e-5 if np_dtype == np.float32 else 1e-4

@@ -163,7 +161,7 @@ def get_gflop_count(B, M, N, K):
 if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Run gemm benchmarks")

-    dtypes = ("float32", "float16")
+    dtypes = ("float32", "float16", "complex64")
    transposes = ("nn", "nt", "tn")
    shapes = (
        (16, 234, 768, 3072),
@@ -187,7 +185,7 @@ if __name__ == "__main__":
                diff = gflops_mx / gflops_pt - 1.0

                print(
-                    f"{B:3d}, {M:4d}, {N:4d}, {K:4d}, {dtype}, {transpose}, {gflops_pt:05.3f}, {gflops_mx:05.3f}, {100. * diff:+5.2f}%"
+                    f"{B:3d}, {M:4d}, {N:4d}, {K:4d}, {dtype}, {transpose}, {gflops_pt:05.3f}, {gflops_mx:05.3f}, {100.0 * diff:+5.2f}%"
                )
                if gflops_pt >= 2.0 * gflops_mx:
                    print("ATTENTION ^^^^^^^")
--- a/benchmarks/python/blas/bench_gemv.py
+++ b/benchmarks/python/blas/bench_gemv.py
@@ -196,7 +196,7 @@ def bench_with_out_len(ax, out_vec_len, in_vector_lens, dtype, transpose):


 for transpose in (False, True):
-    for dtype in ("float32", "float16"):
+    for dtype in ("float32", "float16", "complex64"):
        fig, axs = plt.subplots(
            len(in_vec_sizes), 2, figsize=(8.5, 11), layout="constrained"
        )
@@ -215,7 +215,7 @@ for transpose in (False, True):
        fig.suptitle(f"{device_name}: {dtype} {op_name}")
        fig.savefig(
            os.path.join(
-                results_dir, f'{device_name.replace(" ", "_")}_{dtype}_{op_name}.pdf'
+                results_dir, f"{device_name.replace(' ', '_')}_{dtype}_{op_name}.pdf"
            )
        )
        plt.close(fig)
--- a/benchmarks/python/comparative/bench_torch.py
+++ b/benchmarks/python/comparative/bench_torch.py
@@ -5,6 +5,7 @@ import os
 import time

 import torch
+import torch.cuda
 import torch.mps


@@ -44,8 +45,10 @@ def bench(f, *args):


 def sync_if_needed(x):
-    if x.device != torch.device("cpu"):
+    if x.device.type == "mps":  # compare type: tensor devices carry an index ("mps:0")
         torch.mps.synchronize()
+    elif x.device.type == "cuda":  # torch.device("cuda:0") != torch.device("cuda")
+        torch.cuda.synchronize()


@torch.no_grad()
@@ -99,6 +102,14 @@ def reduction(op, axis, x):
    sync_if_needed(x)


+@torch.no_grad()
+def sum_and_add(axis, x, y):
+    z = x.sum(axis=axis, keepdims=True)
+    for i in range(50):
+        z = (z + y).sum(axis=axis, keepdims=True)
+    sync_if_needed(x)
+
+
@torch.no_grad()
 def softmax(axis, x):
    ys = []
@@ -340,7 +351,11 @@ if __name__ == "__main__":
        args.axis.pop(0)

    torch.set_num_threads(1)
-    device = "cpu" if args.cpu else "mps"
+    device = "mps"
+    if torch.cuda.is_available():
+        device = "cuda"
+    if args.cpu:
+        device = "cpu"

    types = args.dtype
    if not types:
@@ -460,5 +475,8 @@ if __name__ == "__main__":
    elif args.benchmark == "selu":
        print(bench(selu, x))

+    elif args.benchmark == "sum_and_add":
+        print(bench(sum_and_add, axis, *xs))
+
    else:
        raise ValueError(f"Unknown benchmark `{args.benchmark}`.")
--- a/benchmarks/python/conv_unaligned_bench.py
+++ b/benchmarks/python/conv_unaligned_bench.py
@@ -0,0 +1,107 @@
+import math
+import time
+
+import mlx.core as mx
+import numpy as np
+import torch
+
+N_warmup = 10
+N_iter_bench = 100
+N_iter_func = 5
+
+
+def bench(f, a, b):
+    for i in range(N_warmup):
+        f(a, b)
+    torch.mps.synchronize()
+
+    s = time.perf_counter_ns()
+    for i in range(N_iter_bench):
+        f(a, b)
+    e = time.perf_counter_ns()
+    return (e - s) * 1e-9
+
+
+def make_mx_conv_2D(strides=(1, 1), padding=(0, 0), groups=1):
+    def mx_conv_2D(a, b):
+        ys = []
+        for i in range(N_iter_func):
+            y = mx.conv2d(a, b, stride=strides, padding=padding, groups=groups)
+            ys.append(y)
+        mx.eval(ys)
+        return ys
+
+    return mx_conv_2D
+
+
+def make_pt_conv_2D(strides=(1, 1), padding=(0, 0), groups=1):
+    @torch.no_grad()
+    def pt_conv_2D(a, b):
+        ys = []
+        for i in range(N_iter_func):
+            y = torch.conv2d(a, b, stride=strides, padding=padding, groups=groups)
+            ys.append(y)
+        torch.mps.synchronize()
+        return ys
+
+    return pt_conv_2D
+
+
+def bench_shape(N, H, W, C, kH, kW, O, strides, padding, groups, np_dtype):
+    """Time MLX vs. PyTorch conv2d for one shape; returns (time_mlx, time_torch)."""
+    scale = 1.0 / math.sqrt(kH * kW * C)  # fan-in: kernel area x input channels
+    a_np = np.random.uniform(0, 0.5, (N, H, W, C)).astype(np_dtype)
+    w_shape = (O, kH, kW, C // groups)  # each filter sees C / groups channels
+    b_np = np.random.uniform(-scale, scale, w_shape).astype(np_dtype)
+
+    a_mx = mx.array(a_np)
+    b_mx = mx.array(b_np)
+
+    a_pt = torch.from_numpy(a_np.transpose((0, 3, 1, 2))).to("mps")
+    b_pt = torch.from_numpy(b_np.transpose((0, 3, 1, 2))).to("mps")
+
+    torch.mps.synchronize()
+
+    f_mx = make_mx_conv_2D(strides, padding, groups)
+    f_pt = make_pt_conv_2D(strides, padding, groups)
+
+    time_torch = bench(f_pt, a_pt, b_pt)
+    time_mlx = bench(f_mx, a_mx, b_mx)
+
+    out_mx = mx.conv2d(a_mx, b_mx, stride=strides, padding=padding, groups=groups)
+    out_pt = torch.conv2d(
+        a_pt.to("cpu"), b_pt.to("cpu"), stride=strides, padding=padding, groups=groups
+    )
+    out_pt = torch.permute(out_pt, (0, 2, 3, 1))
+    out_pt = out_pt.numpy(force=True)
+
+    atol = 2e-5 if np.dtype(np_dtype) == np.float32 else 1e-4  # np_dtype may be a str
+
+    if not np.allclose(out_pt, out_mx, atol=atol):
+        print(
+            f"Failed at {(N, H, W, C)}, {(O, kH, kW, C)} [strides = {strides}, padding = {padding}, groups = {groups}] with max(|a - b|) = {np.max(np.abs(out_pt - out_mx))}"
+        )
+
+    return time_mlx, time_torch
+
+
+if __name__ == "__main__":
+    dtype = "float32"
+    shapes = (
+        (4, 32, 32, 21, 3, 3, 128),
+        (4, 32, 32, 21, 3, 3, 37),
+        (4, 32, 32, 370, 3, 3, 370),
+        (4, 32, 32, 370, 7, 7, 128),
+        (2, 320, 640, 21, 7, 7, 21),
+    )
+    for N, H, W, C, kh, kw, O in shapes:
+        time_mlx, time_torch = bench_shape(
+            N, H, W, C, kh, kw, O, (1, 1), (0, 0), 1, dtype
+        )
+        diff = time_torch / time_mlx - 1.0
+
+        print(
+            f"({N}, {H:3d}, {W:3d}, {C:3d}), ({O:3d}, {kh:2d}, {kw:2d}, {C:3d}), {dtype}, {100. * diff:+5.2f}%"
+        )
+        if time_mlx >= 2.0 * time_torch:
+            print("ATTENTION ^^^^^^^")
--- a/benchmarks/python/layer_norm_bench.py
+++ b/benchmarks/python/layer_norm_bench.py
@@ -1,5 +1,7 @@
 # Copyright © 2023-2024 Apple Inc.

+from functools import partial
+
 import mlx.core as mx
 import mlx.nn as nn
 from time_utils import time_fn
@@ -18,51 +20,63 @@ def layer_norm(x, w, b, eps):
    return y


-def time_layer_norm():
+def time_layer_norm(N, dt):
+    L = 1024
    f1 = lambda x, w, b, y: (layer_norm(x, w, b, 1e-5) * y).sum()
    f2 = lambda x, w, b, y: (mx.fast.layer_norm(x, w, b, 1e-5) * y).sum()
    g1 = mx.grad(f1, argnums=(0, 1, 2))
    g2 = mx.grad(f2, argnums=(0, 1, 2))

-    x = mx.random.uniform(shape=(8, 1024, 4096)).astype(mx.float16)
-    w = mx.random.uniform(shape=(4096,)).astype(mx.float16)
-    b = mx.random.uniform(shape=(4096,)).astype(mx.float16)
-    y = mx.random.uniform(shape=(8, 1024, 4096)).astype(mx.float16)
+    x = mx.random.uniform(shape=(8, L, N)).astype(dt)
+    w = mx.random.uniform(shape=(N,)).astype(dt)
+    b = mx.random.uniform(shape=(N,)).astype(dt)
+    y = mx.random.uniform(shape=(8, L, N)).astype(dt)
    mx.eval(x, w, b, y)

-    def layer_norm_loop(g, x, w, b):
+    def layer_norm_loop(f, x, w, b):
+        for _ in range(32):
+            x = f(x, w, b)
+        return x
+
+    time_fn(layer_norm_loop, partial(layer_norm, eps=1e-5), x, w, b)
+    time_fn(layer_norm_loop, partial(mx.fast.layer_norm, eps=1e-5), x, w, b)
+
+    def layer_norm_grad_loop(g, x, w, b):
        gx, gw, gb = x, w, b
        for _ in range(32):
            gx, gw, gb = g(gx, gw, gb, y)
        return gx, gw, gb

-    time_fn(layer_norm_loop, g1, x, w, b)
-    time_fn(layer_norm_loop, g2, x, w, b)
-    time_fn(layer_norm_loop, mx.compile(g1), x, w, b)
-    time_fn(layer_norm_loop, mx.compile(g2), x, w, b)
+    time_fn(layer_norm_grad_loop, g1, x, w, b)
+    time_fn(layer_norm_grad_loop, g2, x, w, b)
+    time_fn(layer_norm_grad_loop, mx.compile(g1), x, w, b)
+    time_fn(layer_norm_grad_loop, mx.compile(g2), x, w, b)

    f1 = lambda x, y: (layer_norm(x, None, None, 1e-5) * y).sum()
    f2 = lambda x, y: (mx.fast.layer_norm(x, None, None, 1e-5) * y).sum()
    g1 = mx.grad(f1, argnums=(0,))
    g2 = mx.grad(f2, argnums=(0,))

-    x = mx.random.uniform(shape=(8, 1024, 4096)).astype(mx.float16)
-    w = mx.random.uniform(shape=(4096,)).astype(mx.float16)
-    b = mx.random.uniform(shape=(4096,)).astype(mx.float16)
-    y = mx.random.uniform(shape=(8, 1024, 4096)).astype(mx.float16)
+    x = mx.random.uniform(shape=(8, L, N)).astype(dt)
+    w = mx.random.uniform(shape=(N,)).astype(dt)
+    b = mx.random.uniform(shape=(N,)).astype(dt)
+    y = mx.random.uniform(shape=(8, L, N)).astype(dt)
    mx.eval(x, w, b, y)

-    def layer_norm_loop(g, x):
+    def layer_norm_grad_x_loop(g, x):
        gx = x
        for _ in range(32):
            gx = g(gx, y)
        return gx

-    time_fn(layer_norm_loop, g1, x)
-    time_fn(layer_norm_loop, g2, x)
-    time_fn(layer_norm_loop, mx.compile(g1), x)
-    time_fn(layer_norm_loop, mx.compile(g2), x)
+    time_fn(layer_norm_grad_x_loop, g1, x)
+    time_fn(layer_norm_grad_x_loop, g2, x)
+    time_fn(layer_norm_grad_x_loop, mx.compile(g1), x)
+    time_fn(layer_norm_grad_x_loop, mx.compile(g2), x)


 if __name__ == "__main__":
-    time_layer_norm()
+    for dt in [mx.float32, mx.float16, mx.bfloat16]:
+        for n in [1024, 2048, 4096, 8192, 8192 + 1024]:
+            print(dt, n)
+            time_layer_norm(n, dt)
--- a/benchmarks/python/single_ops.py
+++ b/benchmarks/python/single_ops.py
@@ -51,6 +51,20 @@ def time_maximum():
    time_fn(mx.maximum, a, b)


+def time_max():
+    a = mx.random.uniform(shape=(32, 1024, 1024))
+    a[1, 1] = mx.nan
+    mx.eval(a)
+    time_fn(mx.max, a, 0)
+
+
+def time_min():
+    a = mx.random.uniform(shape=(32, 1024, 1024))
+    a[1, 1] = mx.nan
+    mx.eval(a)
+    time_fn(mx.min, a, 0)
+
+
 def time_negative():
    a = mx.random.uniform(shape=(10000, 1000))
    mx.eval(a)
@@ -108,6 +122,8 @@ if __name__ == "__main__":

    time_add()
    time_matmul()
+    time_min()
+    time_max()
    time_maximum()
    time_exp()
    time_negative()
--- a/cmake/FindNCCL.cmake
+++ b/cmake/FindNCCL.cmake
@@ -0,0 +1,54 @@
+# FindNCCL.cmake This module finds the NVIDIA NCCL library and its include
+# directories.
+
+set(NCCL_ROOT_DIR
+    $ENV{NCCL_ROOT_DIR}
+    CACHE PATH "Folder contains NVIDIA NCCL")
+
+find_path(
+  NCCL_INCLUDE_DIRS
+  NAMES nccl.h
+  HINTS ${NCCL_INCLUDE_DIR} ${NCCL_ROOT_DIR} ${NCCL_ROOT_DIR}/include
+        ${CUDA_TOOLKIT_ROOT_DIR}/include)
+
+if($ENV{USE_STATIC_NCCL})
+  message(
+    STATUS "USE_STATIC_NCCL detected. Linking against static NCCL library")
+  set(NCCL_LIBNAME "libnccl_static.a")
+else()
+  set(NCCL_LIBNAME "nccl")
+endif()
+
+find_library(
+  NCCL_LIBRARIES
+  NAMES ${NCCL_LIBNAME}
+  HINTS ${NCCL_LIB_DIR}
+        ${NCCL_ROOT_DIR}
+        ${NCCL_ROOT_DIR}/lib
+        ${NCCL_ROOT_DIR}/lib/x86_64-linux-gnu
+        ${NCCL_ROOT_DIR}/lib64
+        ${CUDA_TOOLKIT_ROOT_DIR}/lib
+        ${CUDA_TOOLKIT_ROOT_DIR}/lib64)
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(NCCL DEFAULT_MSG NCCL_INCLUDE_DIRS
+                                  NCCL_LIBRARIES)
+
+if(NCCL_FOUND)
+  set(NCCL_HEADER_FILE "${NCCL_INCLUDE_DIRS}/nccl.h")
+  message(
+    STATUS "Determining NCCL version from the header file: ${NCCL_HEADER_FILE}")
+  file(
+    STRINGS ${NCCL_HEADER_FILE} NCCL_MAJOR_VERSION_DEFINED
+    REGEX "^[ \t]*#define[ \t]+NCCL_MAJOR[ \t]+[0-9]+.*$"
+    LIMIT_COUNT 1)
+  if(NCCL_MAJOR_VERSION_DEFINED)
+    string(REGEX REPLACE "^[ \t]*#define[ \t]+NCCL_MAJOR[ \t]+" ""
+                         NCCL_MAJOR_VERSION ${NCCL_MAJOR_VERSION_DEFINED})
+    message(STATUS "NCCL_MAJOR_VERSION: ${NCCL_MAJOR_VERSION}")
+  endif()
+  message(
+    STATUS
+      "Found NCCL (include: ${NCCL_INCLUDE_DIRS}, library: ${NCCL_LIBRARIES})")
+  mark_as_advanced(NCCL_ROOT_DIR NCCL_INCLUDE_DIRS NCCL_LIBRARIES)
+endif()
--- a/cmake/extension.cmake
+++ b/cmake/extension.cmake
@@ -11,13 +11,14 @@ include(CMakeParseArguments)
 # Args: TARGET: Custom target to be added for the metal library TITLE: Name of
 # the .metallib OUTPUT_DIRECTORY: Where to place ${TITLE}.metallib SOURCES: List
 # of source files INCLUDE_DIRS: List of include dirs DEPS: List of dependency
-# files (like headers)
+# files (like headers) DEBUG: Boolean, if true, enables debug compile options
+# for this specific library. If not provided, uses global MLX_METAL_DEBUG.
 #
 # clang format on

 macro(mlx_build_metallib)
  # Parse args
-  set(oneValueArgs TARGET TITLE OUTPUT_DIRECTORY)
+  set(oneValueArgs TARGET TITLE OUTPUT_DIRECTORY DEBUG)
  set(multiValueArgs SOURCES INCLUDE_DIRS DEPS)
  cmake_parse_arguments(MTLLIB "" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})

@@ -26,6 +27,10 @@ macro(mlx_build_metallib)

  # Collect compile options
  set(MTLLIB_COMPILE_OPTIONS -Wall -Wextra -fno-fast-math -Wno-c++17-extensions)
+  if(MLX_METAL_DEBUG OR MTLLIB_DEBUG)
+    set(MTLLIB_COMPILE_OPTIONS ${MTLLIB_COMPILE_OPTIONS} -gline-tables-only
+                               -frecord-sources)
+  endif()

  # Prepare metallib build command
  add_custom_command(
--- a/docs/requirements.txt
+++ b/docs/requirements.txt
@@ -1,4 +1,5 @@
 sphinx
 breathe
 sphinx-book-theme
+sphinx-copybutton
 mlx
--- a/docs/src/conf.py
+++ b/docs/src/conf.py
@@ -10,7 +10,7 @@ import mlx.core as mx
 # -- Project information -----------------------------------------------------

 project = "MLX"
-copyright = "2023, MLX Contributors"
+copyright = "2023, Apple"
 author = "MLX Contributors"
 version = ".".join(mx.__version__.split(".")[:3])
 release = version
@@ -18,6 +18,7 @@ release = version
 # -- General configuration ---------------------------------------------------

 extensions = [
+    "sphinx_copybutton",
    "sphinx.ext.autodoc",
    "sphinx.ext.autosummary",
    "sphinx.ext.intersphinx",
--- a/docs/src/dev/custom_metal_kernels.rst
+++ b/docs/src/dev/custom_metal_kernels.rst
@@ -8,23 +8,26 @@ MLX supports writing custom Metal kernels through the Python and C++ APIs.
 Simple Example
 --------------

+.. currentmodule:: mlx.core
+
 Let's write a custom kernel that computes ``exp`` elementwise:

 .. code-block:: python

-  def exp_elementwise(a: mx.array):
-      source = """
-          uint elem = thread_position_in_grid.x;
-          T tmp = inp[elem];
-          out[elem] = metal::exp(tmp);
-      """
+  source = """
+      uint elem = thread_position_in_grid.x;
+      T tmp = inp[elem];
+      out[elem] = metal::exp(tmp);
+  """

-      kernel = mx.fast.metal_kernel(
-          name="myexp",
-          input_names=["inp"],
-          output_names=["out"],
-          source=source,
-      )
+  kernel = mx.fast.metal_kernel(
+      name="myexp",
+      input_names=["inp"],
+      output_names=["out"],
+      source=source,
+  )
+
+  def exp_elementwise(a: mx.array):
      outputs = kernel(
          inputs=[a],
          template=[("T", mx.float32)],
@@ -39,8 +42,13 @@ Let's write a custom kernel that computes ``exp`` elementwise:
  b = exp_elementwise(a)
  assert mx.allclose(b, mx.exp(a))

+Every time you make a kernel, a new Metal library is created and possibly
+JIT compiled. To reduce the overhead from that, build the kernel once with
+:func:`fast.metal_kernel` and then use it many times.
+
 .. note::
-    We are only required to pass the body of the Metal kernel in ``source``.
+   Only pass the body of the Metal kernel in ``source``. The function
+   signature is generated automatically.

 The full function signature will be generated using:

@@ -78,44 +86,52 @@ Putting this all together, the generated function signature for ``myexp`` is as

  template [[host_name("custom_kernel_myexp_float")]] [[kernel]] decltype(custom_kernel_myexp_float<float>) custom_kernel_myexp_float<float>;

-Note: ``grid`` and ``threadgroup`` are parameters to the Metal `dispatchThreads <https://developer.apple.com/documentation/metal/mtlcomputecommandencoder/2866532-dispatchthreads>`_ function.
-This means we will launch ``mx.prod(grid)`` threads, subdivided into ``threadgroup`` size threadgroups.
-For optimal performance, each thread group dimension should be less than or equal to the corresponding grid dimension.
+Note: ``grid`` and ``threadgroup`` are parameters to the Metal `dispatchThreads
+<https://developer.apple.com/documentation/metal/mtlcomputecommandencoder/2866532-dispatchthreads>`_
+function. This means we will launch ``mx.prod(grid)`` threads, subdivided into
+``threadgroup`` size threadgroups.  For optimal performance, each thread group
+dimension should be less than or equal to the corresponding grid dimension.

-Passing ``verbose=True`` to ``mx.fast.metal_kernel.__call__`` will print the generated code for debugging purposes.
+Passing ``verbose=True`` to :func:`fast.metal_kernel.__call__` will print the
+generated code for debugging purposes.

 Using Shape/Strides
 -------------------

-``mx.fast.metal_kernel`` supports an argument ``ensure_row_contiguous`` which is ``True`` by default.
-This will copy the ``mx.array`` inputs if needed before the kernel is launched to ensure that the memory layout is row contiguous.
-Generally this makes writing the kernel easier, since we don't have to worry about gaps or the ordering of the dims
-when indexing.
+:func:`fast.metal_kernel` supports an argument ``ensure_row_contiguous`` which
+is ``True`` by default. This will copy the array inputs if needed
+before the kernel is launched to ensure that the memory layout is row
+contiguous.  Generally this makes writing the kernel easier, since we don't
+have to worry about gaps or the ordering of the dims when indexing.

-If we want to avoid this copy, ``metal_kernel`` automatically passes ``a_shape``, ``a_strides`` and ``a_ndim`` for each
-input array ``a`` if any are present in ``source``.
-We can then use MLX's built in indexing utils to fetch the right elements for each thread.
+If we want to avoid this copy, :func:`fast.metal_kernel` automatically passes
+``a_shape``, ``a_strides`` and ``a_ndim`` for each input array ``a`` if any are
+present in ``source``. We can then use MLX's built in indexing utils to fetch
+the right elements for each thread.

-Let's convert ``myexp`` above to support arbitrarily strided arrays without relying on a copy from ``ensure_row_contiguous``:
+Let's convert ``myexp`` above to support arbitrarily strided arrays without
+relying on a copy from ``ensure_row_contiguous``:

 .. code-block:: python
+
+  source = """
+      uint elem = thread_position_in_grid.x;
+      // Utils from `mlx/backend/metal/kernels/utils.h` are automatically included
+      uint loc = elem_to_loc(elem, inp_shape, inp_strides, inp_ndim);
+      T tmp = inp[loc];
+      // Output arrays are always row contiguous
+      out[elem] = metal::exp(tmp);
+  """
+
+  kernel = mx.fast.metal_kernel(
+      name="myexp_strided",
+      input_names=["inp"],
+      output_names=["out"],
+      source=source,
+      ensure_row_contiguous=False,
+  )

  def exp_elementwise(a: mx.array):
-      source = """
-          uint elem = thread_position_in_grid.x;
-          // Utils from `mlx/backend/metal/kernels/utils.h` are automatically included
-          uint loc = elem_to_loc(elem, inp_shape, inp_strides, inp_ndim);
-          T tmp = inp[loc];
-          // Output arrays are always row contiguous
-          out[elem] = metal::exp(tmp);
-      """
-
-      kernel = mx.fast.metal_kernel(
-          name="myexp_strided",
-          input_names=["inp"],
-          output_names=["out"],
-          source=source
-      )
      outputs = kernel(
          inputs=[a],
          template=[("T", mx.float32)],
@@ -123,7 +139,6 @@ Let's convert ``myexp`` above to support arbitrarily strided arrays without rely
          threadgroup=(256, 1, 1),
          output_shapes=[a.shape],
          output_dtypes=[a.dtype],
-          ensure_row_contiguous=False,
      )
      return outputs[0]

@@ -142,137 +157,139 @@ We'll start with the following MLX implementation using standard ops:

 .. code-block:: python

-    def grid_sample_ref(x, grid):
-        N, H_in, W_in, _ = x.shape
-        ix = ((grid[..., 0] + 1) * W_in - 1) / 2
-        iy = ((grid[..., 1] + 1) * H_in - 1) / 2
+  def grid_sample_ref(x, grid):
+      N, H_in, W_in, _ = x.shape
+      ix = ((grid[..., 0] + 1) * W_in - 1) / 2
+      iy = ((grid[..., 1] + 1) * H_in - 1) / 2

-        ix_nw = mx.floor(ix).astype(mx.int32)
-        iy_nw = mx.floor(iy).astype(mx.int32)
+      ix_nw = mx.floor(ix).astype(mx.int32)
+      iy_nw = mx.floor(iy).astype(mx.int32)

-        ix_ne = ix_nw + 1
-        iy_ne = iy_nw
+      ix_ne = ix_nw + 1
+      iy_ne = iy_nw

-        ix_sw = ix_nw
-        iy_sw = iy_nw + 1
+      ix_sw = ix_nw
+      iy_sw = iy_nw + 1

-        ix_se = ix_nw + 1
-        iy_se = iy_nw + 1
+      ix_se = ix_nw + 1
+      iy_se = iy_nw + 1

-        nw = (ix_se - ix)    * (iy_se - iy)
-        ne = (ix    - ix_sw) * (iy_sw - iy)
-        sw = (ix_ne - ix)    * (iy    - iy_ne)
-        se = (ix    - ix_nw) * (iy    - iy_nw)
+      nw = (ix_se - ix)    * (iy_se - iy)
+      ne = (ix    - ix_sw) * (iy_sw - iy)
+      sw = (ix_ne - ix)    * (iy    - iy_ne)
+      se = (ix    - ix_nw) * (iy    - iy_nw)

-        I_nw = x[mx.arange(N)[:, None, None], iy_nw, ix_nw, :]
-        I_ne = x[mx.arange(N)[:, None, None], iy_ne, ix_ne, :]
-        I_sw = x[mx.arange(N)[:, None, None], iy_sw, ix_sw, :]
-        I_se = x[mx.arange(N)[:, None, None], iy_se, ix_se, :]
+      I_nw = x[mx.arange(N)[:, None, None], iy_nw, ix_nw, :]
+      I_ne = x[mx.arange(N)[:, None, None], iy_ne, ix_ne, :]
+      I_sw = x[mx.arange(N)[:, None, None], iy_sw, ix_sw, :]
+      I_se = x[mx.arange(N)[:, None, None], iy_se, ix_se, :]

-        mask_nw = (iy_nw >= 0) & (iy_nw <= H_in - 1) & (ix_nw >= 0) & (ix_nw <= W_in - 1)
-        mask_ne = (iy_ne >= 0) & (iy_ne <= H_in - 1) & (ix_ne >= 0) & (ix_ne <= W_in - 1)
-        mask_sw = (iy_sw >= 0) & (iy_sw <= H_in - 1) & (ix_sw >= 0) & (ix_sw <= W_in - 1)
-        mask_se = (iy_se >= 0) & (iy_se <= H_in - 1) & (ix_se >= 0) & (ix_se <= W_in - 1)
+      mask_nw = (iy_nw >= 0) & (iy_nw <= H_in - 1) & (ix_nw >= 0) & (ix_nw <= W_in - 1)
+      mask_ne = (iy_ne >= 0) & (iy_ne <= H_in - 1) & (ix_ne >= 0) & (ix_ne <= W_in - 1)
+      mask_sw = (iy_sw >= 0) & (iy_sw <= H_in - 1) & (ix_sw >= 0) & (ix_sw <= W_in - 1)
+      mask_se = (iy_se >= 0) & (iy_se <= H_in - 1) & (ix_se >= 0) & (ix_se <= W_in - 1)

-        I_nw *= mask_nw[..., None]
-        I_ne *= mask_ne[..., None]
-        I_sw *= mask_sw[..., None]
-        I_se *= mask_se[..., None]
+      I_nw *= mask_nw[..., None]
+      I_ne *= mask_ne[..., None]
+      I_sw *= mask_sw[..., None]
+      I_se *= mask_se[..., None]

-        output = nw[..., None] * I_nw + ne[..., None] * I_ne + sw[..., None] * I_sw + se[..., None] * I_se
+      output = nw[..., None] * I_nw + ne[..., None] * I_ne + sw[..., None] * I_sw + se[..., None] * I_se

-        return output
+      return output

-Now let's use ``mx.custom_function`` together with ``mx.fast.metal_kernel``
+Now let's use :func:`custom_function` together with :func:`fast.metal_kernel`
 to write a fast GPU kernel for both the forward and backward passes.

 First we'll implement the forward pass as a fused kernel:

 .. code-block:: python

-    @mx.custom_function
-    def grid_sample(x, grid):
+  source = """
+      uint elem = thread_position_in_grid.x;
+      int H = x_shape[1];
+      int W = x_shape[2];
+      int C = x_shape[3];
+      int gH = grid_shape[1];
+      int gW = grid_shape[2];

-        assert x.ndim == 4, "`x` must be 4D."
-        assert grid.ndim == 4, "`grid` must be 4D."
+      int w_stride = C;
+      int h_stride = W * w_stride;
+      int b_stride = H * h_stride;

-        B, _, _, C = x.shape
-        _, gN, gM, D = grid.shape
-        out_shape = (B, gN, gM, C)
+      uint grid_idx = elem / C * 2;
+      float ix = ((grid[grid_idx] + 1) * W - 1) / 2;
+      float iy = ((grid[grid_idx + 1] + 1) * H - 1) / 2;

-        assert D == 2, "Last dim of `grid` must be size 2."
+      int ix_nw = floor(ix);
+      int iy_nw = floor(iy);

-        source = """
-            uint elem = thread_position_in_grid.x;
-            int H = x_shape[1];
-            int W = x_shape[2];
-            int C = x_shape[3];
-            int gH = grid_shape[1];
-            int gW = grid_shape[2];
+      int ix_ne = ix_nw + 1;
+      int iy_ne = iy_nw;

-            int w_stride = C;
-            int h_stride = W * w_stride;
-            int b_stride = H * h_stride;
+      int ix_sw = ix_nw;
+      int iy_sw = iy_nw + 1;

-            uint grid_idx = elem / C * 2;
-            float ix = ((grid[grid_idx] + 1) * W - 1) / 2;
-            float iy = ((grid[grid_idx + 1] + 1) * H - 1) / 2;
+      int ix_se = ix_nw + 1;
+      int iy_se = iy_nw + 1;

-            int ix_nw = floor(ix);
-            int iy_nw = floor(iy);
+      T nw = (ix_se - ix)    * (iy_se - iy);
+      T ne = (ix    - ix_sw) * (iy_sw - iy);
+      T sw = (ix_ne - ix)    * (iy    - iy_ne);
+      T se = (ix    - ix_nw) * (iy    - iy_nw);

-            int ix_ne = ix_nw + 1;
-            int iy_ne = iy_nw;
+      int batch_idx = elem / C / gH / gW * b_stride;
+      int channel_idx = elem % C;
+      int base_idx = batch_idx + channel_idx;

-            int ix_sw = ix_nw;
-            int iy_sw = iy_nw + 1;
+      T I_nw = x[base_idx + iy_nw * h_stride + ix_nw * w_stride];
+      T I_ne = x[base_idx + iy_ne * h_stride + ix_ne * w_stride];
+      T I_sw = x[base_idx + iy_sw * h_stride + ix_sw * w_stride];
+      T I_se = x[base_idx + iy_se * h_stride + ix_se * w_stride];

-            int ix_se = ix_nw + 1;
-            int iy_se = iy_nw + 1;
+      I_nw = iy_nw >= 0 && iy_nw <= H - 1 && ix_nw >= 0 && ix_nw <= W - 1 ? I_nw : 0;
+      I_ne = iy_ne >= 0 && iy_ne <= H - 1 && ix_ne >= 0 && ix_ne <= W - 1 ? I_ne : 0;
+      I_sw = iy_sw >= 0 && iy_sw <= H - 1 && ix_sw >= 0 && ix_sw <= W - 1 ? I_sw : 0;
+      I_se = iy_se >= 0 && iy_se <= H - 1 && ix_se >= 0 && ix_se <= W - 1 ? I_se : 0;

-            T nw = (ix_se - ix)    * (iy_se - iy);
-            T ne = (ix    - ix_sw) * (iy_sw - iy);
-            T sw = (ix_ne - ix)    * (iy    - iy_ne);
-            T se = (ix    - ix_nw) * (iy    - iy_nw);
+      out[elem] = nw * I_nw + ne * I_ne + sw * I_sw + se * I_se;
+  """

-            int batch_idx = elem / C / gH / gW * b_stride;
-            int channel_idx = elem % C;
-            int base_idx = batch_idx + channel_idx;
+  kernel = mx.fast.metal_kernel(
+      name="grid_sample",
+      input_names=["x", "grid"],
+      output_names=["out"],
+      source=source,
+  )

-            T I_nw = x[base_idx + iy_nw * h_stride + ix_nw * w_stride];
-            T I_ne = x[base_idx + iy_ne * h_stride + ix_ne * w_stride];
-            T I_sw = x[base_idx + iy_sw * h_stride + ix_sw * w_stride];
-            T I_se = x[base_idx + iy_se * h_stride + ix_se * w_stride];
+  @mx.custom_function
+  def grid_sample(x, grid):

-            I_nw = iy_nw >= 0 && iy_nw <= H - 1 && ix_nw >= 0 && ix_nw <= W - 1 ? I_nw : 0;
-            I_ne = iy_ne >= 0 && iy_ne <= H - 1 && ix_ne >= 0 && ix_ne <= W - 1 ? I_ne : 0;
-            I_sw = iy_sw >= 0 && iy_sw <= H - 1 && ix_sw >= 0 && ix_sw <= W - 1 ? I_sw : 0;
-            I_se = iy_se >= 0 && iy_se <= H - 1 && ix_se >= 0 && ix_se <= W - 1 ? I_se : 0;
+      assert x.ndim == 4, "`x` must be 4D."
+      assert grid.ndim == 4, "`grid` must be 4D."

-            out[elem] = nw * I_nw + ne * I_ne + sw * I_sw + se * I_se;
-        """
-        kernel = mx.fast.metal_kernel(
-            name="grid_sample",
-            input_names=["x", "grid"],
-            output_names=["out"],
-            source=source,
-        )
-        outputs = kernel(
-            inputs=[x, grid],
-            template=[("T", x.dtype)],
-            output_shapes=[out_shape],
-            output_dtypes=[x.dtype],
-            grid=(np.prod(out_shape), 1, 1),
-            threadgroup=(256, 1, 1),
-        )
-        return outputs[0]
+      B, _, _, C = x.shape
+      _, gN, gM, D = grid.shape
+      out_shape = (B, gN, gM, C)
+
+      assert D == 2, "Last dim of `grid` must be size 2."
+
+      outputs = kernel(
+          inputs=[x, grid],
+          template=[("T", x.dtype)],
+          output_shapes=[out_shape],
+          output_dtypes=[x.dtype],
+          grid=(np.prod(out_shape), 1, 1),
+          threadgroup=(256, 1, 1),
+      )
+      return outputs[0]

 For a reasonably sized input such as:

 .. code-block:: python

-    x.shape = (8, 1024, 1024, 64)
-    grid.shape = (8, 256, 256, 2)
+  x.shape = (8, 1024, 1024, 64)
+  grid.shape = (8, 256, 256, 2)

 On an M1 Max, we see a big performance improvement:

@@ -281,11 +298,11 @@ On an M1 Max, we see a big performance improvement:
 Grid Sample VJP
 ---------------

-Since we decorated ``grid_sample`` with ``mx.custom_function``, we can now define
-its custom vjp transform so MLX can differentiate it.
+Since we decorated ``grid_sample`` with :func:`custom_function`, we can now
+define its custom vjp transform so MLX can differentiate it.

 The backwards pass requires atomically updating ``x_grad``/``grid_grad`` and so
-requires a few extra ``mx.fast.metal_kernel`` features:
+requires a few extra :func:`fast.metal_kernel` features:

 * ``init_value=0``
    Initialize all of the kernel's outputs to this value before it runs. This allows us to update only part of the output arrays with the kernel.
@@ -299,128 +316,129 @@ We can then implement the backwards pass as follows:

 .. code-block:: python

-    @grid_sample.vjp
-    def grid_sample_vjp(primals, cotangent, _):
-        x, grid = primals
-        B, _, _, C = x.shape
-        _, gN, gM, D = grid.shape
+  source = """
+      uint elem = thread_position_in_grid.x;
+      int H = x_shape[1];
+      int W = x_shape[2];
+      int C = x_shape[3];
+      // Pad C to the nearest larger simdgroup size multiple
+      int C_padded = ceildiv(C, threads_per_simdgroup) * threads_per_simdgroup;

-        assert D == 2, "Last dim of `grid` must be size 2."
+      int gH = grid_shape[1];
+      int gW = grid_shape[2];

-        source = """
-            uint elem = thread_position_in_grid.x;
-            int H = x_shape[1];
-            int W = x_shape[2];
-            int C = x_shape[3];
-            // Pad C to the nearest larger simdgroup size multiple
-            int C_padded = ceildiv(C, threads_per_simdgroup) * threads_per_simdgroup;
+      int w_stride = C;
+      int h_stride = W * w_stride;
+      int b_stride = H * h_stride;

-            int gH = grid_shape[1];
-            int gW = grid_shape[2];
+      uint grid_idx = elem / C_padded * 2;
+      float ix = ((grid[grid_idx] + 1) * W - 1) / 2;
+      float iy = ((grid[grid_idx + 1] + 1) * H - 1) / 2;

-            int w_stride = C;
-            int h_stride = W * w_stride;
-            int b_stride = H * h_stride;
+      int ix_nw = floor(ix);
+      int iy_nw = floor(iy);

-            uint grid_idx = elem / C_padded * 2;
-            float ix = ((grid[grid_idx] + 1) * W - 1) / 2;
-            float iy = ((grid[grid_idx + 1] + 1) * H - 1) / 2;
+      int ix_ne = ix_nw + 1;
+      int iy_ne = iy_nw;

-            int ix_nw = floor(ix);
-            int iy_nw = floor(iy);
+      int ix_sw = ix_nw;
+      int iy_sw = iy_nw + 1;

-            int ix_ne = ix_nw + 1;
-            int iy_ne = iy_nw;
+      int ix_se = ix_nw + 1;
+      int iy_se = iy_nw + 1;

-            int ix_sw = ix_nw;
-            int iy_sw = iy_nw + 1;
+      T nw = (ix_se - ix)    * (iy_se - iy);
+      T ne = (ix    - ix_sw) * (iy_sw - iy);
+      T sw = (ix_ne - ix)    * (iy    - iy_ne);
+      T se = (ix    - ix_nw) * (iy    - iy_nw);

-            int ix_se = ix_nw + 1;
-            int iy_se = iy_nw + 1;
+      int batch_idx = elem / C_padded / gH / gW * b_stride;
+      int channel_idx = elem % C_padded;
+      int base_idx = batch_idx + channel_idx;

-            T nw = (ix_se - ix)    * (iy_se - iy);
-            T ne = (ix    - ix_sw) * (iy_sw - iy);
-            T sw = (ix_ne - ix)    * (iy    - iy_ne);
-            T se = (ix    - ix_nw) * (iy    - iy_nw);
+      T gix = T(0);
+      T giy = T(0);
+      if (channel_idx < C) {
+          int cot_index = elem / C_padded * C + channel_idx;
+          T cot = cotangent[cot_index];
+          if (iy_nw >= 0 && iy_nw <= H - 1 && ix_nw >= 0 && ix_nw <= W - 1) {
+              int offset = base_idx + iy_nw * h_stride + ix_nw * w_stride;
+              atomic_fetch_add_explicit(&x_grad[offset], nw * cot, memory_order_relaxed);

-            int batch_idx = elem / C_padded / gH / gW * b_stride;
-            int channel_idx = elem % C_padded;
-            int base_idx = batch_idx + channel_idx;
+              T I_nw = x[offset];
+              gix -= I_nw * (iy_se - iy) * cot;
+              giy -= I_nw * (ix_se - ix) * cot;
+          }
+          if (iy_ne >= 0 && iy_ne <= H - 1 && ix_ne >= 0 && ix_ne <= W - 1) {
+              int offset = base_idx + iy_ne * h_stride + ix_ne * w_stride;
+              atomic_fetch_add_explicit(&x_grad[offset], ne * cot, memory_order_relaxed);

-            T gix = T(0);
-            T giy = T(0);
-            if (channel_idx < C) {
-                int cot_index = elem / C_padded * C + channel_idx;
-                T cot = cotangent[cot_index];
-                if (iy_nw >= 0 && iy_nw <= H - 1 && ix_nw >= 0 && ix_nw <= W - 1) {
-                    int offset = base_idx + iy_nw * h_stride + ix_nw * w_stride;
-                    atomic_fetch_add_explicit(&x_grad[offset], nw * cot, memory_order_relaxed);
+              T I_ne = x[offset];
+              gix += I_ne * (iy_sw - iy) * cot;
+              giy -= I_ne * (ix - ix_sw) * cot;
+          }
+          if (iy_sw >= 0 && iy_sw <= H - 1 && ix_sw >= 0 && ix_sw <= W - 1) {
+              int offset = base_idx + iy_sw * h_stride + ix_sw * w_stride;
+              atomic_fetch_add_explicit(&x_grad[offset], sw * cot, memory_order_relaxed);

-                    T I_nw = x[offset];
-                    gix -= I_nw * (iy_se - iy) * cot;
-                    giy -= I_nw * (ix_se - ix) * cot;
-                }
-                if (iy_ne >= 0 && iy_ne <= H - 1 && ix_ne >= 0 && ix_ne <= W - 1) {
-                    int offset = base_idx + iy_ne * h_stride + ix_ne * w_stride;
-                    atomic_fetch_add_explicit(&x_grad[offset], ne * cot, memory_order_relaxed);
+              T I_sw = x[offset];
+              gix -= I_sw * (iy - iy_ne) * cot;
+              giy += I_sw * (ix_ne - ix) * cot;
+          }
+          if (iy_se >= 0 && iy_se <= H - 1 && ix_se >= 0 && ix_se <= W - 1) {
+              int offset = base_idx + iy_se * h_stride + ix_se * w_stride;
+              atomic_fetch_add_explicit(&x_grad[offset], se * cot, memory_order_relaxed);

-                    T I_ne = x[offset];
-                    gix += I_ne * (iy_sw - iy) * cot;
-                    giy -= I_ne * (ix - ix_sw) * cot;
-                }
-                if (iy_sw >= 0 && iy_sw <= H - 1 && ix_sw >= 0 && ix_sw <= W - 1) {
-                    int offset = base_idx + iy_sw * h_stride + ix_sw * w_stride;
-                    atomic_fetch_add_explicit(&x_grad[offset], sw * cot, memory_order_relaxed);
+              T I_se = x[offset];
+              gix += I_se * (iy - iy_nw) * cot;
+              giy += I_se * (ix - ix_nw) * cot;
+          }
+      }

-                    T I_sw = x[offset];
-                    gix -= I_sw * (iy - iy_ne) * cot;
-                    giy += I_sw * (ix_ne - ix) * cot;
-                }
-                if (iy_se >= 0 && iy_se <= H - 1 && ix_se >= 0 && ix_se <= W - 1) {
-                    int offset = base_idx + iy_se * h_stride + ix_se * w_stride;
-                    atomic_fetch_add_explicit(&x_grad[offset], se * cot, memory_order_relaxed);
+      T gix_mult = W / 2;
+      T giy_mult = H / 2;

-                    T I_se = x[offset];
-                    gix += I_se * (iy - iy_nw) * cot;
-                    giy += I_se * (ix - ix_nw) * cot;
-                }
-            }
+      // Reduce across each simdgroup first.
+      // This is much faster than relying purely on atomics.
+      gix = simd_sum(gix);
+      giy = simd_sum(giy);

-            T gix_mult = W / 2;
-            T giy_mult = H / 2;
+      if (thread_index_in_simdgroup == 0) {
+          atomic_fetch_add_explicit(&grid_grad[grid_idx], gix * gix_mult, memory_order_relaxed);
+          atomic_fetch_add_explicit(&grid_grad[grid_idx + 1], giy * giy_mult, memory_order_relaxed);
+      }
+  """
+  kernel = mx.fast.metal_kernel(
+      name="grid_sample_grad",
+      input_names=["x", "grid", "cotangent"],
+      output_names=["x_grad", "grid_grad"],
+      source=source,
+      atomic_outputs=True,
+  )

-            // Reduce across each simdgroup first.
-            // This is much faster than relying purely on atomics.
-            gix = simd_sum(gix);
-            giy = simd_sum(giy);
+  @grid_sample.vjp
+  def grid_sample_vjp(primals, cotangent, _):
+      x, grid = primals
+      B, _, _, C = x.shape
+      _, gN, gM, D = grid.shape

-            if (thread_index_in_simdgroup == 0) {
-                atomic_fetch_add_explicit(&grid_grad[grid_idx], gix * gix_mult, memory_order_relaxed);
-                atomic_fetch_add_explicit(&grid_grad[grid_idx + 1], giy * giy_mult, memory_order_relaxed);
-            }
-        """
-        kernel = mx.fast.metal_kernel(
-            name="grid_sample_grad",
-            input_names=["x", "grid", "cotangent"],
-            output_names=["x_grad", "grid_grad"],
-            source=source,
-            atomic_outputs=True,
-        )
-        # pad the output channels to simd group size
-        # so that our `simd_sum`s don't overlap.
-        simdgroup_size = 32
-        C_padded = (C + simdgroup_size - 1) // simdgroup_size * simdgroup_size
-        grid_size = B * gN * gM * C_padded
-        outputs = kernel(
-            inputs=[x, grid, cotangent],
-            template=[("T", x.dtype)],
-            output_shapes=[x.shape, grid.shape],
-            output_dtypes=[x.dtype, x.dtype],
-            grid=(grid_size, 1, 1),
-            threadgroup=(256, 1, 1),
-            init_value=0,
-        )
-        return outputs[0], outputs[1]
+      assert D == 2, "Last dim of `grid` must be size 2."
+
+      # pad the output channels to simd group size
+      # so that our `simd_sum`s don't overlap.
+      simdgroup_size = 32
+      C_padded = (C + simdgroup_size - 1) // simdgroup_size * simdgroup_size
+      grid_size = B * gN * gM * C_padded
+      outputs = kernel(
+          inputs=[x, grid, cotangent],
+          template=[("T", x.dtype)],
+          output_shapes=[x.shape, grid.shape],
+          output_dtypes=[x.dtype, x.dtype],
+          grid=(grid_size, 1, 1),
+          threadgroup=(256, 1, 1),
+          init_value=0,
+      )
+      return outputs[0], outputs[1]

 There's an even larger speed up for the vjp:

--- a/docs/src/dev/extensions.rst
+++ b/docs/src/dev/extensions.rst
@@ -138,13 +138,13 @@ more concrete:
        * representing the vectorized computation and the axis which
        * corresponds to the output vectorized dimension.
        */
-        virtual std::pair<std::vector<array>, std::vector<int>> vmap(
+        std::pair<std::vector<array>, std::vector<int>> vmap(
            const std::vector<array>& inputs,
            const std::vector<int>& axes) override;

-        /** Print the primitive. */
-        void print(std::ostream& os) override {
-            os << "Axpby";
+        /** The name of the primitive. */
+        const char* name() const override {
+          return "Axpby";
        }

        /** Equivalence check **/
@@ -394,14 +394,14 @@ below.
        out.set_data(allocator::malloc(out.nbytes()));

        // Resolve name of kernel
-        std::ostringstream kname;
-        kname << "axpby_" << "general_" << type_to_name(out);
+        std::string kname;
+        kname = "axpby_general_" + type_to_name(out);

-        // Make sure the metal library is available
-        d.register_library("mlx_ext");
+        // Load the metal library
+        auto lib = d.get_library("mlx_ext", current_binary_dir());

        // Make a kernel from this metal library
-        auto kernel = d.get_kernel(kname.str(), "mlx_ext");
+        auto kernel = d.get_kernel(kname, lib);

        // Prepare to encode kernel
        auto& compute_encoder = d.get_command_encoder(s.index);
--- a/docs/src/index.rst
+++ b/docs/src/index.rst
@@ -70,6 +70,7 @@ are the CPU and GPU.
   python/fft
   python/linalg
   python/metal
+   python/cuda
   python/memory_management
   python/nn
   python/optimizers
--- a/docs/src/install.rst
+++ b/docs/src/install.rst
@@ -13,22 +13,49 @@ silicon computer is

    pip install mlx

-To install from PyPI you must meet the following requirements:
+To install from PyPI your system must meet the following requirements:

 - Using an M series chip (Apple silicon)
- Using a native Python >= 3.9
+- Using a native Python >= 3.10
 - macOS >= 13.5

 .. note::
    MLX is only available on devices running macOS >= 13.5
    It is highly recommended to use macOS 14 (Sonoma)

+CUDA
+^^^^

-MLX is also available on conda-forge. To install MLX with conda do:
+MLX has a CUDA backend which you can install with:

 .. code-block:: shell

-   conda install conda-forge::mlx
+    pip install mlx[cuda]
+
+To install the CUDA package from PyPI your system must meet the following
+requirements:
+
+- Nvidia architecture >= SM 7.0 (Volta)
+- Nvidia driver >= 550.54.14
+- CUDA toolkit >= 12.0
+- Linux distribution with glibc >= 2.35
+- Python >= 3.10
+
+
+CPU-only (Linux)
+^^^^^^^^^^^^^^^^
+
+For a CPU-only version of MLX that runs on Linux use:
+
+.. code-block:: shell
+
+    pip install mlx[cpu]
+
+To install the CPU-only package from PyPI your system must meet the following
+requirements:
+
+- Linux distribution with glibc >= 2.35
+- Python >= 3.10


 Troubleshooting
@@ -65,6 +92,8 @@ Build Requirements
 Python API
 ^^^^^^^^^^

+.. _python install:
+
 To build and install the MLX python library from source, first, clone MLX from
 `its GitHub repo <https://github.com/ml-explore/mlx>`_:

@@ -76,20 +105,20 @@ Then simply build and install MLX using pip:

 .. code-block:: shell

-  CMAKE_BUILD_PARALLEL_LEVEL=8 pip install .
+  pip install .

 For developing, install the package with development dependencies, and use an
 editable install:

 .. code-block:: shell

-  CMAKE_BUILD_PARALLEL_LEVEL=8 pip install -e ".[dev]"
+  pip install -e ".[dev]"

 Once the development dependencies are installed, you can build faster with:

 .. code-block:: shell

- CMAKE_BUILD_PARALLEL_LEVEL=8 python setup.py build_ext --inplace
+ python setup.py build_ext --inplace

 Run the tests with:

@@ -107,6 +136,8 @@ IDE:
 C++ API
 ^^^^^^^

+.. _cpp install:
+
 Currently, MLX must be built and installed from source.

 Similarly to the python library, to build and install the MLX C++ library start
@@ -185,6 +216,7 @@ should point to the path to the built metal library.

      xcrun -sdk macosx --show-sdk-version

+
 Binary Size Minimization
 ~~~~~~~~~~~~~~~~~~~~~~~~

@@ -213,6 +245,50 @@ be anwywhere from a few hundred millisecond to a few seconds depending on the
 application. Once a kernel is compiled, it will be cached by the system. The
 Metal kernel cache persists across reboots.

+Linux
+^^^^^
+
+To build from source on Linux (CPU only), install the BLAS and LAPACK headers.
+For example on Ubuntu, run the following:
+
+.. code-block:: shell
+
+   apt-get update -y
+   apt-get install libblas-dev liblapack-dev liblapacke-dev -y
+
+From here follow the instructions to install either the :ref:`Python <python
+install>` or :ref:`C++ <cpp install>` APIs.
+
+CUDA
+^^^^
+
+To build from source on Linux with CUDA, install the BLAS and LAPACK headers
+and the CUDA toolkit. For example on Ubuntu, run the following:
+
+.. code-block:: shell
+
+   wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
+   dpkg -i cuda-keyring_1.1-1_all.deb
+   apt-get update -y
+   apt-get -y install cuda-toolkit-12-9
+   apt-get install libblas-dev liblapack-dev liblapacke-dev libcudnn9-dev-cuda-12 -y
+
+
+When building either the Python or C++ APIs make sure to pass the cmake flag
+``MLX_BUILD_CUDA=ON``. For example, to build the Python API run:
+
+.. code-block:: shell
+
+  CMAKE_ARGS="-DMLX_BUILD_CUDA=ON" pip install -e ".[dev]"
+
+To build the C++ package run:
+
+.. code-block:: shell
+
+   mkdir -p build && cd build
+   cmake .. -DMLX_BUILD_CUDA=ON && make -j
+
+
 Troubleshooting
 ^^^^^^^^^^^^^^^

--- a/docs/src/python/array.rst
+++ b/docs/src/python/array.rst
@@ -19,6 +19,8 @@ Array
    array.ndim
    array.shape
    array.size
+    array.real
+    array.imag
    array.abs
    array.all
    array.any
--- a/docs/src/python/cuda.rst
+++ b/docs/src/python/cuda.rst
@@ -0,0 +1,9 @@
+CUDA
+=====
+
+.. currentmodule:: mlx.core.cuda
+
+.. autosummary::
+  :toctree: _autosummary
+
+  is_available
--- a/docs/src/python/fast.rst
+++ b/docs/src/python/fast.rst
@@ -13,3 +13,4 @@ Fast
  rope
  scaled_dot_product_attention
  metal_kernel
+  cuda_kernel
--- a/docs/src/python/fft.rst
+++ b/docs/src/python/fft.rst
@@ -20,3 +20,5 @@ FFT
  irfft2
  rfftn
  irfftn
+  fftshift
+  ifftshift
--- a/docs/src/python/linalg.rst
+++ b/docs/src/python/linalg.rst
@@ -16,6 +16,8 @@ Linear Algebra
    cross
    qr
    svd
+    eigvals
+    eig
    eigvalsh
    eigh
    lu
--- a/docs/src/python/nn/functions.rst
+++ b/docs/src/python/nn/functions.rst
@@ -27,6 +27,7 @@ simple functions.
   mish
   prelu
   relu
+   relu2
   relu6
   selu
   sigmoid
--- a/docs/src/python/nn/layers.rst
+++ b/docs/src/python/nn/layers.rst
@@ -50,6 +50,7 @@ Layers
   QuantizedLinear
   RMSNorm
   ReLU
+   ReLU2
   ReLU6
   RNN
   RoPE
--- a/docs/src/python/ops.rst
+++ b/docs/src/python/ops.rst
@@ -112,6 +112,7 @@ Operations
   max
   maximum
   mean
+   median
   meshgrid
   min
   minimum
--- a/docs/src/python/optimizers.rst
+++ b/docs/src/python/optimizers.rst
@@ -51,14 +51,14 @@ the saved state. Here's a simple example:
   optimizer.update(model, grads)

   # Save the state
-   state = tree_flatten(optimizer.state)
-   mx.save_safetensors("optimizer.safetensors", dict(state))
+   state = tree_flatten(optimizer.state, destination={})
+   mx.save_safetensors("optimizer.safetensors", state)

   # Later on, for example when loading from a checkpoint,
   # recreate the optimizer and load the state
   optimizer = optim.Adam(learning_rate=1e-2)

-   state = tree_unflatten(list(mx.load("optimizer.safetensors").items()))
+   state = tree_unflatten(mx.load("optimizer.safetensors"))
   optimizer.state = state

 Note, not every optimizer configuation parameter is saved in the state. For
--- a/docs/src/python/optimizers/common_optimizers.rst
+++ b/docs/src/python/optimizers/common_optimizers.rst
@@ -19,3 +19,4 @@ Common Optimizers
   Adamax
   Lion
   MultiOptimizer
+   Muon
--- a/docs/src/usage/compile.rst
+++ b/docs/src/usage/compile.rst
@@ -130,8 +130,8 @@ Now make an array, and benchmark both functions:
 .. code-block:: python

  x = mx.random.uniform(shape=(32, 1000, 4096))
-  timeit(nn.gelu, x)
-  timeit(mx.compile(nn.gelu), x)
+  timeit(gelu, x)
+  timeit(mx.compile(gelu), x)

 On an M1 Max the times are 15.5 and 3.1 milliseconds. The compiled ``gelu`` is
 five times faster.
@@ -225,7 +225,7 @@ In some cases returning updated state can be pretty inconvenient. Hence,
  def fun(x, y):
      z = x + y
      state.append(z)
-      return mx.exp(z), state
+      return mx.exp(z)

  fun(mx.array(1.0), mx.array(2.0))
  # Prints [array(3, dtype=float32)]
--- a/docs/src/usage/distributed.rst
+++ b/docs/src/usage/distributed.rst
@@ -184,7 +184,7 @@ almost identical to the example above:

    def step(model, x, y):
        loss, grads = loss_grad_fn(model, x, y)
-        grads = mlx.nn.average_gradients(grads) # <---- This line was added
+        grads = mx.nn.average_gradients(grads)  # <---- This line was added
        optimizer.update(model, grads)
        return loss

--- a/docs/src/usage/export.rst
+++ b/docs/src/usage/export.rst
@@ -7,17 +7,17 @@ Exporting Functions

 MLX has an API to export and import functions to and from a file. This lets you
 run computations written in one MLX front-end (e.g. Python) in another MLX
-front-end (e.g. C++). 
+front-end (e.g. C++).

 This guide walks through the basics of the MLX export API with some examples.
 To see the full list of functions check-out the :ref:`API documentation
 <export>`.

-Basics of Exporting 
+Basics of Exporting
 -------------------

 Let's start with a simple example:
- 
+
 .. code-block:: python

  def fun(x, y):
@@ -67,7 +67,7 @@ specified as variable positional arguments or as a tuple of arrays:

  x = mx.array(1.0)
  y = mx.array(1.0)
-   
+
  # Both arguments to fun are positional
  mx.export_function("add.mlxfn", fun, x, y)

@@ -133,7 +133,7 @@ parameters are also saved to the ``model.mlxfn`` file.
   For enclosed arrays inside an exported function, be extra careful to ensure
   they are evaluated. The computation graph that gets exported will include
   the computation that produces enclosed inputs.
-  
+
   If the above example was missing ``mx.eval(model.parameters()``, the
   exported function would include the random initialization of the
   :obj:`mlx.nn.Module` parameters.
@@ -150,8 +150,8 @@ parameters, pass them as inputs to the ``call`` wrapper:
     # Set the model's parameters to the input parameters
     model.update(tree_unflatten(list(params.items())))
     return model(x)
- 
-   params = dict(tree_flatten(model.parameters()))
+
+   params = tree_flatten(model.parameters(), destination={})
   mx.export_function("model.mlxfn", call, (mx.zeros(4),), params)


@@ -164,13 +164,13 @@ to export a function which can be used for inputs with variable shapes:

 .. code-block:: python

-  mx.export_function("fun.mlxfn", mx.abs, mx.array(0.0), shapeless=True)
+  mx.export_function("fun.mlxfn", mx.abs, mx.array([0.0]), shapeless=True)
  imported_abs = mx.import_function("fun.mlxfn")

  # Ok
-  out, = imported_abs(mx.array(-1.0))
-  
-  # Also ok 
+  out, = imported_abs(mx.array([-1.0]))
+
+  # Also ok
  out, = imported_abs(mx.array([-1.0, -2.0]))

 With ``shapeless=False`` (which is the default), the second call to
@@ -197,7 +197,7 @@ a single file by creating an exporting context manager with :func:`exporter`:
  def fun(x, y=None):
      constant = mx.array(3.0)
      if y is not None:
-        x += y 
+        x += y
      return x + constant

  with mx.exporter("fun.mlxfn", fun) as exporter:
@@ -215,7 +215,7 @@ a single file by creating an exporting context manager with :func:`exporter`:
  print(out)

 In the above example the function constant data, (i.e. ``constant``), is only
-saved once. 
+saved once.

 Transformations with Imported Functions
 ---------------------------------------
@@ -238,7 +238,7 @@ on imported functions just like regular Python functions:
  # Prints: array(1, dtype=float32)
  print(dfdx(x))

-  # Compile the imported function 
+  # Compile the imported function
  mx.compile(imported_fun)
  # Prints: array(0, dtype=float32)
  print(compiled_fun(x)[0])
@@ -275,7 +275,7 @@ Import and run the function in C++ with only a few lines of code:
  // Prints: array(2, dtype=float32)
  std::cout << outputs[0] << std::endl;

-Imported functions can be transformed in C++ just like in Python. Use 
+Imported functions can be transformed in C++ just like in Python. Use
 ``std::vector<mx::array>`` for positional arguments and ``std::map<std::string,
 mx::array>`` for keyword arguments when calling imported functions in C++.

--- a/docs/src/usage/indexing.rst
+++ b/docs/src/usage/indexing.rst
@@ -107,6 +107,28 @@ same array:
  >>> a
  array([1, 2, 0], dtype=int32)

+Note that unlike NumPy, slicing an array creates a copy, not a view. So
+mutating it does not mutate the original array:
+
+.. code-block:: shell
+
+  >>> a = mx.array([1, 2, 3])
+  >>> b = a[:]
+  >>> b[2] = 0
+  >>> b
+  array([1, 2, 0], dtype=int32)
+  >>> a
+  array([1, 2, 3], dtype=int32)
+
+Also unlike NumPy, updates to the same location are nondeterministic:
+
+.. code-block:: shell
+
+  >>> a = mx.array([1, 2, 3])
+  >>> a[[0, 0]] = mx.array([4, 5])
+
+The first element of ``a`` could be ``4`` or ``5``.
+
 Transformations of functions which use in-place updates are allowed and work as
 expected. For example:

--- a/examples/cpp/tutorial.cpp
+++ b/examples/cpp/tutorial.cpp
@@ -14,14 +14,17 @@ void array_basics() {
  // Get the value out of it:
  auto s = x.item<float>();
  assert(s == 1.0);
+  (void)s;

  // Scalars have a size of 1:
-  size_t size = x.size();
+  int64_t size = x.size();
  assert(size == 1);
+  (void)size;

  // Scalars have 0 dimensions:
  int ndim = x.ndim();
  assert(ndim == 0);
+  (void)ndim;

  // The shape should be an empty vector:
  auto shape = x.shape();
@@ -30,6 +33,7 @@ void array_basics() {
  // The datatype should be float32:
  auto dtype = x.dtype();
  assert(dtype == mx::float32);
+  (void)dtype;

  // Specify the dtype when constructing the array:
  x = mx::array(1, mx::int32);
--- a/examples/extensions/axpby/axpby.cpp
+++ b/examples/extensions/axpby/axpby.cpp
@@ -1,5 +1,6 @@
 // Copyright © 2023-2025 Apple Inc.

+#include <dlfcn.h>
 #include <iostream>
 #include <sstream>

@@ -16,6 +17,19 @@

 namespace my_ext {

+// A helper function to find the location of the current binary on disk.
+// The Metal library ("mlx_ext.mtllib") should be in the same directory.
+std::string current_binary_dir() {
+  static std::string binary_dir = []() {
+    Dl_info info;
+    if (!dladdr(reinterpret_cast<void*>(&current_binary_dir), &info)) {
+      throw std::runtime_error("Unable to get current binary dir.");
+    }
+    return std::filesystem::path(info.dli_fname).parent_path().string();
+  }();
+  return binary_dir;
+}
+
 ///////////////////////////////////////////////////////////////////////////////
 // Operation Implementation
 ///////////////////////////////////////////////////////////////////////////////
@@ -167,16 +181,15 @@ void Axpby::eval_gpu(
  }

  // Resolve name of kernel (corresponds to axpby.metal)
-  std::ostringstream kname;
-  kname << "axpby_";
-  kname << (contiguous_kernel ? "contiguous_" : "general_");
-  kname << type_to_name(out);
+  std::string kname = "axpby_";
+  kname += (contiguous_kernel ? "contiguous_" : "general_");
+  kname += type_to_name(out);

-  // Make sure the metal library is available
-  d.register_library("mlx_ext");
+  // Load the metal library
+  auto lib = d.get_library("mlx_ext", current_binary_dir());

  // Make a kernel from this metal library
-  auto kernel = d.get_kernel(kname.str(), "mlx_ext");
+  auto kernel = d.get_kernel(kname, lib);

  // Prepare to encode kernel
  auto& compute_encoder = d.get_command_encoder(s.index);
--- a/examples/extensions/axpby/axpby.h
+++ b/examples/extensions/axpby/axpby.h
@@ -74,9 +74,9 @@ class Axpby : public mx::Primitive {
      const std::vector<mx::array>& inputs,
      const std::vector<int>& axes) override;

-  /** Print the primitive. */
-  void print(std::ostream& os) override {
-    os << "Axpby";
+  /** The name of the primitive. */
+  const char* name() const override {
+    return "Axpby";
  }

  /** Equivalence check **/
--- a/examples/extensions/requirements.txt
+++ b/examples/extensions/requirements.txt
@@ -1,4 +1,4 @@
 setuptools>=42
 cmake>=3.25
 mlx>=0.21.0
-nanobind==2.2.0
+nanobind==2.4.0
--- a/examples/extensions/test.py
+++ b/examples/extensions/test.py
@@ -3,8 +3,10 @@ from mlx_sample_extensions import axpby

 a = mx.ones((3, 4))
 b = mx.ones((3, 4))
-c = axpby(a, b, 4.0, 2.0, stream=mx.cpu)
+c_cpu = axpby(a, b, 4.0, 2.0, stream=mx.cpu)
+c_gpu = axpby(a, b, 4.0, 2.0, stream=mx.gpu)

-print(f"c shape: {c.shape}")
-print(f"c dtype: {c.dtype}")
-print(f"c correct: {mx.all(c == 6.0).item()}")
+print(f"c shape: {c_cpu.shape}")
+print(f"c dtype: {c_cpu.dtype}")
+print(f"c_cpu correct: {mx.all(c_cpu == 6.0).item()}")
+print(f"c_gpu correct: {mx.all(c_gpu == 6.0).item()}")
--- a/mlx/CMakeLists.txt
+++ b/mlx/CMakeLists.txt
@@ -21,7 +21,7 @@ target_sources(
          ${CMAKE_CURRENT_SOURCE_DIR}/backend/metal/metal.h)

 # Define MLX_VERSION only in the version.cpp file.
-add_library(mlx_version STATIC ${CMAKE_CURRENT_SOURCE_DIR}/version.cpp)
+add_library(mlx_version OBJECT ${CMAKE_CURRENT_SOURCE_DIR}/version.cpp)
 target_compile_definitions(mlx_version PRIVATE MLX_VERSION="${MLX_VERSION}")
 target_link_libraries(mlx PRIVATE $<BUILD_INTERFACE:mlx_version>)

@@ -49,5 +49,19 @@ add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/io)
 if(MLX_BUILD_METAL)
  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backend/metal)
 else()
-  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backend/no_metal)
+  target_sources(mlx
+                 PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/backend/metal/no_metal.cpp)
+endif()
+
+if(MLX_BUILD_CUDA)
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backend/cuda)
+else()
+  target_sources(mlx
+                 PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/backend/cuda/no_cuda.cpp)
+endif()
+
+if(MLX_BUILD_METAL OR MLX_BUILD_CUDA)
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backend/gpu)
+else()
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backend/no_gpu)
 endif()
--- a/mlx/array.cpp
+++ b/mlx/array.cpp
@@ -44,11 +44,11 @@ std::vector<array> array::make_arrays(
    const std::shared_ptr<Primitive>& primitive,
    const std::vector<array>& inputs) {
  std::vector<array> outputs;
-  for (size_t i = 0; i < shapes.size(); ++i) {
+  for (int i = 0; i < std::ssize(shapes); ++i) {
    outputs.emplace_back(std::move(shapes[i]), dtypes[i], primitive, inputs);
  }
  // For each node in |outputs|, its siblings are the other nodes.
-  for (size_t i = 0; i < outputs.size(); ++i) {
+  for (int i = 0; i < std::ssize(outputs); ++i) {
    auto siblings = outputs;
    siblings.erase(siblings.begin() + i);
    outputs[i].set_siblings(std::move(siblings), i);
@@ -145,8 +145,9 @@ void array::set_data(allocator::Buffer buffer, Deleter d) {
  array_desc_->data_size = size();
  array_desc_->flags.contiguous = true;
  array_desc_->flags.row_contiguous = true;
-  auto max_dim = std::max_element(shape().begin(), shape().end());
-  array_desc_->flags.col_contiguous = size() <= 1 || size() == *max_dim;
+  // Deref lazily: a scalar's empty shape makes max_element return end().
+  auto max_dim = std::max_element(shape().begin(), shape().end());
+  array_desc_->flags.col_contiguous = size() <= 1 || size() == *max_dim;
 }

 void array::set_data(
@@ -192,7 +193,7 @@ array::~array() {
  }

  // Break circular reference for non-detached arrays with siblings
-  if (auto n = siblings().size(); n > 0) {
+  if (auto n = std::ssize(siblings()); n > 0) {
    bool do_detach = true;
    // If all siblings have siblings.size() references except
    // the one we are currently destroying (which has siblings.size() + 1)
@@ -241,8 +242,8 @@ array::ArrayDesc::ArrayDesc(
    std::vector<array> inputs)
    : shape(std::move(shape)),
      dtype(dtype),
-      status(Status::unscheduled),
      primitive(std::move(primitive)),
+      status(Status::unscheduled),
      inputs(std::move(inputs)) {
  init();
 }
@@ -274,7 +275,7 @@ array::ArrayDesc::~ArrayDesc() {
    ad.inputs.clear();
    for (auto& [_, a] : input_map) {
      bool is_deletable =
-          (a.array_desc_.use_count() <= a.siblings().size() + 1);
+          (a.array_desc_.use_count() <= std::ssize(a.siblings()) + 1);
      // An array with siblings is deletable only if all of its siblings
      // are deletable
      for (auto& s : a.siblings()) {
@@ -283,7 +284,7 @@ array::ArrayDesc::~ArrayDesc() {
        }
        int is_input = (input_map.find(s.id()) != input_map.end());
        is_deletable &=
-            s.array_desc_.use_count() <= a.siblings().size() + is_input;
+            s.array_desc_.use_count() <= std::ssize(a.siblings()) + is_input;
      }
      if (is_deletable) {
        for_deletion.push_back(std::move(a.array_desc_));
--- a/mlx/array.h
+++ b/mlx/array.h
@@ -10,6 +10,7 @@
 #include "mlx/allocator.h"
 #include "mlx/dtype.h"
 #include "mlx/event.h"
+#include "mlx/small_vector.h"

 namespace mlx::core {

@@ -18,8 +19,8 @@ class Primitive;

 using Deleter = std::function<void(allocator::Buffer)>;
 using ShapeElem = int32_t;
-using Shape = std::vector<ShapeElem>;
-using Strides = std::vector<int64_t>;
+using Shape = SmallVector<ShapeElem>;
+using Strides = SmallVector<int64_t>;

 class array {
  /* An array is really a node in a graph. It contains a shared ArrayDesc
@@ -80,22 +81,22 @@ class array {
  }

  /** The size of the array's datatype in bytes. */
-  size_t itemsize() const {
+  int itemsize() const {
    return size_of(dtype());
  }

  /** The number of elements in the array. */
-  size_t size() const {
+  int64_t size() const {
    return array_desc_->size;
  }

  /** The number of bytes in the array. */
-  size_t nbytes() const {
+  int64_t nbytes() const {
    return size() * itemsize();
  }

  /** The number of dimensions of the array. */
-  size_t ndim() const {
+  int ndim() const {
    return array_desc_->shape.size();
  }

@@ -224,6 +225,10 @@ class array {
    // Not copyable
    Data(const Data& d) = delete;
    Data& operator=(const Data& d) = delete;
+    Data(Data&& o) : buffer(o.buffer), d(o.d) {
+      o.buffer = allocator::Buffer(nullptr);
+      o.d = [](allocator::Buffer) {};
+    }
    ~Data() {
      d(buffer);
    }
@@ -324,7 +329,7 @@ class array {
   * corresponding to ``arr[-1, -1, ...]``) then ``data_size = last - first``.
   * Note, ``data_size`` is in units of ``item_size`` (not bytes).
   **/
-  size_t data_size() const {
+  int64_t data_size() const {
    return array_desc_->data_size;
  }

@@ -335,7 +340,7 @@ class array {
    return array_desc_->data->buffer;
  }

-  size_t buffer_size() const {
+  int64_t buffer_size() const {
    return allocator::allocator().size(buffer());
  }

@@ -356,7 +361,7 @@ class array {
  }

  enum Status {
-    // The ouptut of a computation which has not been scheduled.
+    // The output of a computation which has not been scheduled.
    // For example, the status of `x` in `auto x = a + b`.
    unscheduled,

@@ -525,7 +530,7 @@ array::array(
    Shape shape,
    Dtype dtype /* = TypeToDtype<T>() */)
    : array_desc_(std::make_shared<ArrayDesc>(std::move(shape), dtype)) {
-  if (data.size() != size()) {
+  if (std::ssize(data) != size()) {
    throw std::invalid_argument(
        "Data size and provided shape mismatch in array construction.");
  }
--- a/mlx/backend/common/buffer_cache.h
+++ b/mlx/backend/common/buffer_cache.h
@@ -0,0 +1,190 @@
+// Copyright © 2025 Apple Inc.
+
+#pragma once
+
+#include <algorithm>
+#include <cassert>
+#include <functional>
+#include <map>
+
+namespace mlx::core {
+
+// A size-indexed cache of raw buffers of type T.
+//
+// Cached buffers are tracked twice: in a multimap keyed by buffer size (used
+// to find a buffer that is large enough) and in a doubly linked list ordered
+// by the time they were recycled, newest at the head (used to release the
+// oldest buffers first).
+template <typename T>
+class BufferCache {
+ public:
+  // page_size: slack unit used when deciding whether a cached buffer is too
+  //   large to hand out for a request.
+  // get_size: returns the size of a buffer; used as its key in the cache.
+  // free: releases a buffer that is evicted from the cache.
+  BufferCache(
+      size_t page_size,
+      std::function<size_t(T*)> get_size,
+      std::function<void(T*)> free)
+      : page_size_(page_size),
+        get_size_(std::move(get_size)),
+        free_(std::move(free)) {}
+
+  // Frees every buffer still held by the cache.
+  ~BufferCache() {
+    clear();
+  }
+
+  BufferCache(const BufferCache&) = delete;
+  BufferCache& operator=(const BufferCache&) = delete;
+
+  // Removes and returns the smallest cached buffer of at least `size`, or
+  // nullptr if there is none or if it is too oversized for the request, i.e.
+  // its size is >= min(2 * size, size + 2 * page_size).
+  T* reuse_from_cache(size_t size) {
+    // Find the closest buffer in pool.
+    auto it = buffer_pool_.lower_bound(size);
+    if (it == buffer_pool_.end() ||
+        it->first >= std::min(2 * size, size + 2 * page_size_)) {
+      return nullptr;
+    }
+
+    // Collect from the cache.
+    T* buf = it->second->buf;
+    pool_size_ -= it->first;
+
+    // Remove from record.
+    remove_from_list(it->second);
+    buffer_pool_.erase(it);
+    return buf;
+  }
+
+  // Adds `buf` to the cache as the most recently recycled entry. `buf` must
+  // not be null.
+  void recycle_to_cache(T* buf) {
+    assert(buf);
+    // Add to cache.
+    BufferHolder* bh = new BufferHolder(buf);
+    add_at_head(bh);
+    size_t size = get_size_(buf);
+    pool_size_ += size;
+    buffer_pool_.emplace(size, bh);
+  }
+
+  // Frees cached buffers, oldest first, until at least `min_bytes_to_free`
+  // bytes are released or the cache is empty. If the request is at least 90%
+  // of the cached bytes the whole cache is cleared instead. Returns the
+  // number of buffers freed.
+  int release_cached_buffers(size_t min_bytes_to_free) {
+    if (min_bytes_to_free >= 0.9 * pool_size_) {
+      return clear();
+    } else {
+      int n_release = 0;
+      size_t total_bytes_freed = 0;
+
+      while (tail_ && (total_bytes_freed < min_bytes_to_free)) {
+        // Release buffer.
+        size_t size = get_size_(tail_->buf);
+        total_bytes_freed += size;
+        free_(tail_->buf);
+        n_release++;
+
+        // Remove from record.
+        auto its = buffer_pool_.equal_range(size);
+        auto it = std::find_if(its.first, its.second, [this](const auto& el) {
+          return el.second == tail_;
+        });
+        // find_if signals "not found" with the end of the searched range,
+        // which is its.second and not necessarily buffer_pool_.end().
+        assert(it != its.second);
+        buffer_pool_.erase(it);
+        remove_from_list(tail_);
+      }
+
+      pool_size_ -= total_bytes_freed;
+      return n_release;
+    }
+  }
+
+  // Frees every cached buffer and resets the cache to empty. Returns the
+  // number of buffers freed.
+  int clear() {
+    int n_release = 0;
+    for (auto& [size, holder] : buffer_pool_) {
+      free_(holder->buf);
+      n_release++;
+      delete holder;
+    }
+    buffer_pool_.clear();
+    pool_size_ = 0;
+    head_ = nullptr;
+    tail_ = nullptr;
+    return n_release;
+  }
+
+  // Total size of all buffers currently held in the cache.
+  size_t cache_size() const {
+    return pool_size_;
+  }
+
+  size_t page_size() const {
+    return page_size_;
+  }
+
+ private:
+  // Node of the recency list; `buf` itself is released through free_.
+  struct BufferHolder {
+   public:
+    explicit BufferHolder(T* buf_) : buf(buf_) {}
+
+    BufferHolder* prev{nullptr};
+    BufferHolder* next{nullptr};
+    T* buf;
+  };
+
+  // Links `to_add` in front of the current head.
+  void add_at_head(BufferHolder* to_add) {
+    if (!head_) {
+      head_ = to_add;
+      tail_ = to_add;
+    } else {
+      head_->prev = to_add;
+      to_add->next = head_;
+      head_ = to_add;
+    }
+  }
+
+  // Unlinks `to_remove` from the list and deletes the node (not its buffer).
+  void remove_from_list(BufferHolder* to_remove) {
+    if (to_remove->prev && to_remove->next) { // if middle
+      to_remove->prev->next = to_remove->next;
+      to_remove->next->prev = to_remove->prev;
+    } else if (to_remove->prev && to_remove == tail_) { // if tail
+      tail_ = to_remove->prev;
+      tail_->next = nullptr;
+    } else if (to_remove == head_ && to_remove->next) { // if head
+      head_ = to_remove->next;
+      head_->prev = nullptr;
+    } else if (to_remove == head_ && to_remove == tail_) { // if only element
+      head_ = nullptr;
+      tail_ = nullptr;
+    }
+
+    delete to_remove;
+  }
+
+  // Cached buffers keyed by size; several buffers may share a size.
+  std::multimap<size_t, BufferHolder*> buffer_pool_;
+  // Most recently recycled entry.
+  BufferHolder* head_{nullptr};
+  // Least recently recycled entry; released first.
+  BufferHolder* tail_{nullptr};
+  // Sum of the sizes of all cached buffers.
+  size_t pool_size_{0};
+
+  const size_t page_size_;
+  std::function<size_t(T*)> get_size_;
+  std::function<void(T*)> free_;
+};
+
+} // namespace mlx::core
--- a/mlx/backend/common/common.cpp
+++ b/mlx/backend/common/common.cpp
@@ -21,8 +21,8 @@ void AsStrided::eval(const std::vector<array>& inputs, array& out) {

  // Compute the flags given the shape and strides
  bool row_contiguous = true, col_contiguous = true;
-  size_t r = 1, c = 1;
-  for (int i = strides_.size() - 1, j = 0; i >= 0; i--, j++) {
+  int64_t r = 1, c = 1;
+  for (int i = std::ssize(strides_) - 1, j = 0; i >= 0; i--, j++) {
    row_contiguous &= (r == strides_[i]) || (shape_[i] == 1);
    col_contiguous &= (c == strides_[j]) || (shape_[j] == 1);
    r *= shape_[i];
@@ -60,7 +60,8 @@ void CustomTransforms::eval(
    const std::vector<array>& inputs,
    std::vector<array>& outputs) {
  assert(inputs.size() > outputs.size());
-  for (int i = 0, j = inputs.size() - outputs.size(); i < outputs.size();
+  for (int i = 0, j = std::ssize(inputs) - std::ssize(outputs);
+       i < std::ssize(outputs);
       i++, j++) {
    outputs[i].copy_shared_buffer(inputs[j]);
  }
@@ -70,7 +71,7 @@ void Depends::eval(
    const std::vector<array>& inputs,
    std::vector<array>& outputs) {
  assert(inputs.size() > outputs.size());
-  for (int i = 0; i < outputs.size(); i++) {
+  for (int i = 0; i < std::ssize(outputs); i++) {
    outputs[i].copy_shared_buffer(inputs[i]);
  }
 }
@@ -206,11 +207,11 @@ void Split::eval(

  auto compute_new_flags = [](const auto& shape,
                              const auto& strides,
-                              size_t in_data_size,
+                              int64_t in_data_size,
                              auto flags) {
-    size_t data_size = 1;
-    size_t f_stride = 1;
-    size_t b_stride = 1;
+    int64_t data_size = 1;
+    int64_t f_stride = 1;
+    int64_t b_stride = 1;
    flags.row_contiguous = true;
    flags.col_contiguous = true;
    for (int i = 0, ri = shape.size() - 1; ri >= 0; i++, ri--) {
@@ -240,7 +241,7 @@ void Split::eval(

  std::vector<int> indices(1, 0);
  indices.insert(indices.end(), indices_.begin(), indices_.end());
-  for (int i = 0; i < indices.size(); i++) {
+  for (int i = 0; i < std::ssize(indices); i++) {
    size_t offset = indices[i] * in.strides()[axis_];
    auto [new_flags, data_size] = compute_new_flags(
        outputs[i].shape(), in.strides(), in.data_size(), in.flags());
@@ -254,7 +255,7 @@ void Squeeze::eval(const std::vector<array>& inputs, array& out) {
  const auto& in = inputs[0];
  Strides strides;
  for (int i = 0, j = 0; i < in.ndim(); ++i) {
-    if (j < axes_.size() && i == axes_[j]) {
+    if (j < std::ssize(axes_) && i == axes_[j]) {
      j++;
    } else {
      strides.push_back(in.strides(i));
@@ -272,7 +273,7 @@ void Transpose::eval(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 1);
  Strides out_strides(out.ndim());
  auto& in = inputs[0];
-  for (int ax = 0; ax < axes_.size(); ++ax) {
+  for (int ax = 0; ax < std::ssize(axes_); ++ax) {
    out_strides[ax] = in.strides()[axes_[ax]];
  }

--- a/mlx/backend/common/compiled.cpp
+++ b/mlx/backend/common/compiled.cpp
@@ -1,8 +1,7 @@
 // Copyright © 2023-2024 Apple Inc.

 #include "mlx/backend/common/compiled.h"
-#include "mlx/graph_utils.h"
-#include "mlx/primitives.h"
+#include "mlx/backend/common/utils.h"
 #include "mlx/utils.h"

 namespace mlx::core {
@@ -15,6 +14,8 @@ void print_constant(std::ostream& os, const array& x) {
      return print_float_constant<float16_t>(os, x);
    case bfloat16:
      return print_float_constant<bfloat16_t>(os, x);
+    case float64:
+      return print_float_constant<double>(os, x);
    case complex64:
      return print_complex_constant<complex64_t>(os, x);
    case int8:
@@ -51,6 +52,8 @@ std::string get_type_string(Dtype d) {
      return "float16_t";
    case bfloat16:
      return "bfloat16_t";
+    case float64:
+      return "double";
    case complex64:
      return "complex64_t";
    case bool_:
@@ -79,55 +82,6 @@ std::string get_type_string(Dtype d) {
  }
 }

-std::string build_lib_name(
-    const std::vector<array>& inputs,
-    const std::vector<array>& outputs,
-    const std::vector<array>& tape,
-    const std::unordered_set<uintptr_t>& constant_ids) {
-  NodeNamer namer;
-  std::ostringstream os;
-  std::ostringstream constant_hasher;
-
-  // Fill the input names. This is not really necessary, I just like having A,
-  // B, C, ... as the inputs.
-  for (auto& x : inputs) {
-    namer.get_name(x);
-  }
-
-  // The primitives describing the tape. For unary and binary primitives this
-  // must be enough to describe the full computation.
-  for (auto& a : tape) {
-    // name and type of output
-    os << namer.get_name(a) << kindof(a.dtype()) << a.itemsize();
-    // computation performed
-    a.primitive().print(os);
-    // name of inputs to the function
-    for (auto& inp : a.inputs()) {
-      os << namer.get_name(inp);
-    }
-  }
-  os << "_";
-
-  for (auto& x : inputs) {
-    if (constant_ids.find(x.id()) != constant_ids.end()) {
-      os << "C";
-      print_constant(constant_hasher, x);
-    } else {
-      os << (is_scalar(x) ? "S" : "V");
-    }
-  }
-  os << "_";
-  for (auto& x : inputs) {
-    if (constant_ids.find(x.id()) != constant_ids.end()) {
-      continue;
-    }
-    os << kindof(x.dtype()) << x.itemsize();
-  }
-  os << "_" << std::hash<std::string>{}(constant_hasher.str());
-
-  return os.str();
-}
-
 bool compiled_check_contiguity(
    const std::vector<array>& inputs,
    const Shape& shape) {
@@ -159,15 +113,14 @@ bool compiled_check_contiguity(
 void compiled_allocate_outputs(
    const std::vector<array>& inputs,
    std::vector<array>& outputs,
-    const std::vector<array>& inputs_,
-    const std::unordered_set<uintptr_t>& constant_ids_,
+    const std::function<bool(size_t)>& is_constant,
    bool contiguous) {
  if (contiguous) {
    int o = 0;
    Strides strides;
    size_t data_size;
    array::Flags flags;
-    for (int i = 0; i < inputs.size() && o < outputs.size(); ++i) {
+    for (int i = 0; i < std::ssize(inputs) && o < std::ssize(outputs); ++i) {
      auto& in = inputs[i];
      // Conditions for donation
      // - Correct size
@@ -175,8 +128,7 @@ void compiled_allocate_outputs(
      // - Donatable
      // - Not a constant
      if (in.itemsize() == outputs[o].itemsize() && !is_scalar(in) &&
-          in.is_donatable() &&
-          constant_ids_.find(inputs_[i].id()) == constant_ids_.end()) {
+          in.is_donatable() && !is_constant(i)) {
        outputs[o++].copy_shared_buffer(in);
      }
      // Get representative input flags to properly set non-donated outputs
@@ -186,7 +138,7 @@ void compiled_allocate_outputs(
        data_size = in.data_size();
      }
    }
-    for (; o < outputs.size(); ++o) {
+    for (; o < std::ssize(outputs); ++o) {
      outputs[o].set_data(
          allocator::malloc(data_size * outputs[o].itemsize()),
          data_size,
@@ -195,7 +147,7 @@ void compiled_allocate_outputs(
    }
  } else {
    int o = 0;
-    for (int i = 0; i < inputs.size() && o < outputs.size(); ++i) {
+    for (int i = 0; i < std::ssize(inputs) && o < std::ssize(outputs); ++i) {
      auto& in = inputs[i];
      // Conditions for donation
      // - Row contiguous
@@ -204,16 +156,86 @@ void compiled_allocate_outputs(
      // - Not a constant
      if (in.flags().row_contiguous && in.size() == outputs[o].size() &&
          in.itemsize() == outputs[o].itemsize() && in.is_donatable() &&
-          constant_ids_.find(inputs_[i].id()) == constant_ids_.end()) {
+          !is_constant(i)) {
        outputs[o].copy_shared_buffer(
            in, outputs[o].strides(), in.flags(), in.data_size());
        o++;
      }
    }
-    for (; o < outputs.size(); ++o) {
+    for (; o < std::ssize(outputs); ++o) {
      outputs[o].set_data(allocator::malloc(outputs[o].nbytes()));
    }
  }
 }

+std::tuple<bool, Shape, std::vector<Strides>> compiled_collapse_contiguous_dims(
+    const std::vector<array>& inputs,
+    const array& out,
+    const std::function<bool(size_t)>& is_constant) {
+  const Shape& shape = out.shape();
+  bool contiguous = compiled_check_contiguity(inputs, shape);
+  if (contiguous) {
+    return {true, shape, {}};
+  }
+
+  std::vector<Strides> strides_vec{out.strides()};
+  for (size_t i = 0; i < inputs.size(); ++i) {
+    // Skip constants.
+    if (is_constant(i)) {
+      continue;
+    }
+
+    // Skip scalar inputs.
+    const auto& x = inputs[i];
+    if (is_scalar(x)) {
+      continue;
+    }
+
+    // Broadcast the inputs to the output shape.
+    Strides xstrides;
+    int j = 0;
+    for (; j < std::ssize(shape) - x.ndim(); ++j) {
+      if (shape[j] == 1) {
+        xstrides.push_back(out.strides()[j]);
+      } else {
+        xstrides.push_back(0);
+      }
+    }
+    for (int k = 0; k < x.ndim(); ++k, ++j) {
+      if (x.shape(k) == 1) {
+        if (shape[j] == 1) {
+          xstrides.push_back(out.strides()[j]);
+        } else {
+          xstrides.push_back(0);
+        }
+      } else {
+        xstrides.push_back(x.strides()[k]);
+      }
+    }
+    strides_vec.push_back(std::move(xstrides));
+  }
+
+  auto tup = collapse_contiguous_dims(shape, strides_vec, INT32_MAX);
+  return {false, std::move(std::get<0>(tup)), std::move(std::get<1>(tup))};
+}
+
+bool compiled_use_large_index(
+    const std::vector<array>& inputs,
+    const std::vector<array>& outputs,
+    bool contiguous) {
+  // Contiguous: compare against the largest input data_size();
+  // otherwise compare against the largest output size().
+  int64_t largest = 0;
+  if (contiguous) {
+    for (const auto& in : inputs) {
+      largest = std::max(largest, in.data_size());
+    }
+  } else {
+    for (const auto& out : outputs) {
+      largest = std::max(largest, out.size());
+    }
+  }
+  return largest > UINT32_MAX;
+}
+
 } // namespace mlx::core
--- a/mlx/backend/common/compiled.h
+++ b/mlx/backend/common/compiled.h
@@ -1,9 +1,8 @@
 // Copyright © 2023-2024 Apple Inc.
 #pragma once

+#include <functional>
 #include <iomanip>
-#include <sstream>
-#include <unordered_set>

 #include "mlx/array.h"
 #include "mlx/primitives.h"
@@ -14,19 +13,17 @@ inline bool is_static_cast(const Primitive& p) {
  return (typeid(p) == typeid(Broadcast) || typeid(p) == typeid(AsType));
 }

-std::string build_lib_name(
-    const std::vector<array>& inputs,
-    const std::vector<array>& outputs,
-    const std::vector<array>& tape,
-    const std::unordered_set<uintptr_t>& constant_ids);
-
 std::string get_type_string(Dtype d);

 template <typename T>
 void print_float_constant(std::ostream& os, const array& x) {
  auto old_precision = os.precision();
-  os << std::setprecision(std::numeric_limits<float>::digits10 + 1)
-     << x.item<T>() << std::setprecision(old_precision);
+  if constexpr (std::is_same_v<T, double>) {
+    os << std::setprecision(std::numeric_limits<double>::digits10 + 1);
+  } else {
+    os << std::setprecision(std::numeric_limits<float>::digits10 + 1);
+  }
+  os << x.item<T>() << std::setprecision(old_precision);
 }

 template <typename T>
@@ -60,8 +57,19 @@ bool compiled_check_contiguity(
 void compiled_allocate_outputs(
    const std::vector<array>& inputs,
    std::vector<array>& outputs,
-    const std::vector<array>& inputs_,
-    const std::unordered_set<uintptr_t>& constant_ids_,
+    const std::function<bool(size_t)>& is_constant,
+    bool contiguous);
+
+// Collapse contiguous dims ignoring scalars and constants.
+std::tuple<bool, Shape, std::vector<Strides>> compiled_collapse_contiguous_dims(
+    const std::vector<array>& inputs,
+    const array& out,
+    const std::function<bool(size_t)>& is_constant);
+
+// Return whether the kernel should use large index.
+bool compiled_use_large_index(
+    const std::vector<array>& inputs,
+    const std::vector<array>& outputs,
    bool contiguous);

 } // namespace mlx::core
--- a/mlx/backend/common/copy.h
+++ b/mlx/backend/common/copy.h
@@ -2,7 +2,7 @@

 #pragma once

-#include "mlx/array.h"
+#include "mlx/backend/common/utils.h"

 namespace mlx::core {

@@ -26,7 +26,7 @@ inline bool set_copy_output_data(const array& in, array& out, CopyType ctype) {
  if (ctype == CopyType::Vector) {
    // If the input is donateable, we are doing a vector copy and the types
    // have the same size, then the input buffer can hold the output.
-    if (in.is_donatable() && in.itemsize() == out.itemsize()) {
+    if (is_donatable(in, out)) {
      out.copy_shared_buffer(in);
      return true;
    } else {
--- a/mlx/backend/common/hadamard.h
+++ b/mlx/backend/common/hadamard.h
@@ -99,7 +99,11 @@ inline std::pair<int, int> decompose_hadamard(int n) {
          "[hadamard] Only supports n = m*2^k where m in (1, 12, 20, 28).");
    }
  }
+  if (n > (1 << 26)) {
+    throw std::invalid_argument(
+        "[hadamard] Only supports n = m*2^k where k <= 26");
+  }
  return {n, m};
 }

-} // namespace mlx::core
+} // namespace mlx::core
--- a/mlx/backend/common/load.cpp
+++ b/mlx/backend/common/load.cpp
@@ -27,7 +27,7 @@ void swap_endianness(uint8_t* data_bytes, size_t N) {

 namespace mlx::core {

-void Load::eval_cpu(const std::vector<array>& inputs, array& out) {
+void Load::eval_cpu(const std::vector<array>& /* inputs */, array& out) {
  out.set_data(allocator::malloc(out.nbytes()));
  auto read_task = [out_ptr = out.data<char>(),
                    size = out.size(),
--- a/mlx/backend/common/matmul.h
+++ b/mlx/backend/common/matmul.h
@@ -0,0 +1,67 @@
+// Copyright © 2025 Apple Inc.
+
+#pragma once
+
+#include "mlx/backend/common/utils.h"
+#include "mlx/utils.h"
+
+#include <sstream>
+
+namespace mlx::core {
+
+inline std::tuple<Shape, Strides, Strides> collapse_batches(
+    const array& a,
+    const array& b) {
+  if (a.ndim() == 2) {
+    return {Shape{1}, Strides{0}, Strides{0}};
+  }
+
+  Shape A_bshape{a.shape().begin(), a.shape().end() - 2};
+  Strides A_bstride{a.strides().begin(), a.strides().end() - 2};
+  Strides B_bstride{b.strides().begin(), b.strides().end() - 2};
+
+  auto [batch_shape, batch_strides] =
+      collapse_contiguous_dims(A_bshape, std::vector{A_bstride, B_bstride});
+
+  auto a_batch_strides = batch_strides[0];
+  auto b_batch_strides = batch_strides[1];
+
+  if (batch_shape.empty()) {
+    batch_shape.push_back(1);
+    a_batch_strides.push_back(0);
+    b_batch_strides.push_back(0);
+  }
+
+  return std::make_tuple(batch_shape, a_batch_strides, b_batch_strides);
+}
+
+inline std::tuple<Shape, Strides, Strides, Strides>
+collapse_batches(const array& a, const array& b, const array& c) {
+  if (a.ndim() == 2) {
+    return {Shape{1}, Strides{0}, Strides{0}, Strides{0}};
+  }
+
+  Shape A_bshape{a.shape().begin(), a.shape().end() - 2};
+  Strides A_bstride{a.strides().begin(), a.strides().end() - 2};
+  Strides B_bstride{b.strides().begin(), b.strides().end() - 2};
+  Strides C_bstride{c.strides().begin(), c.strides().end() - 2};
+
+  auto [batch_shape, batch_strides] = collapse_contiguous_dims(
+      A_bshape, std::vector{A_bstride, B_bstride, C_bstride});
+
+  auto A_batch_stride = batch_strides[0];
+  auto B_batch_stride = batch_strides[1];
+  auto C_batch_stride = batch_strides[2];
+
+  if (batch_shape.empty()) {
+    batch_shape.push_back(1);
+    A_batch_stride.push_back(0);
+    B_batch_stride.push_back(0);
+    C_batch_stride.push_back(0);
+  }
+
+  return std::make_tuple(
+      batch_shape, A_batch_stride, B_batch_stride, C_batch_stride);
+}
+
+} // namespace mlx::core
--- a/mlx/backend/common/reduce.cpp
+++ b/mlx/backend/common/reduce.cpp
@@ -5,11 +5,9 @@
 namespace mlx::core {

 std::pair<Shape, Strides> shapes_without_reduction_axes(
-    const array& x,
+    Shape shape,
+    Strides strides,
    const std::vector<int>& axes) {
-  auto shape = x.shape();
-  auto strides = x.strides();
-
  for (int i = axes.size() - 1; i >= 0; i--) {
    int a = axes[i];
    shape.erase(shape.begin() + a);
@@ -19,9 +17,18 @@ std::pair<Shape, Strides> shapes_without_reduction_axes(
  return std::make_pair(shape, strides);
 }

+std::pair<Shape, Strides> shapes_without_reduction_axes(
+    const array& x,
+    const std::vector<int>& axes) {
+  auto shape = x.shape();
+  auto strides = x.strides();
+  return shapes_without_reduction_axes(
+      std::move(shape), std::move(strides), axes);
+}
+
 ReductionPlan get_reduction_plan(const array& x, const std::vector<int>& axes) {
  // The data is all there and we are reducing over everything
-  if (x.size() == x.data_size() && axes.size() == x.ndim() &&
+  if (x.size() == x.data_size() && std::ssize(axes) == x.ndim() &&
      x.flags().contiguous) {
    return ContiguousAllReduce;
  }
@@ -31,7 +38,7 @@ ReductionPlan get_reduction_plan(const array& x, const std::vector<int>& axes) {
    // Merge consecutive axes
    Shape shape = {x.shape(axes[0])};
    Strides strides = {x.strides()[axes[0]]};
-    for (int i = 1; i < axes.size(); i++) {
+    for (int i = 1; i < std::ssize(axes); i++) {
      if (axes[i] - 1 == axes[i - 1] && x.shape(axes[i]) > 1) {
        shape.back() *= x.shape(axes[i]);
        strides.back() = x.strides()[axes[i]];
--- a/mlx/backend/common/reduce.h
+++ b/mlx/backend/common/reduce.h
@@ -51,5 +51,9 @@ ReductionPlan get_reduction_plan(const array& x, const std::vector<int>& axes);
 std::pair<Shape, Strides> shapes_without_reduction_axes(
    const array& x,
    const std::vector<int>& axes);
+std::pair<Shape, Strides> shapes_without_reduction_axes(
+    Shape shape,
+    Strides strides,
+    const std::vector<int>& axes);

 } // namespace mlx::core
--- a/mlx/backend/common/slicing.cpp
+++ b/mlx/backend/common/slicing.cpp
@@ -24,8 +24,8 @@ std::tuple<int64_t, Strides> prepare_slice(
 void shared_buffer_slice(
    const array& in,
    const Strides& out_strides,
-    size_t data_offset,
-    size_t data_size,
+    int64_t data_offset,
+    int64_t data_size,
    array& out) {
  // Compute row/col contiguity
  auto [no_bsx_size, is_row_contiguous, is_col_contiguous] =
@@ -61,7 +61,7 @@ void slice(
  if (data_end < 0) {
    data_end += in.data_size();
  }
-  size_t data_size = (data_end - data_offset);
+  int64_t data_size = (data_end - data_offset);
  shared_buffer_slice(in, inp_strides, data_offset, data_size, out);
 }

--- a/mlx/backend/common/ternary.h
+++ b/mlx/backend/common/ternary.h
@@ -11,6 +11,8 @@ namespace mlx::core {
 enum class TernaryOpType {
  ScalarScalarScalar,
  VectorVectorVector,
+  VectorVectorScalar,
+  VectorScalarVector,
  General,
 };

@@ -25,6 +27,14 @@ get_ternary_op_type(const array& a, const array& b, const array& c) {
      (a.flags().col_contiguous && b.flags().col_contiguous &&
       c.flags().col_contiguous)) {
    topt = TernaryOpType::VectorVectorVector;
+  } else if (
+      b.data_size() == 1 && a.flags().row_contiguous &&
+      c.flags().row_contiguous) {
+    topt = TernaryOpType::VectorScalarVector;
+  } else if (
+      c.data_size() == 1 && a.flags().row_contiguous &&
+      b.flags().row_contiguous) {
+    topt = TernaryOpType::VectorVectorScalar;
  } else {
    topt = TernaryOpType::General;
  }
@@ -59,6 +69,8 @@ inline void set_ternary_op_output_data(
            b.flags());
      }
      break;
+    case TernaryOpType::VectorVectorScalar:
+    case TernaryOpType::VectorScalarVector:
    case TernaryOpType::General:
      // Try to donate an input which is row_contiguous
      if (!((a.flags().row_contiguous && maybe_donate(a)) ||
--- a/mlx/backend/common/unary.h
+++ b/mlx/backend/common/unary.h
@@ -0,0 +1,26 @@
+// Copyright © 2025 Apple Inc.
+
+#pragma once
+
+#include "mlx/allocator.h"
+#include "mlx/backend/common/utils.h"
+
+namespace mlx::core {
+
+inline void set_unary_output_data(const array& in, array& out) {
+  if (in.flags().contiguous) {
+    if (is_donatable(in, out)) {
+      out.copy_shared_buffer(in);
+    } else {
+      out.set_data(
+          allocator::malloc(in.data_size() * out.itemsize()),
+          in.data_size(),
+          in.strides(),
+          in.flags());
+    }
+  } else {
+    out.set_data(allocator::malloc(out.nbytes()));
+  }
+}
+
+} // namespace mlx::core
--- a/mlx/backend/common/utils.cpp
+++ b/mlx/backend/common/utils.cpp
@@ -1,9 +1,22 @@
 // Copyright © 2023-2024 Apple Inc.

+#include <dlfcn.h>
+
 #include "mlx/backend/common/utils.h"

 namespace mlx::core {

+std::filesystem::path current_binary_dir() {
+  static std::filesystem::path binary_dir = []() {
+    Dl_info info;
+    if (!dladdr(reinterpret_cast<void*>(&current_binary_dir), &info)) {
+      throw std::runtime_error("Unable to get current binary dir.");
+    }
+    return std::filesystem::path(info.dli_fname).parent_path();
+  }();
+  return binary_dir;
+}
+
 std::tuple<Shape, std::vector<Strides>> collapse_contiguous_dims(
    const Shape& shape,
    const std::vector<Strides>& strides,
@@ -15,7 +28,7 @@ std::tuple<Shape, std::vector<Strides>> collapse_contiguous_dims(
    if (shape[0] != 1) {
      to_collapse.push_back(0);
    }
-    size_t size = shape[0];
+    int64_t size = shape[0];
    for (int i = 1; i < shape.size(); i++) {
      bool contiguous = true;
      size *= shape[i];
@@ -51,7 +64,7 @@ std::tuple<Shape, std::vector<Strides>> collapse_contiguous_dims(
      current_shape *= shape[to_collapse[k]];
    }
    out_shape.push_back(current_shape);
-    for (int j = 0; j < strides.size(); j++) {
+    for (int j = 0; j < std::ssize(strides); j++) {
      const auto& st = strides[j];
      out_strides[j].push_back(st[to_collapse[k - 1]]);
    }
@@ -101,4 +114,118 @@ std::pair<Shape, Strides> collapse_contiguous_dims(
  return collapse_contiguous_dims(a.shape(), a.strides(), size_cap);
 }

+Dims get_block_dims_common(int dim0, int dim1, int dim2, int pow2 /* = 10 */) {
+  // Choose power-of-two block dimensions (bx, by, bz) for a problem of size
+  // dim0 x dim1 x dim2 such that bx * by * bz <= 2^pow2 and no block
+  // dimension exceeds max(dim, 1) for its axis.
+  //
+  // Doublings are handed out round-robin, at most one per axis per pass, so
+  // the block grows evenly across the axes that still have room instead of
+  // exhausting the budget on dim0 first.
+  //
+  // The budget is always compared against `pow2`. Checking a literal 10
+  // between axes could overshoot 2^pow2 when pow2 < 10 and could stop early
+  // at 2^10 when pow2 > 10. Results for the default pow2 = 10 are unchanged.
+  // A non-positive pow2 yields a 1 x 1 x 1 block.
+  const int dims[3] = {dim0, dim1, dim2};
+  int pows[3] = {0, 0, 0};
+  int sum = 0; // Exponent budget used so far, i.e. log2(bx * by * bz).
+  bool grew = true;
+  while (grew && sum < pow2) {
+    grew = false;
+    for (int d = 0; d < 3 && sum < pow2; ++d) {
+      // Double this axis only while the doubled size still fits the problem.
+      if (dims[d] >= (1 << (pows[d] + 1))) {
+        pows[d]++;
+        sum++;
+        grew = true;
+      }
+    }
+  }
+  return std::make_tuple(1ul << pows[0], 1ul << pows[1], 1ul << pows[2]);
+}
+
+Dims get_2d_grid_dims_common(const Shape& shape, const Strides& strides) {
+  // Dims with strides of 0 are ignored as they
+  // correspond to broadcasted dimensions
+  size_t grid_x = 1;
+  size_t grid_y = 1;
+  for (int i = 0; i < shape.size(); ++i) {
+    if (strides[i] == 0) {
+      continue;
+    }
+    if (grid_x * shape[i] < UINT32_MAX) {
+      grid_x *= shape[i];
+    } else {
+      grid_y *= shape[i];
+    }
+  }
+  if (grid_y > UINT32_MAX || grid_x > UINT32_MAX) {
+    throw std::runtime_error("Unable to safely factor shape.");
+  }
+  if (grid_y > grid_x) {
+    std::swap(grid_x, grid_y);
+  }
+  return std::make_tuple(
+      static_cast<uint32_t>(grid_x), static_cast<uint32_t>(grid_y), 1);
+}
+
+Dims get_2d_grid_dims_common(
+    const Shape& shape,
+    const Strides& strides,
+    size_t divisor) {
+  // Compute the 2d grid dimensions such that the total size of the grid is
+  // divided by divisor.
+  size_t grid_x = 1;
+  size_t grid_y = 1;
+  for (int i = 0; i < shape.size(); ++i) {
+    if (strides[i] == 0) {
+      continue;
+    }
+
+    // No need to add this shape we can just remove it from the divisor.
+    if (divisor % shape[i] == 0) {
+      divisor /= shape[i];
+      continue;
+    }
+
+    if (grid_x * shape[i] < UINT32_MAX) {
+      grid_x *= shape[i];
+    } else {
+      grid_y *= shape[i];
+    }
+
+    if (divisor > 1) {
+      if (grid_x % divisor == 0) {
+        grid_x /= divisor;
+        divisor = 1;
+      } else if (grid_y % divisor == 0) {
+        grid_y /= divisor;
+        divisor = 1;
+      }
+    }
+  }
+  if (grid_y > UINT32_MAX || grid_x > UINT32_MAX) {
+    throw std::runtime_error("Unable to safely factor shape.");
+  }
+  if (grid_y > grid_x) {
+    std::swap(grid_x, grid_y);
+  }
+  if (divisor > 1) {
+    grid_x = ((grid_x + divisor - 1) / divisor) * divisor;
+  }
+  return std::make_tuple(
+      static_cast<uint32_t>(grid_x), static_cast<uint32_t>(grid_y), 1);
+}
+
+std::pair<Dims, Dims> get_grid_and_block_common(int dim0, int dim1, int dim2) {
+  auto [bx, by, bz] = get_block_dims_common(dim0, dim1, dim2);
+  auto gx = (dim0 + bx - 1) / bx;
+  auto gy = (dim1 + by - 1) / by;
+  auto gz = (dim2 + bz - 1) / bz;
+
+  return std::make_pair(
+      std::make_tuple(gx, gy, gz), std::make_tuple(bx, by, bz));
+}
+
 } // namespace mlx::core
--- a/mlx/backend/common/utils.h
+++ b/mlx/backend/common/utils.h
@@ -2,12 +2,17 @@

 #pragma once

+#include <filesystem>
+#include <tuple>
 #include <vector>

 #include "mlx/array.h"

 namespace mlx::core {

+// Return the directory that contains the current shared library.
+std::filesystem::path current_binary_dir();
+
 inline int64_t
 elem_to_loc(int elem, const Shape& shape, const Strides& strides) {
  int64_t loc = 0;
@@ -70,6 +75,31 @@ std::pair<Shape, Strides> collapse_contiguous_dims(
    const array& a,
    int64_t size_cap = std::numeric_limits<int32_t>::max());

+// Compute the thread block dimensions which fit the given
+// input dimensions.
+// - The thread block dimensions will be powers of two
+// - The thread block size will be at most 2^pow2
+using Dims = std::tuple<uint32_t, uint32_t, uint32_t>;
+Dims get_block_dims_common(int dim0, int dim1, int dim2, int pow2 = 10);
+
+// Computes a 2D grid where each element is < UINT_MAX
+// Assumes:
+// - overall size (product of non-broadcasted dimensions) is < UINT_MAX^2
+// - shape and strides correspond to a contiguous (no holes) but
+//   possibly broadcasted array
+Dims get_2d_grid_dims_common(const Shape& shape, const Strides& strides);
+
+// Same as above but we do an implicit division with divisor.
+// Basically, equivalent to factorizing
+//    Prod(shape[i] \forall i if strides[i] != 0) / divisor.
+Dims get_2d_grid_dims_common(
+    const Shape& shape,
+    const Strides& strides,
+    size_t divisor);
+
+// Get both the block and a grid of blocks that covers dim0, dim1 and dim2.
+std::pair<Dims, Dims> get_grid_and_block_common(int dim0, int dim1, int dim2);
+
 struct ContiguousIterator {
  inline void step() {
    int dims = shape_.size();
@@ -132,7 +162,7 @@ struct ContiguousIterator {
 };

 inline auto check_contiguity(const Shape& shape, const Strides& strides) {
-  size_t no_broadcast_data_size = 1;
+  int64_t no_broadcast_data_size = 1;
  int64_t f_stride = 1;
  int64_t b_stride = 1;
  bool is_row_contiguous = true;
@@ -153,7 +183,7 @@ inline auto check_contiguity(const Shape& shape, const Strides& strides) {
 }

 inline bool is_donatable(const array& in, const array& out) {
-  constexpr size_t donation_extra = 16384;
+  constexpr int64_t donation_extra = 16384;

  return in.is_donatable() && in.itemsize() == out.itemsize() &&
      in.buffer_size() <= out.nbytes() + donation_extra;
@@ -165,4 +195,11 @@ void shared_buffer_reshape(
    const array& in,
    const Strides& out_strides,
    array& out);
+
+template <typename T>
+inline SmallVector<T> remove_index(SmallVector<T> vec, size_t index) {
+  vec.erase(std::next(vec.begin(), index));
+  return vec;
+}
+
 } // namespace mlx::core
--- a/mlx/backend/cpu/CMakeLists.txt
+++ b/mlx/backend/cpu/CMakeLists.txt
@@ -40,11 +40,13 @@ add_dependencies(mlx cpu_compiled_preamble)

 target_sources(
  mlx
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/arg_reduce.cpp
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/available.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/arg_reduce.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/binary.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/conv.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/copy.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/distributed.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/eig.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/eigh.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/encoder.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/fft.cpp
--- a/mlx/backend/cpu/arange.h
+++ b/mlx/backend/cpu/arange.h
@@ -10,7 +10,7 @@ namespace mlx::core {
 namespace {

 template <typename T>
-void arange(T start, T next, array& out, size_t size, Stream stream) {
+void arange(T start, T next, array& out, int64_t size, Stream stream) {
  auto ptr = out.data<T>();
  auto step_size = next - start;
  auto& encoder = cpu::get_command_encoder(stream);
--- a/mlx/backend/cpu/arg_reduce.cpp
+++ b/mlx/backend/cpu/arg_reduce.cpp
@@ -14,19 +14,17 @@ template <typename InT, typename OpT>
 void arg_reduce(const array& in, array& out, const OpT& op, int axis) {
  auto axis_size = in.shape()[axis];
  auto axis_stride = in.strides()[axis];
-  Strides strides = in.strides();
-  Shape shape = in.shape();
-  strides.erase(strides.begin() + axis);
-  shape.erase(shape.begin() + axis);
+  Strides strides = remove_index(in.strides(), axis);
+  Shape shape = remove_index(in.shape(), axis);
  auto in_ptr = in.data<InT>();
  auto out_ptr = out.data<uint32_t>();

-  for (uint32_t i = 0; i < out.size(); ++i) {
+  for (int64_t i = 0; i < out.size(); ++i) {
    auto loc = elem_to_loc(i, shape, strides);
    auto local_in_ptr = in_ptr + loc;
    uint32_t ind_v = 0;
    InT v = (*local_in_ptr);
-    for (uint32_t j = 0; j < axis_size; ++j, local_in_ptr += axis_stride) {
+    for (int64_t j = 0; j < axis_size; ++j, local_in_ptr += axis_stride) {
      op(j, (*local_in_ptr), &ind_v, &v);
    }
    out_ptr[i] = ind_v;
--- a/mlx/backend/cpu/available.cpp
+++ b/mlx/backend/cpu/available.cpp
@@ -0,0 +1,11 @@
+// Copyright © 2025 Apple Inc.
+
+#include "mlx/backend/cpu/available.h"
+
+namespace mlx::core::cpu {
+
+bool is_available() {
+  return true;
+}
+
+} // namespace mlx::core::cpu
--- a/mlx/backend/cpu/available.h
+++ b/mlx/backend/cpu/available.h
@@ -0,0 +1,9 @@
+// Copyright © 2025 Apple Inc.
+
+#pragma once
+
+namespace mlx::core::cpu {
+
+bool is_available();
+
+} // namespace mlx::core::cpu
--- a/mlx/backend/cpu/binary.cpp
+++ b/mlx/backend/cpu/binary.cpp
@@ -17,7 +17,12 @@ namespace mlx::core {
 namespace {

 template <typename Op>
-void binary(const array& a, const array& b, array& out, Op op, Stream stream) {
+void binary(
+    const array& a,
+    const array& b,
+    array& out,
+    Op /* op */,
+    Stream stream) {
  auto bopt = get_binary_op_type(a, b);
  set_binary_op_output_data(a, b, out, bopt);

@@ -81,7 +86,7 @@ void comparison_op(
    const array& a,
    const array& b,
    array& out,
-    Op op,
+    Op /* op */,
    Stream stream) {
  auto bopt = get_binary_op_type(a, b);
  set_binary_op_output_data(a, b, out, bopt);
@@ -146,7 +151,7 @@ void binary_float(
    const array& a,
    const array& b,
    array& out,
-    Op op,
+    Op /* op */,
    Stream stream) {
  auto bopt = get_binary_op_type(a, b);
  set_binary_op_output_data(a, b, out, bopt);
@@ -187,7 +192,7 @@ void binary_int(
    const array& a,
    const array& b,
    array& out,
-    Op op,
+    Op /* op */,
    Stream stream) {
  auto bopt = get_binary_op_type(a, b);
  set_binary_op_output_data(a, b, out, bopt);
--- a/mlx/backend/cpu/binary_two.h
+++ b/mlx/backend/cpu/binary_two.h
@@ -99,7 +99,7 @@ void binary_op_dispatch_dims(
  ContiguousIterator a_it(shape, a_strides, ndim - 2);
  ContiguousIterator b_it(shape, b_strides, ndim - 2);
  auto stride = out_strides[ndim - 3];
-  for (size_t elem = 0; elem < a.size(); elem += stride) {
+  for (int64_t elem = 0; elem < std::ssize(a); elem += stride) {
    binary_op_dims<T, U, Op, 2>(
        a_ptr + a_it.loc,
        b_ptr + b_it.loc,
@@ -137,21 +137,21 @@ void binary_op(
  if (bopt == BinaryOpType::ScalarScalar) {
    std::tie(*out_a_ptr, *out_b_ptr) = op(*a_ptr, *b_ptr);
  } else if (bopt == BinaryOpType::ScalarVector) {
-    for (size_t i = 0; i < b.data_size(); ++i) {
+    for (int64_t i = 0; i < b.data_size(); ++i) {
      std::tie(*out_a_ptr, *out_b_ptr) = op(*a_ptr, *b_ptr);
      out_a_ptr++;
      out_b_ptr++;
      b_ptr++;
    }
  } else if (bopt == BinaryOpType::VectorScalar) {
-    for (size_t i = 0; i < a.data_size(); ++i) {
+    for (int64_t i = 0; i < a.data_size(); ++i) {
      std::tie(*out_a_ptr, *out_b_ptr) = op(*a_ptr, *b_ptr);
      out_a_ptr++;
      out_b_ptr++;
      a_ptr++;
    }
  } else { // VectorVector
-    for (size_t i = 0; i < a.size(); ++i) {
+    for (int64_t i = 0; i < a.size(); ++i) {
      std::tie(*out_a_ptr, *out_b_ptr) = op(*a_ptr, *b_ptr);
      out_a_ptr++;
      out_b_ptr++;
--- a/mlx/backend/cpu/cholesky.cpp
+++ b/mlx/backend/cpu/cholesky.cpp
@@ -20,7 +20,7 @@ void cholesky_impl(const array& a, array& factor, bool upper, Stream stream) {

  // The decomposition is computed in place, so just copy the input to the
  // output.
-  copy(
+  copy_cpu(
      a,
      factor,
      a.flags().row_contiguous ? CopyType::Vector : CopyType::General,
@@ -33,8 +33,8 @@ void cholesky_impl(const array& a, array& factor, bool upper, Stream stream) {
                    N = a.shape(-1),
                    size = a.size()]() mutable {
    char uplo = (upper) ? 'L' : 'U';
-    size_t num_matrices = size / (N * N);
-    for (int i = 0; i < num_matrices; i++) {
+    int64_t num_matrices = size / (N * N);
+    for (int64_t i = 0; i < num_matrices; i++) {
      // Compute Cholesky factorization.
      int info;
      potrf<T>(
--- a/mlx/backend/cpu/compiled.cpp
+++ b/mlx/backend/cpu/compiled.cpp
@@ -15,6 +15,7 @@
 #include "mlx/backend/cpu/jit_compiler.h"
 #include "mlx/device.h"
 #include "mlx/graph_utils.h"
+#include "mlx/version.h"

 namespace mlx::core {

@@ -48,7 +49,7 @@ static CompilerCache& cache() {
 // GPU compile is always available if the GPU is available and since we are in
 // this file CPU compile is also available.
 namespace detail {
-bool compile_available_for_device(const Device& device) {
+bool compile_available_for_device(const Device& /* device */) {
  return true;
 }

@@ -94,7 +95,11 @@ void* compile(
    kernel_file_name = kernel_name;
  }

-  auto output_dir = std::filesystem::temp_directory_path();
+  auto output_dir =
+      std::filesystem::temp_directory_path() / "mlx" / version() / "cpu";
+  if (!std::filesystem::exists(output_dir)) {
+    std::filesystem::create_directories(output_dir);
+  }

  std::string shared_lib_name = "lib" + kernel_file_name + ".so";
  auto shared_lib_path = (output_dir / shared_lib_name).string();
@@ -146,18 +151,9 @@ inline void build_kernel(
    const std::vector<array>& inputs,
    const std::vector<array>& outputs,
    const std::vector<array>& tape,
-    const std::unordered_set<uintptr_t>& constant_ids,
+    const std::function<bool(size_t)>& is_constant,
    bool contiguous,
    int ndim) {
-  // All outputs should have the exact same shape and will be row contiguous
-  auto output_shape = outputs[0].shape();
-  auto output_strides = outputs[0].strides();
-
-  // Constants are scalars that are captured by value and cannot change
-  auto is_constant = [&constant_ids](const array& x) {
-    return constant_ids.find(x.id()) != constant_ids.end();
-  };
-
  NodeNamer namer;

 #ifdef _MSC_VER
@@ -166,25 +162,28 @@ inline void build_kernel(
 #endif

  // Start the kernel
-  os << "void " << kernel_name << "(void** args) {" << std::endl;
+  os << "void " << kernel_name
+     << "(int* shape, int64_t** strides, void** args) {" << std::endl;

  // Add the input arguments
  int cnt = 0;
-  for (auto& x : inputs) {
-    auto& xname = namer.get_name(x);
-
+  int strides_index = 1;
+  for (int i = 0; i < std::ssize(inputs); ++i) {
    // Skip constants from the input list
-    if (is_constant(x)) {
+    if (is_constant(i)) {
      continue;
    }

+    const auto& x = inputs[i];
+    auto& xname = namer.get_name(x);
+
    auto tstr = get_type_string(x.dtype());
    os << "  " << tstr << "* " << xname << " = (" << tstr << "*)args[" << cnt++
       << "];" << std::endl;
    // Scalars and contiguous need no strides
    if (!is_scalar(x) && !contiguous) {
-      os << "  const size_t* " << xname << "_strides = (size_t*)args[" << cnt++
-         << "];" << std::endl;
+      os << "  const int64_t* " << xname << "_strides = strides["
+         << strides_index++ << "];" << std::endl;
    }
  }

@@ -194,10 +193,8 @@ inline void build_kernel(
    os << "  " << tstr << "* " << namer.get_name(x) << " = (" << tstr
       << "*)args[" << cnt++ << "];" << std::endl;
  }
-  // Add output strides and shape to extract the indices.
-  if (!contiguous) {
-    os << "  const int* shape = (int*)args[" << cnt++ << "];" << std::endl;
-  } else {
+  // Add output size
+  if (contiguous) {
    os << "  const size_t size = (size_t)args[" << cnt++ << "];" << std::endl;
  }

@@ -211,10 +208,11 @@ inline void build_kernel(
  }

  // Read the inputs in tmps
-  for (auto& x : inputs) {
+  for (size_t i = 0; i < inputs.size(); ++i) {
+    const auto& x = inputs[i];
    auto& xname = namer.get_name(x);

-    if (is_constant(x)) {
+    if (is_constant(i)) {
      os << "  " << get_type_string(x.dtype()) << " tmp_" << xname << " = ";
      print_constant(os, x);
      os << ";" << std::endl;
@@ -238,9 +236,9 @@ inline void build_kernel(
      os << "static_cast<" << get_type_string(x.dtype()) << ">(tmp_"
         << namer.get_name(x.inputs()[0]) << ");" << std::endl;
    } else {
-      x.primitive().print(os);
+      os << x.primitive().name();
      os << "()(";
-      for (int i = 0; i < x.inputs().size() - 1; i++) {
+      for (int i = 0; i < std::ssize(x.inputs()) - 1; i++) {
        os << "tmp_" << namer.get_name(x.inputs()[i]) << ", ";
      }
      os << "tmp_" << namer.get_name(x.inputs().back()) << ");" << std::endl;
@@ -264,8 +262,9 @@ inline void build_kernel(
  } else {
    for (int d = ndim - 1; d >= 0; --d) {
      // Update pointers
-      for (auto& x : inputs) {
-        if (is_constant(x) || is_scalar(x)) {
+      for (size_t i = 0; i < inputs.size(); ++i) {
+        const auto& x = inputs[i];
+        if (is_constant(i) || is_scalar(x)) {
          continue;
        }
        auto& xname = namer.get_name(x);
@@ -287,65 +286,33 @@ inline void build_kernel(
 void Compiled::eval_cpu(
    const std::vector<array>& inputs,
    std::vector<array>& outputs) {
-  if (kernel_lib_.empty()) {
-    kernel_lib_ = build_lib_name(inputs_, outputs_, tape_, constant_ids_);
-  }
-
-  // Figure out which kernel we are using
-  auto& shape = outputs[0].shape();
-  auto contiguous = compiled_check_contiguity(inputs, shape);
  auto& encoder = cpu::get_command_encoder(stream());

-  // Handle all broadcasting and collect function input arguments
+  // Collapse contiguous dims to route to a faster kernel if possible. Also
+  // handle all broadcasting.
+  auto [contiguous, shape, strides] =
+      compiled_collapse_contiguous_dims(inputs, outputs[0], is_constant_);
+
+  // Collect function input arguments.
  std::vector<void*> args;
-  std::vector<std::vector<size_t>> strides;
-  for (int i = 0; i < inputs.size(); i++) {
-    // Skip constants.
-    if (constant_ids_.find(inputs_[i].id()) != constant_ids_.end()) {
+  for (size_t i = 0; i < inputs.size(); ++i) {
+    if (is_constant_(i)) {
      continue;
    }
-    auto& x = inputs[i];
+    const auto& x = inputs[i];
    encoder.set_input_array(x);
    args.push_back((void*)x.data<void>());
-
-    if (contiguous || is_scalar(x)) {
-      continue;
-    }
-
-    // Broadcast the input to the output shape.
-    std::vector<size_t> xstrides;
-    int j = 0;
-    for (; j < shape.size() - x.ndim(); j++) {
-      if (shape[j] == 1) {
-        xstrides.push_back(outputs[0].strides()[j]);
-      } else {
-        xstrides.push_back(0);
-      }
-    }
-    for (int i = 0; i < x.ndim(); i++, j++) {
-      if (x.shape(i) == 1) {
-        if (shape[j] == 1) {
-          xstrides.push_back(outputs[0].strides()[j]);
-        } else {
-          xstrides.push_back(0);
-        }
-      } else {
-        xstrides.push_back(x.strides()[i]);
-      }
-    }
-    strides.push_back(std::move(xstrides));
-    args.push_back(strides.back().data());
  }

  // Get the kernel name from the lib
  int ndim = shape.size();
  auto kernel_name = kernel_lib_ + (contiguous ? "_contiguous" : "_strided_");
  if (!contiguous) {
-    kernel_name += std::to_string(shape.size());
+    kernel_name += std::to_string(ndim);
  }

  // Get the function
-  auto fn_ptr = compile(kernel_name, [&]() {
+  auto fn_ptr = compile(kernel_name, [&, contiguous = contiguous]() {
    std::ostringstream kernel;
    kernel << get_kernel_preamble() << std::endl;
    kernel << "extern \"C\"  {" << std::endl;
@@ -355,7 +322,7 @@ void Compiled::eval_cpu(
        inputs_,
        outputs_,
        tape_,
-        constant_ids_,
+        is_constant_,
        contiguous,
        ndim);
    // Close extern "C"
@@ -363,26 +330,26 @@ void Compiled::eval_cpu(
    return kernel.str();
  });

-  compiled_allocate_outputs(
-      inputs, outputs, inputs_, constant_ids_, contiguous);
+  compiled_allocate_outputs(inputs, outputs, is_constant_, contiguous);

  for (auto& x : outputs) {
    args.push_back(x.data<void>());
    encoder.set_output_array(x);
  }
-  Shape out_shape;
-  if (!contiguous) {
-    out_shape = outputs[0].shape();
-    args.push_back((void*)out_shape.data());
-  } else {
+  if (contiguous) {
    args.push_back((void*)outputs[0].data_size());
  }
-  auto fun = (void (*)(void**))fn_ptr;
-  encoder.dispatch(
-      [fun,
-       args = std::move(args),
-       strides = std::move(strides),
-       out_shape = std::move(out_shape)]() mutable { fun(args.data()); });
+  auto fun = reinterpret_cast<void (*)(int*, int64_t**, void**)>(fn_ptr);
+  encoder.dispatch([fun,
+                    args = std::move(args),
+                    strides = std::move(strides),
+                    shape = std::move(shape)]() mutable {
+    SmallVector<int64_t*> strides_ptrs;
+    for (auto& s : strides) {
+      strides_ptrs.push_back(s.data());
+    }
+    fun(shape.data(), strides_ptrs.data(), args.data());
+  });
 }

 } // namespace mlx::core
--- a/mlx/backend/cpu/conv.cpp
+++ b/mlx/backend/cpu/conv.cpp
--- a/mlx/backend/cpu/copy.cpp
+++ b/mlx/backend/cpu/copy.cpp
@@ -295,7 +295,11 @@ inline void copy_inplace_dispatch(

 } // namespace

-void copy_inplace(const array& src, array& dst, CopyType ctype, Stream stream) {
+void copy_cpu_inplace(
+    const array& src,
+    array& dst,
+    CopyType ctype,
+    Stream stream) {
  auto& encoder = cpu::get_command_encoder(stream);
  encoder.set_input_array(src);
  encoder.set_output_array(dst);
@@ -305,7 +309,7 @@ void copy_inplace(const array& src, array& dst, CopyType ctype, Stream stream) {
       ctype]() mutable { copy_inplace_dispatch(src, dst, ctype); });
 }

-void copy(const array& src, array& dst, CopyType ctype, Stream stream) {
+void copy_cpu(const array& src, array& dst, CopyType ctype, Stream stream) {
  bool donated = set_copy_output_data(src, dst, ctype);
  if (donated && src.dtype() == dst.dtype()) {
    // If the output has the same type as the input then there is nothing to
@@ -315,10 +319,10 @@ void copy(const array& src, array& dst, CopyType ctype, Stream stream) {
  if (ctype == CopyType::GeneralGeneral) {
    ctype = CopyType::General;
  }
-  copy_inplace(src, dst, ctype, stream);
+  copy_cpu_inplace(src, dst, ctype, stream);
 }

-void copy_inplace(
+void copy_cpu_inplace(
    const array& src,
    array& dst,
    const Shape& data_shape,
@@ -373,4 +377,10 @@ void copy_inplace(
      });
 }

+array contiguous_copy_cpu(const array& arr, Stream stream) {
+  array arr_copy(arr.shape(), arr.dtype(), nullptr, {});
+  copy_cpu(arr, arr_copy, CopyType::General, stream);
+  return arr_copy;
+}
+
 } // namespace mlx::core
--- a/mlx/backend/cpu/copy.h
+++ b/mlx/backend/cpu/copy.h
@@ -10,10 +10,14 @@

 namespace mlx::core {

-void copy(const array& src, array& dst, CopyType ctype, Stream stream);
-void copy_inplace(const array& src, array& dst, CopyType ctype, Stream stream);
+void copy_cpu(const array& src, array& dst, CopyType ctype, Stream stream);
+void copy_cpu_inplace(
+    const array& src,
+    array& dst,
+    CopyType ctype,
+    Stream stream);

-void copy_inplace(
+void copy_cpu_inplace(
    const array& src,
    array& dst,
    const Shape& data_shape,
@@ -26,4 +30,7 @@ void copy_inplace(
    const std::optional<array>& dynamic_i_offset = std::nullopt,
    const std::optional<array>& dynamic_o_offset = std::nullopt);

+// Return a contiguous array with same shape that copies the data of |arr|.
+array contiguous_copy_cpu(const array& arr, Stream stream);
+
 } // namespace mlx::core
--- a/mlx/backend/cpu/distributed.cpp
+++ b/mlx/backend/cpu/distributed.cpp
@@ -13,9 +13,7 @@ std::pair<array, bool> ensure_row_contiguous(const array& arr, Stream stream) {
  if (arr.flags().row_contiguous) {
    return {arr, false};
  } else {
-    array arr_copy(arr.shape(), arr.dtype(), nullptr, {});
-    copy(arr, arr_copy, CopyType::General, stream);
-    return {arr_copy, true};
+    return {contiguous_copy_cpu(arr, stream), true};
  }
 };

@@ -34,8 +32,7 @@ void AllReduce::eval_cpu(
      }
      return in;
    } else {
-      array arr_copy(in.shape(), in.dtype(), nullptr, {});
-      copy(in, arr_copy, CopyType::General, s);
+      array arr_copy = contiguous_copy_cpu(in, s);
      out.copy_shared_buffer(arr_copy);
      return arr_copy;
    }
@@ -93,6 +90,7 @@ void Recv::eval_cpu(
    std::vector<array>& outputs) {
  assert(inputs.size() == 0);
  assert(outputs.size() == 1);
+  (void)inputs;

  outputs[0].set_data(allocator::malloc(outputs[0].nbytes()));
  distributed::detail::recv(group(), outputs[0], src_, stream());
--- a/mlx/backend/cpu/eig.cpp
+++ b/mlx/backend/cpu/eig.cpp
@@ -0,0 +1,173 @@
+// Copyright © 2025 Apple Inc.
+
+#include "mlx/allocator.h"
+#include "mlx/array.h"
+#include "mlx/backend/cpu/copy.h"
+#include "mlx/backend/cpu/encoder.h"
+#include "mlx/backend/cpu/lapack.h"
+#include "mlx/linalg.h"
+#include "mlx/primitives.h"
+
+namespace mlx::core {
+
+namespace {
+
+// Eigendecomposition of each N x N matrix in `a` using LAPACK geev: writes
+// complex eigenvalues to `values` and, when requested, vectors to `vectors`.
+template <typename T>
+void eig_impl(
+    array& a,
+    array& vectors,
+    array& values,
+    bool compute_eigenvectors,
+    Stream stream) {
+  using OT = std::complex<T>;
+  auto a_ptr = a.data<T>();
+  auto eig_ptr = values.data<OT>();
+
+  auto& encoder = cpu::get_command_encoder(stream);
+  encoder.set_input_array(a);
+  encoder.set_output_array(values);
+  OT* vec_ptr = nullptr;
+  if (compute_eigenvectors) {
+    encoder.set_output_array(vectors);
+    vec_ptr = vectors.data<OT>();
+  }
+  encoder.dispatch([a_ptr,
+                    vec_ptr,
+                    eig_ptr,
+                    compute_eigenvectors,
+                    N = vectors.shape(-1),
+                    size = vectors.size()]() mutable {
+    // Work query: lwork == -1 asks geev for the workspace size only.
+    char jobr = 'N';
+    char jobl = compute_eigenvectors ? 'V' : 'N';
+    int n_vecs_r = 1;
+    int n_vecs_l = compute_eigenvectors ? N : 1;
+    int lwork = -1;
+    int info;
+    T work;
+    geev<T>(
+        &jobl,
+        &jobr,
+        &N,
+        nullptr,
+        &N,
+        nullptr,
+        nullptr,
+        nullptr,
+        &n_vecs_l,
+        nullptr,
+        &n_vecs_r,
+        &work,
+        &lwork,
+        &info);
+    lwork = static_cast<int>(work);
+
+    auto eig_tmp_data = array::Data{allocator::malloc(sizeof(T) * N * 2)};
+    auto vec_tmp_data =
+        array::Data{allocator::malloc(vec_ptr ? sizeof(T) * N * N * 2 : 0)};
+    auto eig_tmp = static_cast<T*>(eig_tmp_data.buffer.raw_ptr());
+    auto vec_tmp = static_cast<T*>(vec_tmp_data.buffer.raw_ptr());
+    auto work_buf = array::Data{allocator::malloc(sizeof(T) * lwork)};
+    for (int64_t m = 0; m < size / (N * N); ++m) {
+      geev<T>(
+          &jobl,
+          &jobr,
+          &N,
+          a_ptr,
+          &N,
+          eig_tmp,
+          eig_tmp + N,
+          vec_tmp,
+          &n_vecs_l,
+          nullptr,
+          &n_vecs_r,
+          static_cast<T*>(work_buf.buffer.raw_ptr()),
+          &lwork,
+          &info);
+      if (info != 0) { // Bail out before reading unreliable results.
+        std::stringstream msg;
+        msg << "[Eig::eval_cpu] Eigenvalue decomposition failed with error code "
+            << info;
+        throw std::runtime_error(msg.str());
+      }
+      for (int i = 0; i < N; ++i) {
+        eig_ptr[i] = {eig_tmp[i], eig_tmp[N + i]};
+      }
+      if (vec_ptr) {
+        for (int i = 0; i < N; ++i) {
+          if (eig_ptr[i].imag() != 0) {
+            // Conjugate pair: tmp rows i / i + 1 hold the real / imag parts.
+            for (int j = 0; j < N; ++j) {
+              vec_ptr[i * N + j] = {
+                  vec_tmp[i * N + j], -vec_tmp[(i + 1) * N + j]};
+              vec_ptr[(i + 1) * N + j] = {
+                  vec_tmp[i * N + j], vec_tmp[(i + 1) * N + j]};
+            }
+            i += 1;
+          } else {
+            for (int j = 0; j < N; ++j) {
+              vec_ptr[i * N + j] = {vec_tmp[i * N + j], 0};
+            }
+          }
+        }
+        vec_ptr += N * N;
+      }
+      a_ptr += N * N;
+      eig_ptr += N;
+    }
+  });
+  encoder.add_temporary(a);
+}
+
+} // namespace
+
+void Eig::eval_cpu(
+    const std::vector<array>& inputs,
+    std::vector<array>& outputs) {
+  const auto& a = inputs[0];
+  auto& values = outputs[0];
+
+  auto vectors = compute_eigenvectors_
+      ? outputs[1]
+      : array(a.shape(), complex64, nullptr, {});
+
+  auto a_copy = array(a.shape(), a.dtype(), nullptr, {});
+  copy_cpu(
+      a,
+      a_copy,
+      a.flags().row_contiguous ? CopyType::Vector : CopyType::General,
+      stream());
+
+  values.set_data(allocator::malloc(values.nbytes()));
+
+  if (compute_eigenvectors_) {
+    // Set the strides and flags so the eigenvectors
+    // are in the columns of the output
+    auto flags = vectors.flags();
+    auto strides = vectors.strides();
+    auto ndim = a.ndim();
+    std::swap(strides[ndim - 1], strides[ndim - 2]);
+
+    if (a.size() > 1) {
+      flags.row_contiguous = false;
+      if (ndim > 2) {
+        flags.col_contiguous = false;
+      } else {
+        flags.col_contiguous = true;
+      }
+    }
+    vectors.set_data(
+        allocator::malloc(vectors.nbytes()), vectors.size(), strides, flags);
+  }
+  switch (a.dtype()) {
+    case float32:
+      eig_impl<float>(a_copy, vectors, values, compute_eigenvectors_, stream());
+      break;
+    default:
+      throw std::runtime_error("[Eig::eval_cpu] only supports float32.");
+  }
+}
+
+} // namespace mlx::core
--- a/mlx/backend/cpu/eigh.cpp
+++ b/mlx/backend/cpu/eigh.cpp
@@ -12,6 +12,133 @@ namespace mlx::core {

 namespace {

+template <typename T, class Enable = void>
+struct EighWork {}; // primary template is empty; the specializations hold state
+
+template <typename T>
+struct EighWork<
+    T,
+    typename std::enable_if<std::is_floating_point<T>::value>::type> {
+  using R = T; // type of the `values` buffer; equal to T for real inputs
+  // syevd arguments (passed by pointer) plus the scratch buffers run() uses.
+  char jobz;
+  char uplo;
+  int N;
+  int lwork; // -1 for the size query, then the size of buffers[0] in T
+  int liwork; // -1 for the size query, then the size of buffers[1] in int
+  int info; // status written by the most recent syevd call
+  std::vector<array::Data> buffers;
+  // Size query (lwork = liwork = -1), then allocation of both buffers.
+  EighWork(char jobz_, char uplo_, int N_)
+      : jobz(jobz_), uplo(uplo_), N(N_), lwork(-1), liwork(-1) {
+    T work;
+    int iwork;
+    syevd<T>(
+        &jobz,
+        &uplo,
+        &N,
+        nullptr,
+        &N,
+        nullptr,
+        &work,
+        &lwork,
+        &iwork,
+        &liwork,
+        &info);
+    // NOTE(review): `info` from the query is not checked before use below.
+    lwork = static_cast<int>(work);
+    liwork = iwork;
+    buffers.emplace_back(allocator::malloc(sizeof(T) * lwork));
+    buffers.emplace_back(allocator::malloc(sizeof(int) * liwork));
+  }
+  // Runs syevd on the N x N matrix at `vectors`; `info` holds the status.
+  void run(T* vectors, T* values) {
+    syevd<T>(
+        &jobz,
+        &uplo,
+        &N,
+        vectors,
+        &N,
+        values,
+        static_cast<T*>(buffers[0].buffer.raw_ptr()),
+        &lwork,
+        static_cast<int*>(buffers[1].buffer.raw_ptr()),
+        &liwork,
+        &info);
+  }
+};
+
+template <>
+struct EighWork<std::complex<float>> {
+  using T = std::complex<float>;
+  using R = float; // heevd takes float buffers for `values` and rwork
+  // heevd arguments (passed by pointer) plus the scratch buffers run() uses.
+  char jobz;
+  char uplo;
+  int N;
+  int lwork; // -1 for the size query, then the size of buffers[0] in T
+  int lrwork; // -1 for the size query, then the size of buffers[1] in R
+  int liwork; // -1 for the size query, then the size of buffers[2] in int
+  int info; // status written by the most recent heevd call
+  std::vector<array::Data> buffers;
+  // Size query (all three sizes -1), then allocation of the three buffers.
+  EighWork(char jobz_, char uplo_, int N_)
+      : jobz(jobz_), uplo(uplo_), N(N_), lwork(-1), lrwork(-1), liwork(-1) {
+    T work;
+    R rwork;
+    int iwork;
+    heevd<T>(
+        &jobz,
+        &uplo,
+        &N,
+        nullptr,
+        &N,
+        nullptr,
+        &work,
+        &lwork,
+        &rwork,
+        &lrwork,
+        &iwork,
+        &liwork,
+        &info);
+    // NOTE(review): `info` from the query is not checked before use below.
+    lwork = static_cast<int>(work.real());
+    lrwork = static_cast<int>(rwork);
+    liwork = iwork;
+    buffers.emplace_back(allocator::malloc(sizeof(T) * lwork));
+    buffers.emplace_back(allocator::malloc(sizeof(R) * lrwork));
+    buffers.emplace_back(allocator::malloc(sizeof(int) * liwork));
+  }
+  // Runs heevd on the N x N matrix at `vectors`; `info` holds the status.
+  void run(T* vectors, R* values) {
+    heevd<T>(
+        &jobz,
+        &uplo,
+        &N,
+        vectors,
+        &N,
+        values,
+        static_cast<T*>(buffers[0].buffer.raw_ptr()),
+        &lwork,
+        static_cast<R*>(buffers[1].buffer.raw_ptr()),
+        &lrwork,
+        static_cast<int*>(buffers[2].buffer.raw_ptr()),
+        &liwork,
+        &info);
+    if (jobz == 'V') {
+      // We have pre-transposed the vectors but we also must conjugate them
+      // when they are complex.
+      //
+      // We could vectorize this but it is so fast in comparison to heevd that
+      // it doesn't really matter.
+      for (int i = 0; i < N; i++) {
+        for (int j = 0; j < N; j++) { // visits all N * N entries in order
+          *vectors = std::conj(*vectors);
+          vectors++;
+        }
+      }
+    }
+  }
+};
+
 template <typename T>
 void eigh_impl(
    array& vectors,
@@ -19,8 +146,10 @@ void eigh_impl(
    const std::string& uplo,
    bool compute_eigenvectors,
    Stream stream) {
+  using R = typename EighWork<T>::R;
+
  auto vec_ptr = vectors.data<T>();
-  auto eig_ptr = values.data<T>();
+  auto eig_ptr = values.data<R>();
  char jobz = compute_eigenvectors ? 'V' : 'N';

  auto& encoder = cpu::get_command_encoder(stream);
@@ -33,49 +162,17 @@ void eigh_impl(
                    N = vectors.shape(-1),
                    size = vectors.size()]() mutable {
    // Work query
-    int lwork = -1;
-    int liwork = -1;
-    int info;
-    {
-      T work;
-      int iwork;
-      syevd<T>(
-          &jobz,
-          &uplo,
-          &N,
-          nullptr,
-          &N,
-          nullptr,
-          &work,
-          &lwork,
-          &iwork,
-          &liwork,
-          &info);
-      lwork = static_cast<int>(work);
-      liwork = iwork;
-    }
+    EighWork<T> work(jobz, uplo, N);

-    auto work_buf = array::Data{allocator::malloc(sizeof(T) * lwork)};
-    auto iwork_buf = array::Data{allocator::malloc(sizeof(int) * liwork)};
-    for (size_t i = 0; i < size / (N * N); ++i) {
-      syevd<T>(
-          &jobz,
-          &uplo,
-          &N,
-          vec_ptr,
-          &N,
-          eig_ptr,
-          static_cast<T*>(work_buf.buffer.raw_ptr()),
-          &lwork,
-          static_cast<int*>(iwork_buf.buffer.raw_ptr()),
-          &liwork,
-          &info);
+    // Work loop
+    for (int64_t i = 0; i < size / (N * N); ++i) {
+      work.run(vec_ptr, eig_ptr);
      vec_ptr += N * N;
      eig_ptr += N;
-      if (info != 0) {
+      if (work.info != 0) {
        std::stringstream msg;
        msg << "[Eigh::eval_cpu] Eigenvalue decomposition failed with error code "
-            << info;
+            << work.info;
        throw std::runtime_error(msg.str());
      }
    }
@@ -99,7 +196,7 @@ void Eigh::eval_cpu(

  values.set_data(allocator::malloc(values.nbytes()));

-  copy(
+  copy_cpu(
      a,
      vectors,
      a.flags().row_contiguous ? CopyType::Vector : CopyType::General,
@@ -131,6 +228,10 @@ void Eigh::eval_cpu(
      eigh_impl<double>(
          vectors, values, uplo_, compute_eigenvectors_, stream());
      break;
+    case complex64:
+      eigh_impl<std::complex<float>>(
+          vectors, values, uplo_, compute_eigenvectors_, stream());
+      break;
    default:
      throw std::runtime_error(
          "[Eigh::eval_cpu] only supports float32 or float64.");
--- a/mlx/backend/cpu/encoder.h
+++ b/mlx/backend/cpu/encoder.h
@@ -20,8 +20,8 @@ struct CommandEncoder {
  CommandEncoder(CommandEncoder&&) = delete;
  CommandEncoder& operator=(CommandEncoder&&) = delete;

-  void set_input_array(const array& a) {}
-  void set_output_array(array& a) {}
+  void set_input_array(const array& /* a */) {} // empty body: does nothing
+  void set_output_array(array& /* a */) {} // empty body: does nothing

  // Hold onto a temporary until any already scheduled tasks which use it as
  // an input are complete.
--- a/mlx/backend/cpu/gemm.h
+++ b/mlx/backend/cpu/gemm.h
@@ -12,12 +12,12 @@ void matmul(
    T* out,
    bool a_transposed,
    bool b_transposed,
-    size_t lda,
-    size_t ldb,
-    size_t ldc,
+    int64_t lda,
+    int64_t ldb,
+    int64_t ldc,
    float alpha,
    float beta,
-    size_t batch_size,
+    int64_t batch_size,
    const Shape& a_shape,
    const Strides& a_strides,
    const Shape& b_shape,
--- a/mlx/backend/cpu/gemms/bnns.cpp
+++ b/mlx/backend/cpu/gemms/bnns.cpp
@@ -1,5 +1,4 @@
 // Copyright © 2023-2024 Apple Inc.
-
 #include <Accelerate/Accelerate.h>

 #include "mlx/array.h"
@@ -35,7 +34,7 @@ void matmul_bnns(
    bool b_transposed,
    size_t lda,
    size_t ldb,
-    size_t ldc,
+    size_t /* ldc */,
    float alpha,
    float beta,
    size_t batch_size,
@@ -49,9 +48,15 @@ void matmul_bnns(
  size_t K = a_shape[ndim - 1];

  BNNSDataType bnns_dtype = to_bnns_dtype<T>();
-
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
+  if (beta != 1.0 && beta != 0.0) {
+    // scale the output
+    for (size_t i = 0; i < batch_size * M * N; ++i) {
+      out[i] *= beta;
+    }
+    beta = 1.0;
+  }
  const BNNSLayerParametersBroadcastMatMul gemm_params{
      /* float alpha = */ alpha,
      /* float beta = */ beta,
@@ -122,7 +127,7 @@ void matmul_bnns(
  auto bnns_filter =
      BNNSFilterCreateLayerBroadcastMatMul(&gemm_params, nullptr);

-  for (int i = 0; i < batch_size; ++i) {
+  for (size_t i = 0; i < batch_size; ++i) {
    BNNSFilterApplyTwoInput(
        bnns_filter,
        reinterpret_cast<const uint8_t*>(
@@ -143,12 +148,12 @@ void matmul<float16_t>(
    float16_t* out,
    bool a_transposed,
    bool b_transposed,
-    size_t lda,
-    size_t ldb,
-    size_t ldc,
+    int64_t lda,
+    int64_t ldb,
+    int64_t ldc,
    float alpha,
    float beta,
-    size_t batch_size,
+    int64_t batch_size,
    const Shape& a_shape,
    const Strides& a_strides,
    const Shape& b_shape,
@@ -178,12 +183,12 @@ void matmul<bfloat16_t>(
    bfloat16_t* out,
    bool a_transposed,
    bool b_transposed,
-    size_t lda,
-    size_t ldb,
-    size_t ldc,
+    int64_t lda,
+    int64_t ldb,
+    int64_t ldc,
    float alpha,
    float beta,
-    size_t batch_size,
+    int64_t batch_size,
    const Shape& a_shape,
    const Strides& a_strides,
    const Shape& b_shape,
--- a/mlx/backend/cpu/gemms/cblas.cpp
+++ b/mlx/backend/cpu/gemms/cblas.cpp
@@ -13,20 +13,20 @@ void matmul<float>(
    float* out,
    bool a_transposed,
    bool b_transposed,
-    size_t lda,
-    size_t ldb,
-    size_t ldc,
+    int64_t lda,
+    int64_t ldb,
+    int64_t ldc,
    float alpha,
    float beta,
-    size_t batch_size,
+    int64_t batch_size,
    const Shape& a_shape,
    const Strides& a_strides,
    const Shape& b_shape,
    const Strides& b_strides) {
  auto ndim = a_shape.size();
-  size_t M = a_shape[ndim - 2];
-  size_t N = b_shape[ndim - 1];
-  size_t K = a_shape[ndim - 1];
+  int64_t M = a_shape[ndim - 2];
+  int64_t N = b_shape[ndim - 1];
+  int64_t K = a_shape[ndim - 1];

  for (int i = 0; i < batch_size; ++i) {
    cblas_sgemm(
@@ -54,20 +54,20 @@ void matmul<double>(
    double* out,
    bool a_transposed,
    bool b_transposed,
-    size_t lda,
-    size_t ldb,
-    size_t ldc,
+    int64_t lda,
+    int64_t ldb,
+    int64_t ldc,
    float alpha,
    float beta,
-    size_t batch_size,
+    int64_t batch_size,
    const Shape& a_shape,
    const Strides& a_strides,
    const Shape& b_shape,
    const Strides& b_strides) {
  auto ndim = a_shape.size();
-  size_t M = a_shape[ndim - 2];
-  size_t N = b_shape[ndim - 1];
-  size_t K = a_shape[ndim - 1];
+  int64_t M = a_shape[ndim - 2];
+  int64_t N = b_shape[ndim - 1];
+  int64_t K = a_shape[ndim - 1];

  for (int i = 0; i < batch_size; ++i) {
    cblas_dgemm(
@@ -88,4 +88,47 @@ void matmul<double>(
  }
 }

+template <> // complex64 specialization: batched GEMM through cblas_cgemm
+void matmul<complex64_t>(
+    const complex64_t* a,
+    const complex64_t* b,
+    complex64_t* out,
+    bool a_transposed,
+    bool b_transposed,
+    int64_t lda,
+    int64_t ldb,
+    int64_t ldc,
+    float alpha,
+    float beta,
+    int64_t batch_size,
+    const Shape& a_shape,
+    const Strides& a_strides,
+    const Shape& b_shape,
+    const Strides& b_strides) {
+  auto ndim = a_shape.size();
+  int64_t M = a_shape[ndim - 2];
+  int64_t N = b_shape[ndim - 1];
+  int64_t K = a_shape[ndim - 1];
+  auto calpha = static_cast<complex64_t>(alpha); // cgemm takes scalars by pointer
+  auto cbeta = static_cast<complex64_t>(beta);
+  // One cgemm per batch element; `out` is written densely, M * N per batch.
+  for (int64_t i = 0; i < batch_size; ++i) { // index as wide as batch_size
+    cblas_cgemm(
+        CblasRowMajor,
+        a_transposed ? CblasTrans : CblasNoTrans, // transA
+        b_transposed ? CblasTrans : CblasNoTrans, // transB
+        M,
+        N,
+        K,
+        &calpha,
+        a + elem_to_loc(M * K * i, a_shape, a_strides),
+        lda,
+        b + elem_to_loc(K * N * i, b_shape, b_strides),
+        ldb,
+        &cbeta,
+        out + M * N * i,
+        ldc);
+  }
+}
+
 } // namespace mlx::core
--- a/mlx/backend/cpu/hadamard.cpp
+++ b/mlx/backend/cpu/hadamard.cpp
@@ -11,9 +11,9 @@ namespace mlx::core {

 // n = 2^k component
 template <typename T>
-void hadamard_n(T* out, int n, int m, float scale, size_t size) {
+void hadamard_n(T* out, int n, int /* m */, float scale, int64_t size) {
  for (int b = 0; b < size / n; b++) {
-    size_t loc = b * n;
+    int64_t loc = b * n;
    T* data_ptr = out + loc;
    int h = 1;
    int n_over_2 = n / 2;
@@ -37,7 +37,7 @@ void hadamard_n(T* out, int n, int m, float scale, size_t size) {

 // m component
 template <typename T>
-void hadamard_m(T* out, int n, int m, float scale, size_t size) {
+void hadamard_m(T* out, int n, int m, float scale, int64_t size) {
  auto h_matrices = hadamard_matrices();
  auto& matrix = h_matrices[m];
  auto start = 1;
@@ -45,7 +45,7 @@ void hadamard_m(T* out, int n, int m, float scale, size_t size) {
  std::vector<bool> hmat_vec;
  while (end != std::string_view::npos) {
    auto row = matrix.substr(start, end - start);
-    for (int i = 0; i < row.length(); i++) {
+    for (int i = 0; i < std::ssize(row); i++) {
      hmat_vec.push_back(row[i] == '+');
    }
    start = end + 1;
@@ -53,7 +53,7 @@ void hadamard_m(T* out, int n, int m, float scale, size_t size) {
  }

  for (int b = 0; b < size / m / n; b++) {
-    size_t loc = b * n * m;
+    int64_t loc = b * n * m;
    T* data_ptr = out + loc;
    for (int i = 0; i < n; i++) {
      std::vector<float> out(m);
@@ -96,7 +96,7 @@ void Hadamard::eval_cpu(const std::vector<array>& inputs, array& out) {
  if (in.flags().row_contiguous && in.is_donatable()) {
    out.copy_shared_buffer(in);
  } else {
-    copy(
+    copy_cpu(
        in,
        out,
        in.flags().row_contiguous ? CopyType::Vector : CopyType::General,
--- a/mlx/backend/cpu/indexing.cpp
+++ b/mlx/backend/cpu/indexing.cpp
@@ -78,7 +78,7 @@ void gather(
    can_copy = true;

    // Ignore leading 1s
-    int i = 0;
+    int64_t i = 0;
    for (; i < slice_sizes.size() && slice_sizes[i] == 1; ++i)
      ;

@@ -91,7 +91,7 @@ void gather(
    can_copy = true;

    // Ignore trailing 1s
-    int i = slice_sizes.size() - 1;
+    int64_t i = slice_sizes.size() - 1;
    for (; i >= 0 && slice_sizes[i] == 1; --i)
      ;

@@ -101,11 +101,11 @@ void gather(
      can_copy = (src.shape(i) == slice_sizes[i]);
    }
  }
-  size_t slice_size = 1;
+  int64_t slice_size = 1;
  for (auto s : slice_sizes) {
    slice_size *= s;
  }
-  size_t ind_size = slice_size == 0 ? 0 : out.size() / slice_size;
+  int64_t ind_size = slice_size == 0 ? 0 : out.size() / slice_size;
  const T* src_ptr = src.data<T>();
  T* dst_ptr = out.data<T>();

@@ -115,10 +115,10 @@ void gather(
    src_it = ContiguousIterator(slice_sizes, src.strides(), src.ndim());
  }

-  size_t out_idx = 0;
-  for (int idx = 0; idx < ind_size; idx++) {
-    size_t src_idx = 0;
-    for (int ii = 0; ii < inds.size(); ++ii) {
+  int64_t out_idx = 0;
+  for (int64_t idx = 0; idx < ind_size; idx++) {
+    int64_t src_idx = 0;
+    for (int ii = 0; ii < std::ssize(inds); ++ii) {
      auto ax = axes[ii];
      auto idx_loc = its[ii].loc;
      its[ii].step();
@@ -134,7 +134,7 @@ void gather(
          src_ptr + src_idx, src_ptr + src_idx + slice_size, dst_ptr + out_idx);
      out_idx += slice_size;
    } else {
-      for (int jj = 0; jj < slice_size; jj++) {
+      for (int64_t jj = 0; jj < slice_size; jj++) {
        dst_ptr[out_idx++] = src_ptr[src_idx + src_it.loc];
        src_it.step();
      }
@@ -257,15 +257,11 @@ void gather_axis(
    const array& ind,
    array& out,
    const int axis) {
-  auto strides = ind.strides();
-  strides.erase(strides.begin() + axis);
-  auto shape = ind.shape();
-  shape.erase(shape.begin() + axis);
-  ContiguousIterator ind_it(shape, strides, src.ndim() - 1);
-
-  strides = src.strides();
-  strides.erase(strides.begin() + axis);
-  ContiguousIterator src_it(shape, strides, src.ndim() - 1);
+  auto shape = remove_index(ind.shape(), axis);
+  ContiguousIterator ind_it(
+      shape, remove_index(ind.strides(), axis), src.ndim() - 1);
+  ContiguousIterator src_it(
+      shape, remove_index(src.strides(), axis), src.ndim() - 1);

  auto ind_ptr = ind.data<IdxT>();
  auto src_ptr = src.data<T>();
@@ -407,11 +403,11 @@ void scatter(
    const std::vector<int>& axes) {
  int nind = inds.size();
  auto inds_ndim = updates.ndim() - out.ndim();
-  size_t n_updates = nind ? inds[0].size() : 1;
+  int64_t n_updates = nind ? inds[0].size() : 1;

  Shape update_shape(
      updates.shape().begin() + inds_ndim, updates.shape().end());
-  size_t update_size = 1;
+  int64_t update_size = 1;
  for (auto us : update_shape) {
    update_size *= us;
  }
@@ -422,9 +418,9 @@ void scatter(

  auto out_ptr = out.data<InT>();
  auto upd_ptr = updates.data<InT>();
-  for (int i = 0; i < n_updates; ++i) {
-    size_t out_offset = 0;
-    for (int j = 0; j < inds.size(); ++j) {
+  for (int64_t i = 0; i < n_updates; ++i) {
+    int64_t out_offset = 0;
+    for (int j = 0; j < std::ssize(inds); ++j) {
      auto ax = axes[j];
      auto idx_loc = its[j].loc;
      its[j].step();
@@ -433,7 +429,7 @@ void scatter(
      out_offset += (idx_val * out.strides()[ax]);
    }
    update_it.seek(i * update_size);
-    for (int j = 0; j < update_size; ++j) {
+    for (int64_t j = 0; j < update_size; ++j) {
      OpT{}(upd_ptr[update_it.loc], out_ptr + out_offset + out_it.loc);
      update_it.step();
      out_it.step();
@@ -521,7 +517,7 @@ void Scatter::eval_cpu(const std::vector<array>& inputs, array& out) {
  // Copy src into out (copy allocates memory for out)
  auto ctype =
      src.flags().row_contiguous ? CopyType::Vector : CopyType::General;
-  copy(src, out, ctype, stream());
+  copy_cpu(src, out, ctype, stream());

  auto& encoder = cpu::get_command_encoder(stream());
  std::vector<array> inds;
@@ -585,15 +581,11 @@ void Scatter::eval_cpu(const std::vector<array>& inputs, array& out) {

 template <typename T, typename IdxT, typename OpT>
 void scatter_axis(array& out, const array idx, const array& upd, int axis) {
-  auto strides = idx.strides();
-  strides.erase(strides.begin() + axis);
-  auto shape = idx.shape();
-  shape.erase(shape.begin() + axis);
-  ContiguousIterator idx_it(shape, strides, upd.ndim() - 1);
-
-  strides = upd.strides();
-  strides.erase(strides.begin() + axis);
-  ContiguousIterator upd_it(shape, strides, upd.ndim() - 1);
+  auto shape = remove_index(idx.shape(), axis);
+  ContiguousIterator idx_it(
+      shape, remove_index(idx.strides(), axis), upd.ndim() - 1);
+  ContiguousIterator upd_it(
+      shape, remove_index(upd.strides(), axis), upd.ndim() - 1);

  auto idx_ptr = idx.data<IdxT>();
  auto upd_ptr = upd.data<T>();
@@ -694,7 +686,7 @@ void ScatterAxis::eval_cpu(const std::vector<array>& inputs, array& out) {
  // Copy src into out (copy allocates memory for out)
  auto ctype =
      src.flags().row_contiguous ? CopyType::Vector : CopyType::General;
-  copy(src, out, ctype, stream());
+  copy_cpu(src, out, ctype, stream());

  auto& encoder = cpu::get_command_encoder(stream());
  encoder.set_input_array(idx);
--- a/mlx/backend/cpu/inverse.cpp
+++ b/mlx/backend/cpu/inverse.cpp
@@ -115,14 +115,14 @@ void inverse_impl(
  //   (A⁻¹)ᵀ = (Aᵀ)⁻¹

  // The inverse is computed in place, so just copy the input to the output.
-  copy(
+  copy_cpu(
      a,
      inv,
      a.flags().row_contiguous ? CopyType::Vector : CopyType::General,
      stream);

  const int N = a.shape(-1);
-  const size_t num_matrices = a.size() / (N * N);
+  const int64_t num_matrices = a.size() / (N * N);

  auto& encoder = cpu::get_command_encoder(stream);
  encoder.set_output_array(inv);
@@ -130,13 +130,13 @@ void inverse_impl(
  auto inv_ptr = inv.data<T>();
  if (tri) {
    encoder.dispatch([inv_ptr, N, num_matrices, upper]() {
-      for (int i = 0; i < num_matrices; i++) {
+      for (int64_t i = 0; i < num_matrices; i++) {
        tri_inv<T>(inv_ptr + N * N * i, N, upper);
      }
    });
  } else {
    encoder.dispatch([inv_ptr, N, num_matrices]() {
-      for (int i = 0; i < num_matrices; i++) {
+      for (int64_t i = 0; i < num_matrices; i++) {
        general_inv<T>(inv_ptr + N * N * i, N);
      }
    });
--- a/mlx/backend/cpu/jit_compiler.cpp
+++ b/mlx/backend/cpu/jit_compiler.cpp
@@ -2,6 +2,7 @@

 #include "mlx/backend/cpu/jit_compiler.h"

+#include <algorithm>
 #include <sstream>
 #include <vector>

--- a/mlx/backend/cpu/lapack.h
+++ b/mlx/backend/cpu/lapack.h
@@ -2,14 +2,14 @@

 #pragma once

-// Required for Visual Studio.
-// https://github.com/OpenMathLib/OpenBLAS/blob/develop/docs/install.md
-#ifdef _MSC_VER
 #include <complex>
 #define LAPACK_COMPLEX_CUSTOM
 #define lapack_complex_float std::complex<float>
 #define lapack_complex_double std::complex<double>
-#endif
+#define lapack_complex_float_real(z) ((z).real())
+#define lapack_complex_float_imag(z) ((z).imag())
+#define lapack_complex_double_real(z) ((z).real())
+#define lapack_complex_double_imag(z) ((z).imag())

 #ifdef MLX_USE_ACCELERATE
 #include <Accelerate/Accelerate.h>
@@ -32,7 +32,7 @@

 #endif

-#define INSTANTIATE_LAPACK_TYPES(FUNC)                       \
+#define INSTANTIATE_LAPACK_REAL(FUNC)                        \
  template <typename T, typename... Args>                    \
  void FUNC(Args... args) {                                  \
    if constexpr (std::is_same_v<T, float>) {                \
@@ -42,11 +42,24 @@
    }                                                        \
  }

-INSTANTIATE_LAPACK_TYPES(geqrf)
-INSTANTIATE_LAPACK_TYPES(orgqr)
-INSTANTIATE_LAPACK_TYPES(syevd)
-INSTANTIATE_LAPACK_TYPES(potrf)
-INSTANTIATE_LAPACK_TYPES(gesvdx)
-INSTANTIATE_LAPACK_TYPES(getrf)
-INSTANTIATE_LAPACK_TYPES(getri)
-INSTANTIATE_LAPACK_TYPES(trtri)
+INSTANTIATE_LAPACK_REAL(geqrf)
+INSTANTIATE_LAPACK_REAL(orgqr)
+INSTANTIATE_LAPACK_REAL(syevd)
+INSTANTIATE_LAPACK_REAL(geev)
+INSTANTIATE_LAPACK_REAL(potrf)
+INSTANTIATE_LAPACK_REAL(gesdd)
+INSTANTIATE_LAPACK_REAL(getrf)
+INSTANTIATE_LAPACK_REAL(getri)
+INSTANTIATE_LAPACK_REAL(trtri)
+// Complex analogue of INSTANTIATE_LAPACK_REAL: picks the c/z routine from T.
+#define INSTANTIATE_LAPACK_COMPLEX(FUNC)                            \
+  template <typename T, typename... Args>                           \
+  void FUNC(Args... args) {                                         \
+    if constexpr (std::is_same_v<T, std::complex<float>>) {         \
+      MLX_LAPACK_FUNC(c##FUNC)(std::forward<Args>(args)...);        \
+    } else if constexpr (std::is_same_v<T, std::complex<double>>) { \
+      MLX_LAPACK_FUNC(z##FUNC)(std::forward<Args>(args)...);        \
+    }                                                               \
+  }
+
+INSTANTIATE_LAPACK_COMPLEX(heevd)
--- a/mlx/backend/cpu/logsumexp.cpp
+++ b/mlx/backend/cpu/logsumexp.cpp
@@ -87,8 +87,7 @@ void LogSumExp::eval_cpu(const std::vector<array>& inputs, array& out) {
    if (x.flags().contiguous && x.strides()[x.ndim() - 1] == 1) {
      return x;
    } else {
-      auto x_copy = array(x.shape(), x.dtype(), nullptr, {});
-      copy(x, x_copy, CopyType::General, s);
+      array x_copy = contiguous_copy_cpu(x, s);
      encoder.add_temporary(x_copy);
      return x_copy;
    }
--- a/mlx/backend/cpu/luf.cpp
+++ b/mlx/backend/cpu/luf.cpp
@@ -31,7 +31,7 @@ void luf_impl(
  strides[ndim - 1] = M;
  strides[ndim - 2] = 1;
  lu.set_data(allocator::malloc(lu.nbytes()), lu.nbytes(), strides, flags);
-  copy_inplace(
+  copy_cpu_inplace(
      a,
      lu,
      a.shape(),
--- a/mlx/backend/cpu/masked_mm.cpp
+++ b/mlx/backend/cpu/masked_mm.cpp
@@ -6,6 +6,7 @@
 #include "mlx/backend/common/utils.h"
 #include "mlx/backend/cpu/copy.h"
 #include "mlx/backend/cpu/encoder.h"
+#include "mlx/backend/cpu/gemm.h"
 #include "mlx/backend/cpu/lapack.h"
 #include "mlx/primitives.h"

@@ -24,7 +25,7 @@ inline void mask_matrix(
    const int64_t Y_data_str,
    const int64_t X_mask_str,
    const int64_t Y_mask_str,
-    const size_t mask_offset) {
+    const int64_t mask_offset) {
  int tX = (X + block_size - 1) / block_size;
  int tY = (Y + block_size - 1) / block_size;

@@ -52,6 +53,58 @@ inline void mask_matrix(
  }
 }

+template <typename T> // out block i = a[..., ks:ke] @ b[ks:ke, ...] for segment i
+inline void segmented_mm(
+    const T* a,
+    const T* b,
+    const uint32_t* segments, // pairs (k_start, k_end), one pair per segment
+    T* out, // written densely: M * N elements per segment
+    bool a_transposed,
+    bool b_transposed,
+    int64_t lda,
+    int64_t ldb,
+    const Shape& a_shape,
+    const Strides& a_strides,
+    const Shape& b_shape,
+    const Strides& b_strides,
+    int64_t num_segments,
+    const Shape& segments_shape,
+    const Strides& segments_strides) {
+  int ndim = a_shape.size();
+  Shape a_copy = a_shape; // per-segment shapes: only the K extent is edited
+  Shape b_copy = b_shape;
+  int64_t M = a_copy[ndim - 2]; // 64-bit so i * M * N is not computed in int
+  int64_t N = b_copy[ndim - 1];
+  for (int i = 0; i < num_segments; i++) {
+    uint32_t k_start =
+        segments[elem_to_loc(2 * i, segments_shape, segments_strides)];
+    uint32_t k_end =
+        segments[elem_to_loc(2 * i + 1, segments_shape, segments_strides)];
+    if (k_end <= k_start) { // empty segment: its output block is all zeros
+      std::fill_n(out + i * M * N, M * N, T(0));
+      continue;
+    }
+    a_copy[ndim - 1] = k_end - k_start;
+    b_copy[ndim - 2] = k_end - k_start;
+    matmul<T>(
+        a + k_start * a_strides[ndim - 1], // skip to k_start along a's K axis
+        b + k_start * b_strides[ndim - 2], // skip to k_start along b's K axis
+        out + i * M * N,
+        a_transposed,
+        b_transposed,
+        lda,
+        ldb,
+        N, // ldc
+        1.0, // alpha
+        0.0, // beta
+        1, // batch_size
+        a_copy,
+        a_strides,
+        b_copy,
+        b_strides);
+  }
+}
+
 } // namespace

 void BlockMaskedMM::eval_cpu(const std::vector<array>& inputs, array& out) {
@@ -71,21 +124,20 @@ void BlockMaskedMM::eval_cpu(const std::vector<array>& inputs, array& out) {
        if (!expand_all && stx == arr.shape(-1) && sty == 1) {
          if (do_copy) {
            array arr_copy(arr.shape(), arr.dtype(), nullptr, {});
-            copy(arr, arr_copy, CopyType::Vector, s);
+            copy_cpu(arr, arr_copy, CopyType::Vector, s);
            return std::make_tuple(false, stx, arr_copy, true);
          }
          return std::make_tuple(false, stx, arr, false);
        } else if (!expand_all && stx == 1 && sty == arr.shape(-2)) {
          if (do_copy) {
            array arr_copy(arr.shape(), arr.dtype(), nullptr, {});
-            copy(arr, arr_copy, CopyType::Vector, s);
+            copy_cpu(arr, arr_copy, CopyType::Vector, s);
            return std::make_tuple(true, sty, arr_copy, true);
          }
          return std::make_tuple(true, sty, arr, false);
        } else {
-          array arr_copy(arr.shape(), arr.dtype(), nullptr, {});
-          copy(arr, arr_copy, CopyType::General, s);
          int64_t stx = arr.shape(-1);
+          array arr_copy = contiguous_copy_cpu(arr, s);
          return std::make_tuple(false, stx, arr_copy, true);
        }
      };
@@ -97,9 +149,9 @@ void BlockMaskedMM::eval_cpu(const std::vector<array>& inputs, array& out) {
  auto [b_transposed, ldb, b, b_copied] =
      check_transpose(b_pre, has_op_mask, inputs.back().dtype() != bool_);

-  size_t M = a.shape(-2);
-  size_t N = b.shape(-1);
-  size_t K = a.shape(-1);
+  int64_t M = a.shape(-2);
+  int64_t N = b.shape(-1);
+  int64_t K = a.shape(-1);

  if (M == 0 || N == 0) {
    return;
@@ -120,8 +172,8 @@ void BlockMaskedMM::eval_cpu(const std::vector<array>& inputs, array& out) {
                       int batch_idx,
                       int X,
                       int Y,
-                       size_t X_data_str,
-                       size_t Y_data_str,
+                       int64_t X_data_str,
+                       int64_t Y_data_str,
                       const Shape& mask_shape,
                       const Strides& mask_strides,
                       bool is_bool) {
@@ -163,18 +215,18 @@ void BlockMaskedMM::eval_cpu(const std::vector<array>& inputs, array& out) {

  encoder.set_input_array(a);
  encoder.set_input_array(b);
-  const void* a_mask_ptr;
-  const void* b_mask_ptr;
-  const void* out_mask_ptr;
+  const void* a_mask_ptr = nullptr;
+  const void* b_mask_ptr = nullptr;
+  const void* out_mask_ptr = nullptr;
  Shape a_mask_shape;
  Shape b_mask_shape;
  Shape out_mask_shape;
  Strides a_mask_strides;
  Strides b_mask_strides;
  Strides out_mask_strides;
-  bool a_mask_bool;
-  bool b_mask_bool;
-  bool out_mask_bool;
+  bool a_mask_bool = false;
+  bool b_mask_bool = false;
+  bool out_mask_bool = false;
  if (has_op_mask) {
    auto& a_mask = inputs[inputs.size() - 2];
    auto& b_mask = inputs[inputs.size() - 1];
@@ -201,7 +253,7 @@ void BlockMaskedMM::eval_cpu(const std::vector<array>& inputs, array& out) {
  auto a_ptr = a.data<float>();
  auto b_ptr = b.data<float>();
  auto out_ptr = out.data<float>();
-  size_t num_matrices = out.size() / (M * size_t(N));
+  int64_t num_matrices = out.size() / (M * int64_t(N));
  auto ldc = out.shape(-1);

  encoder.dispatch([a_ptr,
@@ -333,7 +385,7 @@ void GatherMM::eval_cpu(const std::vector<array>& inputs, array& out) {
      return std::make_tuple(true, sty, arr);
    } else {
      temps.push_back(array(arr.shape(), arr.dtype(), nullptr, {}));
-      copy(arr, temps.back(), CopyType::General, s);
+      copy_cpu(arr, temps.back(), CopyType::General, s);
      int64_t stx = arr.shape(-1);
      return std::make_tuple(false, stx, temps.back());
    }
@@ -342,9 +394,9 @@ void GatherMM::eval_cpu(const std::vector<array>& inputs, array& out) {
  auto [a_transposed, lda, a] = check_transpose(a_pre);
  auto [b_transposed, ldb, b] = check_transpose(b_pre);

-  size_t M = a.shape(-2);
-  size_t N = b.shape(-1);
-  size_t K = a.shape(-1);
+  int64_t M = a.shape(-2);
+  int64_t N = b.shape(-1);
+  int64_t K = a.shape(-1);

  if (M == 0 || N == 0) {
    return;
@@ -361,7 +413,7 @@ void GatherMM::eval_cpu(const std::vector<array>& inputs, array& out) {

  // Get batch dims
  auto batch_size_out = out.size() / (M * N);
-  size_t matrix_stride_out = M * N;
+  int64_t matrix_stride_out = M * N;

  auto get_batch_dims = [](const auto& v) {
    return decltype(v){v.begin(), v.end() - 2};
@@ -371,7 +423,6 @@ void GatherMM::eval_cpu(const std::vector<array>& inputs, array& out) {
  auto& rhs_indices = inputs[3];

  auto batch_shape = get_batch_dims(out.shape());
-  int batch_ndim = batch_shape.size();

  auto batch_shape_A = get_batch_dims(a.shape());
  auto batch_strides_A = get_batch_dims(a.strides());
@@ -437,4 +488,121 @@ void GatherMM::eval_cpu(const std::vector<array>& inputs, array& out) {
  encoder.add_temporaries(std::move(temps));
 }

+void SegmentedMM::eval_cpu(const std::vector<array>& inputs, array& out) {
+  out.set_data(allocator::malloc(out.nbytes()));
+  // inputs: [0] = a, [1] = b, [2] = segments (read as uint32 boundary pairs).
+  auto& s = stream();
+  auto& encoder = cpu::get_command_encoder(stream());
+  auto check_transpose = [&s, &encoder](const array& x) {
+    auto stx = x.strides()[x.ndim() - 2];
+    auto sty = x.strides()[x.ndim() - 1];
+    if (stx == x.shape(-1) && sty == 1) { // last two dims are row-major
+      return std::make_tuple(false, stx, x);
+    } else if (stx == 1 && sty == x.shape(-2)) { // last two dims are transposed
+      return std::make_tuple(true, sty, x);
+    } else { // any other layout: copy, and register the copy as a temporary
+      array xc(x.shape(), x.dtype(), nullptr, {});
+      copy_cpu(x, xc, CopyType::General, s);
+      encoder.add_temporary(xc);
+      int64_t stx = x.shape(-1);
+      return std::make_tuple(false, stx, xc);
+    }
+  };
+  // Each result is (transposed flag, leading dimension, array to read from).
+  auto [a_transposed, lda, a] = check_transpose(inputs[0]);
+  auto [b_transposed, ldb, b] = check_transpose(inputs[1]);
+  auto& segments = inputs[2];
+  // The dtype switch below runs inside the task handed to the encoder.
+  encoder.set_input_array(a);
+  encoder.set_input_array(b);
+  encoder.set_input_array(segments);
+  encoder.set_output_array(out);
+  encoder.dispatch([a = array::unsafe_weak_copy(a),
+                    b = array::unsafe_weak_copy(b),
+                    segments = array::unsafe_weak_copy(segments),
+                    out_ptr = out.data<void>(),
+                    a_transposed = a_transposed,
+                    b_transposed = b_transposed,
+                    lda = lda,
+                    ldb = ldb]() {
+    switch (a.dtype()) { // the four cases differ only in the element type
+      case float64:
+        segmented_mm<double>(
+            a.data<double>(),
+            b.data<double>(),
+            segments.data<uint32_t>(),
+            static_cast<double*>(out_ptr),
+            a_transposed,
+            b_transposed,
+            lda,
+            ldb,
+            a.shape(),
+            a.strides(),
+            b.shape(),
+            b.strides(),
+            segments.size() / 2, // two entries per segment
+            segments.shape(),
+            segments.strides());
+        break;
+      case float32:
+        segmented_mm<float>(
+            a.data<float>(),
+            b.data<float>(),
+            segments.data<uint32_t>(),
+            static_cast<float*>(out_ptr),
+            a_transposed,
+            b_transposed,
+            lda,
+            ldb,
+            a.shape(),
+            a.strides(),
+            b.shape(),
+            b.strides(),
+            segments.size() / 2,
+            segments.shape(),
+            segments.strides());
+        break;
+      case float16:
+        segmented_mm<float16_t>(
+            a.data<float16_t>(),
+            b.data<float16_t>(),
+            segments.data<uint32_t>(),
+            static_cast<float16_t*>(out_ptr),
+            a_transposed,
+            b_transposed,
+            lda,
+            ldb,
+            a.shape(),
+            a.strides(),
+            b.shape(),
+            b.strides(),
+            segments.size() / 2,
+            segments.shape(),
+            segments.strides());
+        break;
+      case bfloat16:
+        segmented_mm<bfloat16_t>(
+            a.data<bfloat16_t>(),
+            b.data<bfloat16_t>(),
+            segments.data<uint32_t>(),
+            static_cast<bfloat16_t*>(out_ptr),
+            a_transposed,
+            b_transposed,
+            lda,
+            ldb,
+            a.shape(),
+            a.strides(),
+            b.shape(),
+            b.strides(),
+            segments.size() / 2,
+            segments.shape(),
+            segments.strides());
+        break;
+      default: // thrown from inside the dispatched task
+        throw std::invalid_argument(
+            "Segmented mm supports only real float types.");
+    }
+  });
+}
+
 } // namespace mlx::core
--- a/mlx/backend/cpu/matmul.cpp
+++ b/mlx/backend/cpu/matmul.cpp
@@ -81,7 +81,7 @@ void matmul_general(
      return std::make_tuple(true, sty, arr);
    } else {
      temps.push_back(array(arr.shape(), arr.dtype(), nullptr, {}));
-      copy(arr, temps.back(), CopyType::General, stream);
+      copy_cpu(arr, temps.back(), CopyType::General, stream);
      stx = arr.shape(-1);
      return std::make_tuple(false, stx, temps.back());
    }
@@ -91,7 +91,6 @@ void matmul_general(
  auto [b_transposed, ldb, b] = check_transpose(b_pre);
  size_t M = a.shape(-2);
  size_t N = b.shape(-1);
-  size_t K = a.shape(-1);
  if (M == 0 || N == 0) {
    return;
  }
@@ -108,6 +107,9 @@ void matmul_general(
  } else if (out.dtype() == float64) {
    matmul_dispatch<double>(
        a, b, out, a_transposed, b_transposed, lda, ldb, alpha, beta, stream);
+  } else if (out.dtype() == complex64) {
+    matmul_dispatch<complex64_t>(
+        a, b, out, a_transposed, b_transposed, lda, ldb, alpha, beta, stream);
  } else {
    throw std::runtime_error("[Matmul::eval_cpu] Invalid type.");
  }
@@ -128,9 +130,9 @@ void Matmul::eval_cpu(const std::vector<array>& inputs, array& out) {
 }

 void AddMM::eval_cpu(const std::vector<array>& inputs, array& out) {
-  if (out.dtype() != float32) {
-    throw std::runtime_error(
-        "[AddMM::eval_cpu] Currently only supports float32.");
+  if (out.size() == 0) {
+    out.set_data(allocator::malloc(out.nbytes()));
+    return;
  }

  // Fill output with C
@@ -138,8 +140,10 @@ void AddMM::eval_cpu(const std::vector<array>& inputs, array& out) {
  CopyType ctype = c.data_size() == 1
      ? CopyType::Scalar
      : (c.flags().row_contiguous ? CopyType::Vector : CopyType::General);
-  copy(c, out, ctype, stream());
-
+  copy_cpu(c, out, ctype, stream());
+  if (inputs[0].shape(-1) == 0) {
+    return;
+  }
  matmul_general(inputs[0], inputs[1], out, stream(), alpha_, beta_);
 }

--- a/mlx/backend/cpu/primitives.cpp
+++ b/mlx/backend/cpu/primitives.cpp
@@ -22,7 +22,7 @@ void reshape(const array& in, array& out) {
  auto [copy_necessary, out_strides] = prepare_reshape(in, out);
  if (copy_necessary) {
    out.set_data(allocator::malloc(out.nbytes()));
-    copy_inplace(in, out, CopyType::General, out.primitive().stream());
+    copy_cpu_inplace(in, out, CopyType::General, out.primitive().stream());
  } else {
    shared_buffer_reshape(in, out_strides, out);
  }
@@ -48,7 +48,7 @@ static std::pair<array, bool> compute_dynamic_offset(
  auto compute_offset =
      [strides, axes, offset = offset.data<int64_t>()](const auto* indices) {
        int64_t offset_ = 0;
-        for (int i = 0; i < axes.size(); ++i) {
+        for (int i = 0; i < std::ssize(axes); ++i) {
          offset_ += indices[i] * strides[axes[i]];
        }
        offset[0] = offset_;
@@ -124,6 +124,7 @@ void Transpose::eval_cpu(const std::vector<array>& inputs, array& out) {

 void Arange::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 0);
+  (void)inputs;
  out.set_data(allocator::malloc(out.nbytes()));
  switch (out.dtype()) {
    case bool_:
@@ -175,7 +176,7 @@ void AsType::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 1);
  auto& in = inputs[0];
  CopyType ctype = in.flags().contiguous ? CopyType::Vector : CopyType::General;
-  copy(in, out, ctype, stream());
+  copy_cpu(in, out, ctype, stream());
 }

 void Concatenate::eval_cpu(const std::vector<array>& inputs, array& out) {
@@ -193,25 +194,25 @@ void Concatenate::eval_cpu(const std::vector<array>& inputs, array& out) {
  flags.row_contiguous = false;
  flags.col_contiguous = false;
  flags.contiguous = false;
-  for (int i = 0; i < inputs.size(); i++) {
+  for (int i = 0; i < std::ssize(inputs); i++) {
    array out_slice(inputs[i].shape(), out.dtype(), nullptr, {});
-    size_t data_offset = strides[axis_] * sizes[i];
+    int64_t data_offset = strides[axis_] * sizes[i];
    out_slice.copy_shared_buffer(
        out, strides, flags, out_slice.size(), data_offset);
-    copy_inplace(inputs[i], out_slice, CopyType::GeneralGeneral, stream());
+    copy_cpu_inplace(inputs[i], out_slice, CopyType::GeneralGeneral, stream());
  }
 }

 void Contiguous::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 1);
  auto& in = inputs[0];
-  constexpr size_t extra_bytes = 16384;
+  constexpr int64_t extra_bytes = 16384;
  if (in.buffer_size() <= out.nbytes() + extra_bytes &&
      (in.flags().row_contiguous ||
       (allow_col_major_ && in.flags().col_contiguous))) {
    out.copy_shared_buffer(in);
  } else {
-    copy(in, out, CopyType::General, stream());
+    copy_cpu(in, out, CopyType::General, stream());
  }
 }

@@ -235,7 +236,7 @@ void Full::eval_cpu(const std::vector<array>& inputs, array& out) {
  } else {
    ctype = CopyType::General;
  }
-  copy(in, out, ctype, stream());
+  copy_cpu(in, out, ctype, stream());
 }

 void Pad::eval_cpu(const std::vector<array>& inputs, array& out) {
@@ -251,11 +252,11 @@ void Pad::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(val.dtype() == in.dtype() && in.dtype() == out.dtype());

  // Fill output with val
-  copy(val, out, CopyType::Scalar, stream());
+  copy_cpu(val, out, CopyType::Scalar, stream());

  // Find offset for start of input values
-  size_t data_offset = 0;
-  for (int i = 0; i < axes_.size(); i++) {
+  int64_t data_offset = 0;
+  for (int i = 0; i < std::ssize(axes_); i++) {
    auto ax = axes_[i] < 0 ? out.ndim() + axes_[i] : axes_[i];
    data_offset += out.strides()[ax] * low_pad_size_[i];
  }
@@ -266,7 +267,7 @@ void Pad::eval_cpu(const std::vector<array>& inputs, array& out) {
      out, out.strides(), out.flags(), out_slice.size(), data_offset);

  // Copy input values into the slice
-  copy_inplace(in, out_slice, CopyType::GeneralGeneral, stream());
+  copy_cpu_inplace(in, out_slice, CopyType::GeneralGeneral, stream());
 }

 void RandomBits::eval_cpu(const std::vector<array>& inputs, array& out) {
@@ -274,10 +275,10 @@ void RandomBits::eval_cpu(const std::vector<array>& inputs, array& out) {
  // keys has shape (N1, ..., NK, 2)
  // out has shape (N1, ..., NK, M1, M2, ...)
  auto& keys = inputs[0];
-  size_t num_keys = keys.size() / 2;
+  int64_t num_keys = keys.size() / 2;

-  size_t elems_per_key = out.size() / num_keys;
-  size_t bytes_per_key = out.itemsize() * elems_per_key;
+  int64_t elems_per_key = out.size() / num_keys;
+  int64_t bytes_per_key = out.itemsize() * elems_per_key;
  out.set_data(allocator::malloc(out.nbytes()));

  auto kptr = inputs[0].data<uint32_t>();
@@ -291,8 +292,8 @@ void RandomBits::eval_cpu(const std::vector<array>& inputs, array& out) {
                    num_keys,
                    kshape = keys.shape(),
                    kstrides = keys.strides()]() mutable {
-    size_t out_skip = (bytes_per_key + 4 - 1) / 4;
-    auto half_size = out_skip / 2;
+    int64_t out_skip = (bytes_per_key + 4 - 1) / 4;
+    uintptr_t half_size = out_skip / 2;
    bool even = out_skip % 2 == 0;
    for (int i = 0; i < num_keys; ++i, cptr += bytes_per_key) {
      auto ptr = reinterpret_cast<uint32_t*>(cptr);
@@ -340,7 +341,7 @@ void DynamicSlice::eval_cpu(const std::vector<array>& inputs, array& out) {
  out.set_data(allocator::malloc(out.nbytes()));
  auto [in_offset, donated] =
      compute_dynamic_offset(inputs[1], in.strides(), axes_, stream());
-  copy_inplace(
+  copy_cpu_inplace(
      /* const array& src = */ in,
      /* array& dst = */ out,
      /* const Shape& data_shape = */ out.shape(),
@@ -372,11 +373,11 @@ void DynamicSliceUpdate::eval_cpu(
  auto ctype = in.flags().contiguous && in.size() == in.data_size()
      ? CopyType::Vector
      : CopyType::General;
-  copy(in, out, in.data_size() == 1 ? CopyType::Scalar : ctype, stream());
+  copy_cpu(in, out, in.data_size() == 1 ? CopyType::Scalar : ctype, stream());

  auto [out_offset, donated] =
      compute_dynamic_offset(inputs[2], out.strides(), axes_, stream());
-  copy_inplace(
+  copy_cpu_inplace(
      /* const array& src = */ upd,
      /* array& dst = */ out,
      /* const std::vector<int>& data_shape = */ upd.shape(),
@@ -412,14 +413,14 @@ void SliceUpdate::eval_cpu(const std::vector<array>& inputs, array& out) {
  auto ctype = in.flags().contiguous && in.size() == in.data_size()
      ? CopyType::Vector
      : CopyType::General;
-  copy(in, out, in.data_size() == 1 ? CopyType::Scalar : ctype, stream());
+  copy_cpu(in, out, in.data_size() == 1 ? CopyType::Scalar : ctype, stream());

  // Calculate out strides, initial offset and if copy needs to be made
  auto [data_offset, out_strides] =
      prepare_slice(out, start_indices_, strides_);

  // Do copy
-  copy_inplace(
+  copy_cpu_inplace(
      /* const array& src = */ upd,
      /* array& dst = */ out,
      /* const std::vector<int>& data_shape = */ upd.shape(),
@@ -456,9 +457,9 @@ void View::eval_cpu(const std::vector<array>& inputs, array& out) {
    if (in.dtype() == bool_) {
      auto in_tmp = array(in.shape(), uint8, nullptr, {});
      in_tmp.copy_shared_buffer(in);
-      copy_inplace(in_tmp, tmp, CopyType::General, stream());
+      copy_cpu_inplace(in_tmp, tmp, CopyType::General, stream());
    } else {
-      copy_inplace(in, tmp, CopyType::General, stream());
+      copy_cpu_inplace(in, tmp, CopyType::General, stream());
    }

    auto flags = out.flags();
--- a/mlx/backend/cpu/qrf.cpp
+++ b/mlx/backend/cpu/qrf.cpp
@@ -13,7 +13,7 @@ void qrf_impl(const array& a, array& q, array& r, Stream stream) {
  const int M = a.shape(-2);
  const int N = a.shape(-1);
  const int lda = M;
-  size_t num_matrices = a.size() / (M * N);
+  int64_t num_matrices = a.size() / (M * N);

  // Copy A to inplace input and make it col-contiguous
  array in(a.shape(), a.dtype(), nullptr, {});
@@ -26,7 +26,7 @@ void qrf_impl(const array& a, array& q, array& r, Stream stream) {
  strides[in.ndim() - 2] = 1;
  strides[in.ndim() - 1] = M;
  in.set_data(allocator::malloc(in.nbytes()), in.nbytes(), strides, flags);
-  copy_inplace(a, in, CopyType::GeneralGeneral, stream);
+  copy_cpu_inplace(a, in, CopyType::GeneralGeneral, stream);
  auto& encoder = cpu::get_command_encoder(stream);
  q.set_data(allocator::malloc(q.nbytes()));
  r.set_data(allocator::malloc(r.nbytes()));
@@ -54,7 +54,7 @@ void qrf_impl(const array& a, array& q, array& r, Stream stream) {
    auto work = allocator::malloc(sizeof(T) * lwork);

    // Loop over matrices
-    for (int i = 0; i < num_matrices; ++i) {
+    for (int64_t i = 0; i < num_matrices; ++i) {
      // Solve
      geqrf<T>(
          &M,
@@ -68,7 +68,7 @@ void qrf_impl(const array& a, array& q, array& r, Stream stream) {
    }
    allocator::free(work);

-    for (int i = 0; i < num_matrices; ++i) {
+    for (int64_t i = 0; i < num_matrices; ++i) {
      /// num_reflectors x N
      for (int j = 0; j < num_reflectors; ++j) {
        for (int k = 0; k < j; ++k) {
@@ -97,7 +97,7 @@ void qrf_impl(const array& a, array& q, array& r, Stream stream) {
    work = allocator::malloc(sizeof(T) * lwork);

    // Loop over matrices
-    for (int i = 0; i < num_matrices; ++i) {
+    for (int64_t i = 0; i < num_matrices; ++i) {
      // Compute Q
      orgqr<T>(
          &M,
@@ -111,7 +111,7 @@ void qrf_impl(const array& a, array& q, array& r, Stream stream) {
          &info);
    }

-    for (int i = 0; i < num_matrices; ++i) {
+    for (int64_t i = 0; i < num_matrices; ++i) {
      // M x num_reflectors
      for (int j = 0; j < M; ++j) {
        for (int k = 0; k < num_reflectors; ++k) {
--- a/mlx/backend/cpu/quantized.cpp
+++ b/mlx/backend/cpu/quantized.cpp
@@ -1,7 +1,5 @@
 // Copyright © 2023 Apple Inc.

-#include <cassert>
-
 #include "mlx/backend/cpu/copy.h"
 #include "mlx/backend/cpu/encoder.h"
 #include "mlx/backend/cpu/simd/simd.h"
@@ -13,9 +11,47 @@ namespace mlx::core {

 namespace {

+const static float MXFP4_LUT[16] = { // maps a 4-bit code to its float value
+    +0.0f, // codes 0-7: non-negative values
+    +0.5f,
+    +1.0f,
+    +1.5f,
+    +2.0f,
+    +3.0f,
+    +4.0f,
+    +6.0f,
+    -0.0f, // codes 8-15: same magnitudes as codes 0-7 with a negative sign
+    -0.5f,
+    -1.0f,
+    -1.5f,
+    -2.0f,
+    -3.0f,
+    -4.0f,
+    -6.0f};
+
+template <typename T>
+static inline T dequantize_scale(uint8_t s) { // decode one scale byte into T
+  using FOrI = union { // overlays a bfloat16_t with a raw 16-bit pattern
+    bfloat16_t f;
+    uint16_t i;
+  };
+  FOrI out;
+  out.i = (s == 0 ? 0x40 : (static_cast<uint16_t>(s) << 7)); // s == 0 uses the fixed pattern 0x40; presumably s is a biased exponent - confirm
+  return static_cast<T>(out.f); // NOTE(review): reads f after writing i (union type punning)
+}
+
+inline constexpr short get_pack_factor(int bits, int wsize = 8) { // elements stored in one pack
+  return (bits == 3 || bits == 5) ? 8 : (bits == 6 ? 4 : wsize / bits); // 3 and 5 bits: 8, 6 bits: 4, otherwise wsize / bits
+}
+
+inline constexpr short get_bytes_per_pack(int bits, int wsize = 8) { // bytes occupied by one pack
+  auto power_of_2_bits = (bits & (bits - 1)) == 0; // true when bits has at most one set bit
+  return power_of_2_bits ? (wsize / 8) : (bits == 5 ? 5 : 3); // wsize / 8 for power-of-2 widths, 5 bytes for 5 bits, 3 bytes otherwise
+}
+
 template <typename T, int bits>
 void extract_bits(const uint8_t* w_in, T* w_out) {
-  assert(bits == 3 || bits == 6);
+  static_assert(bits == 3 || bits == 5 || bits == 6);
  if (bits == 3) {
    w_out[0] = static_cast<T>(w_in[0] & 0x7);
    w_out[1] = static_cast<T>((w_in[0] & 0x38) >> 3);
@@ -25,6 +61,16 @@ void extract_bits(const uint8_t* w_in, T* w_out) {
    w_out[5] = static_cast<T>(((w_in[1] & 0x80) >> 7) + ((w_in[2] & 0x3) << 1));
    w_out[6] = static_cast<T>((w_in[2] & 0x1c) >> 2);
    w_out[7] = static_cast<T>((w_in[2] & 0xe0) >> 5);
+  } else if (bits == 5) {
+    w_out[0] = static_cast<T>(w_in[0] & 0x1f);
+    w_out[1] = static_cast<T>(((w_in[0] & 0xe0) >> 5) + ((w_in[1] & 0x3) << 3));
+    w_out[2] = static_cast<T>((w_in[1] & 0x7c) >> 2);
+    w_out[3] = static_cast<T>(((w_in[1] & 0x80) >> 7) + ((w_in[2] & 0xf) << 1));
+    w_out[4] = static_cast<T>(((w_in[2] & 0xf0) >> 4) + ((w_in[3] & 0x1) << 4));
+    w_out[5] = static_cast<T>((w_in[3] & 0x3e) >> 1);
+    w_out[6] = static_cast<T>(((w_in[3] & 0xc0) >> 6) + ((w_in[4] & 0x7) << 2));
+    w_out[7] = static_cast<T>((w_in[4] & 0xf8) >> 3);
+
  } else if (bits == 6) {
    w_out[0] = static_cast<T>(w_in[0] & 0x3f);
    w_out[1] =
@@ -46,8 +92,8 @@ void _qmm(
    int N,
    int K) {
  constexpr int bitmask = (1 << bits) - 1;
-  constexpr int pack_factor = bits == 3 ? 8 : bits == 6 ? 4 : 8 / bits;
-  constexpr int bytes_per_pack = (bits == 3 || bits == 6) ? 3 : 1;
+  constexpr int pack_factor = get_pack_factor(bits, 8);
+  constexpr int bytes_per_pack = get_bytes_per_pack(bits);
  constexpr int packs_in_group = group_size / pack_factor;

  for (int m = 0; m < M; m++) {
@@ -65,7 +111,7 @@ void _qmm(
        T scale = *scales_local++;
        T bias = *biases_local++;
        for (int ng = 0; ng < packs_in_group; ng++) {
-          if (bits == 3 || bits == 6) {
+          if constexpr (bits == 3 || bits == 5 || bits == 6) {
            T wl[pack_factor];
            extract_bits<T, bits>(w_local, wl);
 #pragma clang loop unroll(full)
@@ -104,8 +150,9 @@ void _qmm_t(
    int N,
    int K) {
  constexpr int bitmask = (1 << bits) - 1;
-  constexpr int pack_factor = bits == 3 ? 8 : bits == 6 ? 4 : 8 / bits;
-  constexpr int bytes_per_pack = (bits == 3 || bits == 6) ? 3 : 1;
+
+  constexpr int pack_factor = get_pack_factor(bits, 8);
+  constexpr int bytes_per_pack = get_bytes_per_pack(bits);
  constexpr int packs_in_group = group_size / pack_factor;

  for (int m = 0; m < M; m++) {
@@ -121,7 +168,7 @@ void _qmm_t(
        T bias = *biases_local++;

        for (int kw = 0; kw < packs_in_group; kw++) {
-          if (bits == 3 || bits == 6) {
+          if constexpr (bits == 3 || bits == 5 || bits == 6) {
            T wl[pack_factor];
            extract_bits<T, bits>(w_local, wl);
 #pragma clang loop unroll(full)
@@ -304,6 +351,10 @@ void _qmm_dispatch_typed(
      _qmm_dispatch_group<T, 4>(
          result, x, w, scales, biases, M, N, K, group_size, transposed_w);
      break;
+    case 5:
+      _qmm_dispatch_group<T, 5>(
+          result, x, w, scales, biases, M, N, K, group_size, transposed_w);
+      break;
    case 6:
      _qmm_dispatch_group<T, 6>(
          result, x, w, scales, biases, M, N, K, group_size, transposed_w);
@@ -383,6 +434,229 @@ void _qmm_dispatch(
  }
 }

+template <typename T>
+void mxfp4_qmm( // scalar kernel for non-transposed w: writes M rows of N results
+    T* result,
+    const T* x,
+    const uint32_t* w,
+    const uint8_t* scales,
+    int M,
+    int N,
+    int K) {
+  constexpr int group_size = 32; // consecutive outputs that share one scale byte
+  constexpr int pack_factor = get_pack_factor(4, 8); // 4-bit codes held in one byte of w
+  constexpr int packs_in_group = group_size / pack_factor; // bytes of w read per group
+
+  for (int m = 0; m < M; m++) {
+    const uint8_t* w_local = (const uint8_t*)w; // w and scales are re-read from the start for every row
+    const uint8_t* scales_local = scales;
+
+    std::fill(result, result + N, 0); // clear the output row before accumulating into it
+
+    for (int k = 0; k < K; k++) {
+      T* result_local = result;
+      T xi = *x++; // x is consumed sequentially, K values per output row
+
+      for (int n = 0; n < N; n += group_size) { // writes group_size results per step; assumes N is a multiple of 32 - TODO confirm
+        T scale = dequantize_scale<T>(*scales_local++);
+        for (int ng = 0; ng < packs_in_group; ng++) {
+          uint8_t wi = *w_local++;
+#pragma clang loop unroll(full)
+          for (int p = 0; p < pack_factor; p++) {
+            (*result_local++) +=
+                xi * scale * static_cast<T>(MXFP4_LUT[wi & 0xf]); // low nibble is decoded first
+            wi >>= 4;
+          }
+        }
+      }
+    }
+
+    result += N; // advance to the next output row
+  }
+}
+
+template <typename T>
+void mxfp4_qmm_t( // scalar kernel for transposed w: one dot product over K per output
+    T* result,
+    const T* x,
+    const uint32_t* w,
+    const uint8_t* scales,
+    int M,
+    int N,
+    int K) {
+  constexpr int group_size = 32; // consecutive weights that share one scale byte
+  constexpr int pack_factor = get_pack_factor(4, 8); // 4-bit codes held in one byte of w
+  constexpr int packs_in_group = group_size / pack_factor; // bytes of w read per group
+
+  for (int m = 0; m < M; m++) {
+    const uint8_t* w_local = (const uint8_t*)w; // w and scales are re-read from the start for every row of x
+    const uint8_t* scales_local = scales;
+
+    for (int n = 0; n < N; n++) {
+      const T* x_local = x; // each output restarts at the beginning of the current x row
+      T sum = 0;
+      for (int k = 0; k < K; k += group_size) { // consumes group_size inputs per step; assumes K is a multiple of 32 - TODO confirm
+        T scale = dequantize_scale<T>(*scales_local++);
+
+        T gsum = 0; // unscaled partial sum for this group
+        for (int kw = 0; kw < packs_in_group; kw++) {
+          uint8_t wi = *w_local++;
+#pragma clang loop unroll(full)
+          for (int p = 0; p < pack_factor; p++) {
+            gsum += (*x_local++) * static_cast<T>(MXFP4_LUT[wi & 0xf]); // low nibble is decoded first
+            wi >>= 4;
+          }
+        }
+        sum += scale * gsum; // the scale is applied once per group
+      }
+      *result = sum;
+      result++;
+    }
+
+    x += K; // advance to the next row of x
+  }
+}
+
+template <int S>
+simd::Simd<float, S> mxfp4_extract_bits_simd(const uint32_t* w) { // decode one packed 32-bit word into S floats
+  if constexpr (S == 8) { // only the 8-lane case is implemented
+    constexpr std::array<uint32_t, 8> shifts_ = {{0, 4, 8, 12, 16, 20, 24, 28}}; // bit offset of each 4-bit code in the word
+    auto shifts(*(simd::Simd<uint32_t, S>*)&shifts_); // reinterprets the array storage as a simd vector
+    auto wi = simd::Simd<uint32_t, S>(*w); // NOTE(review): presumably copies the word into every lane - confirm
+    wi = wi >> shifts;
+    wi = wi & 0xf; // keep a single 4-bit code per lane
+    simd::Simd<float, S> w_out;
+    for (int i = 0; i < S; ++i) {
+      w_out[i] = MXFP4_LUT[wi[i]]; // table lookup is done lane by lane
+    }
+    return w_out;
+  } else {
+    // Appease compiler.. but should never get here
+    throw std::runtime_error("Unsupported combination for simd qmm.");
+  }
+}
+
+template <typename T>
+void mxfp4_qmm_t_simd( // simd kernel for transposed w: one dot product over K per output
+    T* result,
+    const T* x,
+    const uint32_t* w,
+    const uint8_t* scales,
+    int M,
+    int N,
+    int K) {
+  constexpr int group_size = 32; // consecutive weights that share one scale byte
+  constexpr int pack_factor = 32 / 4; // 4-bit codes held in one 32-bit word of w
+  constexpr int packs_in_group = group_size / pack_factor; // words of w read per group
+  constexpr int S = simd::max_size<T>;
+  static_assert(
+      S % pack_factor == 0, "SIMD size must be divisible by pack factor");
+  constexpr int packs_per_simd = S / pack_factor; // words of w consumed per simd step
+
+  for (int m = 0; m < M; m++) {
+    const uint32_t* w_local = w; // w and scales are re-read from the start for every row of x
+    const uint8_t* scales_local = scales;
+
+    for (int n = 0; n < N; n++) {
+      simd::Simd<float, S> acc(0); // accumulation is carried out in float lanes
+      auto x_local = x;
+      for (int k = 0; k < K; k += group_size) { // consumes group_size inputs per step; assumes K is a multiple of 32 - TODO confirm
+        T scale = dequantize_scale<T>(*scales_local++);
+
+        simd::Simd<float, S> g_acc(0); // unscaled partial sums for this group
+        for (int kw = 0; kw < packs_in_group; kw += packs_per_simd) {
+          // Extract bits
+          auto wf = mxfp4_extract_bits_simd<S>(w_local);
+          w_local += packs_per_simd;
+          simd::Simd<float, S> x_simd = simd::load<T, S>(x_local);
+          g_acc = g_acc + x_simd * wf;
+          x_local += S;
+        }
+        acc = acc + scale * g_acc; // the scale is applied once per group
+      }
+
+      *result = T(simd::sum(acc)); // reduce the lanes and convert back to T
+      result++;
+    }
+    x += K; // advance to the next row of x
+  }
+}
+
+template <typename T>
+void mxfp4_qmm_dispatch_transpose( // pick the kernel for one M x K by N product
+    T* result,
+    const T* x,
+    const uint32_t* w,
+    const uint8_t* scales,
+    int M,
+    int N,
+    int K,
+    bool transposed_w) { // true selects the kernels that treat w as transposed
+  if (transposed_w) {
+    // mxfp4_extract_bits_simd only implements 8 lanes; other widths go scalar
+    if constexpr (simd::max_size<T> == 8) {
+      mxfp4_qmm_t_simd<T>(result, x, w, scales, M, N, K);
+    } else {
+      mxfp4_qmm_t<T>(result, x, w, scales, M, N, K); // scalar fallback instead of a runtime throw
+    }
+  } else {
+    mxfp4_qmm<T>(result, x, w, scales, M, N, K); // no simd variant exists for non-transposed w
+  }
+}
+
+template <typename T>
+void mxfp4_qmm_dispatch_typed( // run the mxfp4 matmul once per batch element of x
+    array& out,
+    const array& x,
+    const array& w,
+    const array& scales,
+    bool transposed_w) {
+  int K = x.shape(-1);
+  int M = x.ndim() > 1 ? x.shape(-2) : 1; // a 1-d x is treated as a single row
+  int N = out.shape(-1);
+  int w_els = w.ndim() > 2 ? w.shape(-1) * w.shape(-2) : 0; // 0 makes every batch reuse the same w
+  int g_els = w.ndim() > 2 ? scales.shape(-1) * scales.shape(-2) : 0; // keyed on w.ndim(), not scales.ndim()
+  int batch_size = x.size() / (K * M);
+
+  auto out_ptr = out.data<T>();
+  auto x_ptr = x.data<T>();
+  auto w_ptr = w.data<uint32_t>();
+  auto scales_ptr = scales.data<uint8_t>();
+  for (int i = 0; i < batch_size; i++) {
+    mxfp4_qmm_dispatch_transpose<T>(
+        out_ptr + i * M * N, // out is indexed densely, M * N results per batch
+        x_ptr + elem_to_loc(i * M * K, x.shape(), x.strides()), // inputs are located through their shape and strides
+        w_ptr + elem_to_loc(i * w_els, w.shape(), w.strides()),
+        scales_ptr + elem_to_loc(i * g_els, scales.shape(), scales.strides()),
+        M,
+        N,
+        K,
+        transposed_w);
+  }
+}
+
+void mxfp4_qmm_dispatch( // select the element type from x.dtype() and forward
+    array& out,
+    const array& x,
+    const array& w,
+    const array& scales,
+    bool transposed_w) {
+  switch (x.dtype()) {
+    case bfloat16:
+      mxfp4_qmm_dispatch_typed<bfloat16_t>(out, x, w, scales, transposed_w);
+      break;
+    case float16:
+      mxfp4_qmm_dispatch_typed<float16_t>(out, x, w, scales, transposed_w);
+      break;
+    case float32:
+      mxfp4_qmm_dispatch_typed<float>(out, x, w, scales, transposed_w);
+      break;
+    default: // every other dtype is rejected
+      throw std::invalid_argument(
+          "[quantized_matmul] only floating types are supported");
+  }
+}
+
 template <typename T>
 void _bs_qmm_dispatch_typed(
    array& out,
@@ -489,115 +763,198 @@ void _bs_qmm_dispatch(
  }
 }

+template <typename T>
+void mxfp4_bs_qmm_dispatch_typed( // mxfp4 matmul where index arrays pick the x and w matrices
+    array& out,
+    const array& x,
+    const array& w,
+    const array& scales,
+    const array& lhs_indices,
+    const array& rhs_indices,
+    bool transposed_w) {
+  int K = x.shape(-1);
+  int M = x.shape(-2);
+  int N = out.shape(-1);
+
+  int w_els = w.shape(-1) * w.shape(-2); // elements in one w matrix
+  int g_els = scales.shape(-1) * scales.shape(-2); // elements in one scales matrix
+
+  auto out_ptr = out.data<T>();
+  auto x_ptr = x.data<T>();
+  auto w_ptr = w.data<uint32_t>();
+  auto scales_ptr = scales.data<uint8_t>();
+  auto lhs_indices_ptr = lhs_indices.data<uint32_t>();
+  auto rhs_indices_ptr = rhs_indices.data<uint32_t>();
+
+  for (int i = 0; i < lhs_indices.size(); i++) { // NOTE(review): int index compared against size(), likely a signed/unsigned mix - confirm
+    int x_idx = lhs_indices_ptr[elem_to_loc(
+        i, lhs_indices.shape(), lhs_indices.strides())]; // which x matrix feeds output i
+    int w_idx = rhs_indices_ptr[elem_to_loc(
+        i, rhs_indices.shape(), rhs_indices.strides())]; // which w and scales matrices feed output i
+    mxfp4_qmm_dispatch_transpose<T>(
+        out_ptr + i * M * N, // out is indexed densely, M * N results per index pair
+        x_ptr + elem_to_loc(x_idx * M * K, x.shape(), x.strides()),
+        w_ptr + elem_to_loc(w_idx * w_els, w.shape(), w.strides()),
+        scales_ptr +
+            elem_to_loc(w_idx * g_els, scales.shape(), scales.strides()),
+        M,
+        N,
+        K,
+        transposed_w);
+  }
+}
+
+void mxfp4_bs_qmm_dispatch( // select the element type from x.dtype() and forward
+    array& out,
+    const array& x,
+    const array& w,
+    const array& scales,
+    const array& lhs_indices,
+    const array& rhs_indices,
+    bool transposed_w) {
+  switch (x.dtype()) {
+    case float32:
+      mxfp4_bs_qmm_dispatch_typed<float>(
+          out, x, w, scales, lhs_indices, rhs_indices, transposed_w);
+      break;
+    case float16:
+      mxfp4_bs_qmm_dispatch_typed<float16_t>(
+          out, x, w, scales, lhs_indices, rhs_indices, transposed_w);
+      break;
+    case bfloat16:
+      mxfp4_bs_qmm_dispatch_typed<bfloat16_t>(
+          out, x, w, scales, lhs_indices, rhs_indices, transposed_w);
+      break;
+    default: // every other dtype is rejected
+      throw std::invalid_argument(
+          "[quantized_matmul] only floating types are supported");
+  }
+}
+
 } // namespace

 void QuantizedMatmul::eval_cpu(const std::vector<array>& inputs, array& out) {
-  assert(inputs.size() == 4);
-
  auto& x_pre = inputs[0];
  auto& w_pre = inputs[1];
  auto& scales_pre = inputs[2];
-  auto& biases_pre = inputs[3];

-  std::vector<array> temps;
-  auto ensure_row_contiguous = [s = stream(), &temps](const array& arr) {
+  auto& encoder = cpu::get_command_encoder(stream());
+  auto ensure_row_contiguous = [s = stream(), &encoder](const array& arr) {
    if (arr.flags().row_contiguous) {
      return arr;
    } else {
-      temps.push_back(array(arr.shape(), arr.dtype(), nullptr, {}));
-      copy(arr, temps.back(), CopyType::General, s);
-      return temps.back();
+      auto arr_cpy = array(arr.shape(), arr.dtype(), nullptr, {});
+      copy_cpu(arr, arr_cpy, CopyType::General, s);
+      encoder.add_temporary(arr_cpy);
+      return arr_cpy;
    }
  };

  auto x = ensure_row_contiguous(x_pre);
  auto w = ensure_row_contiguous(w_pre);
  auto scales = ensure_row_contiguous(scales_pre);
-  auto biases = ensure_row_contiguous(biases_pre);

  out.set_data(allocator::malloc(out.nbytes()));

-  auto& encoder = cpu::get_command_encoder(stream());
-  encoder.add_temporaries(std::move(temps));
  encoder.set_input_array(x);
  encoder.set_input_array(w);
  encoder.set_input_array(scales);
-  encoder.set_input_array(biases);
  encoder.set_output_array(out);
-  encoder.dispatch([out = array::unsafe_weak_copy(out),
-                    x = array::unsafe_weak_copy(x),
-                    w = array::unsafe_weak_copy(w),
-                    scales = array::unsafe_weak_copy(scales),
-                    biases = array::unsafe_weak_copy(biases),
-                    group_size_ = group_size_,
-                    bits_ = bits_,
-                    transpose_ = transpose_]() mutable {
-    _qmm_dispatch(out, x, w, scales, biases, group_size_, bits_, transpose_);
-  });
+  if (mode_ == QuantizationMode::Affine) {
+    auto biases = ensure_row_contiguous(inputs[3]);
+    encoder.set_input_array(biases);
+    encoder.dispatch([out = array::unsafe_weak_copy(out),
+                      x = array::unsafe_weak_copy(x),
+                      w = array::unsafe_weak_copy(w),
+                      scales = array::unsafe_weak_copy(scales),
+                      biases = array::unsafe_weak_copy(biases),
+                      group_size_ = group_size_,
+                      bits_ = bits_,
+                      transpose_ = transpose_]() mutable {
+      _qmm_dispatch(out, x, w, scales, biases, group_size_, bits_, transpose_);
+    });
+  } else {
+    encoder.dispatch([out = array::unsafe_weak_copy(out),
+                      x = array::unsafe_weak_copy(x),
+                      w = array::unsafe_weak_copy(w),
+                      scales = array::unsafe_weak_copy(scales),
+                      transpose_ = transpose_]() mutable {
+      mxfp4_qmm_dispatch(out, x, w, scales, transpose_);
+    });
+  }
 }

 void GatherQMM::eval_cpu(const std::vector<array>& inputs, array& out) {
-  assert(inputs.size() == 6);
-
  auto& x_pre = inputs[0];
  auto& w_pre = inputs[1];
  auto& scales_pre = inputs[2];
-  auto& biases_pre = inputs[3];
-  auto& lhs_indices = inputs[4];
-  auto& rhs_indices = inputs[5];
+  auto& lhs_indices = inputs[inputs.size() - 2];
+  auto& rhs_indices = inputs[inputs.size() - 1];

-  std::vector<array> temps;
+  auto& encoder = cpu::get_command_encoder(stream());
  auto ensure_row_contiguous_last_dims = [s = stream(),
-                                          &temps](const array& arr) {
+                                          &encoder](const array& arr) {
    auto stride_0 = arr.strides()[arr.ndim() - 2];
    auto stride_1 = arr.strides()[arr.ndim() - 1];
    if (stride_0 == arr.shape(-1) && stride_1 == 1) {
      return arr;
    } else {
-      temps.push_back(array(arr.shape(), arr.dtype(), nullptr, {}));
-      copy(arr, temps.back(), CopyType::General, s);
-      return temps.back();
+      auto arr_cpy = array(arr.shape(), arr.dtype(), nullptr, {});
+      copy_cpu(arr, arr_cpy, CopyType::General, s);
+      encoder.add_temporary(arr_cpy);
+      return arr_cpy;
    }
  };

  auto x = ensure_row_contiguous_last_dims(x_pre);
  auto w = ensure_row_contiguous_last_dims(w_pre);
  auto scales = ensure_row_contiguous_last_dims(scales_pre);
-  auto biases = ensure_row_contiguous_last_dims(biases_pre);

  out.set_data(allocator::malloc(out.nbytes()));

-  auto& encoder = cpu::get_command_encoder(stream());
-  encoder.add_temporaries(std::move(temps));
  encoder.set_input_array(x);
  encoder.set_input_array(w);
  encoder.set_input_array(scales);
-  encoder.set_input_array(biases);
  encoder.set_input_array(lhs_indices);
  encoder.set_input_array(rhs_indices);
  encoder.set_output_array(out);
-  encoder.dispatch([out = array::unsafe_weak_copy(out),
-                    x = array::unsafe_weak_copy(x),
-                    w = array::unsafe_weak_copy(w),
-                    scales = array::unsafe_weak_copy(scales),
-                    biases = array::unsafe_weak_copy(biases),
-                    lhs_indices = array::unsafe_weak_copy(lhs_indices),
-                    rhs_indices = array::unsafe_weak_copy(rhs_indices),
-                    group_size_ = group_size_,
-                    bits_ = bits_,
-                    transpose_ = transpose_]() mutable {
-    _bs_qmm_dispatch(
-        out,
-        x,
-        w,
-        scales,
-        biases,
-        lhs_indices,
-        rhs_indices,
-        group_size_,
-        bits_,
-        transpose_);
-  });
+  if (mode_ == QuantizationMode::Affine) {
+    auto biases = ensure_row_contiguous_last_dims(inputs[3]);
+    encoder.set_input_array(biases);
+    encoder.dispatch([out = array::unsafe_weak_copy(out),
+                      x = array::unsafe_weak_copy(x),
+                      w = array::unsafe_weak_copy(w),
+                      scales = array::unsafe_weak_copy(scales),
+                      biases = array::unsafe_weak_copy(biases),
+                      lhs_indices = array::unsafe_weak_copy(lhs_indices),
+                      rhs_indices = array::unsafe_weak_copy(rhs_indices),
+                      group_size_ = group_size_,
+                      bits_ = bits_,
+                      transpose_ = transpose_]() mutable {
+      _bs_qmm_dispatch(
+          out,
+          x,
+          w,
+          scales,
+          biases,
+          lhs_indices,
+          rhs_indices,
+          group_size_,
+          bits_,
+          transpose_);
+    });
+  } else {
+    encoder.dispatch([out = array::unsafe_weak_copy(out),
+                      x = array::unsafe_weak_copy(x),
+                      w = array::unsafe_weak_copy(w),
+                      scales = array::unsafe_weak_copy(scales),
+                      lhs_indices = array::unsafe_weak_copy(lhs_indices),
+                      rhs_indices = array::unsafe_weak_copy(rhs_indices),
+                      transpose_ = transpose_]() mutable {
+      mxfp4_bs_qmm_dispatch(
+          out, x, w, scales, lhs_indices, rhs_indices, transpose_);
+    });
+  }
 }

 template <typename T, typename U>
@@ -613,9 +970,8 @@ void quantize(
  float eps = 1e-7;

  bool power_of_2_bits = is_power_of_2(bits);
-  int el_per_int = bits == 3 ? 8 : bits == 6 ? 4 : 32 / bits;
-  // For 3/6 bits we read 3 uint8s at a time instead of 1 uint32
-  int bytes_per_pack = power_of_2_bits ? 1 : 3;
+  int el_per_int = get_pack_factor(bits, 32);
+  int bytes_per_pack = get_bytes_per_pack(bits);
  int int_per_group = group_size * bytes_per_pack / el_per_int;
  size_t n_groups = w_size / group_size;

@@ -640,15 +996,21 @@ void quantize(
    }
    size_t out_idx = i * int_per_group;
    for (int j = 0; j < int_per_group / bytes_per_pack; ++j) {
-      uint32_t out_el = 0;
+      uint64_t out_el = 0;
      for (int k = 0; k < el_per_int; ++k) {
        float w_el = w[w_idx + j * el_per_int + k];
        w_el = std::rint((w_el - bias) / scale);
        w_el = std::min(std::max(w_el, 0.0f), n_bins);
-        out_el |= static_cast<uint32_t>(w_el) << (k * bits);
+        out_el |= static_cast<uint64_t>(w_el) << (k * bits);
      }
      if (power_of_2_bits) {
        out[out_idx + j] = out_el;
+      } else if (bits == 5) {
+        out[out_idx + bytes_per_pack * j] = out_el & 0xff;
+        out[out_idx + bytes_per_pack * j + 1] = (out_el & 0xff00) >> 8;
+        out[out_idx + bytes_per_pack * j + 2] = (out_el & 0xff0000) >> 16;
+        out[out_idx + bytes_per_pack * j + 3] = (out_el & 0xff000000) >> 24;
+        out[out_idx + bytes_per_pack * j + 4] = (out_el & 0xff00000000) >> 32;
      } else {
        out[out_idx + bytes_per_pack * j] = out_el & 0xff;
        out[out_idx + bytes_per_pack * j + 1] = (out_el & 0xff00) >> 8;
@@ -676,16 +1038,14 @@ void dispatch_quantize(
      w_ptr, out_ptr, scales_ptr, biases_ptr, bits, group_size, w.size());
 }

-void fast::AffineQuantize::eval_cpu(
+void fast::Quantize::eval_cpu(
    const std::vector<array>& inputs,
    std::vector<array>& outputs) {
  auto ensure_row_contiguous = [s = stream()](const array& arr) {
    if (arr.flags().row_contiguous) {
      return std::make_pair(arr, false);
    } else {
-      array arr_copy(arr.shape(), arr.dtype(), nullptr, {});
-      copy(arr, arr_copy, CopyType::General, s);
-      return std::make_pair(arr_copy, true);
+      return std::make_pair(contiguous_copy_cpu(arr, s), true);
    }
  };

@@ -737,7 +1097,7 @@ void fast::AffineQuantize::eval_cpu(
      }
    } else {
      throw std::runtime_error(
-          "[fast::AffineQuantize::eval_cpu] Only supports floating point inputs");
+          "[fast::Quantize::eval_cpu] Only supports floating point inputs");
    }
  });
 }
--- a/mlx/backend/cpu/reduce.cpp
+++ b/mlx/backend/cpu/reduce.cpp
@@ -325,7 +325,15 @@ struct MaxReduce {
  };

  template <int N, typename T>
-  T operator()(simd::Simd<T, N> x) {
+  std::enable_if_t<std::is_integral_v<T>, T> operator()(simd::Simd<T, N> x) {
+    return simd::max(x); // integral overload: reduce the lanes directly
+  };
+
+  template <int N, typename T>
+  std::enable_if_t<!std::is_integral_v<T>, T> operator()(simd::Simd<T, N> x) {
+    if (simd::any(x != x)) { // x != x holds only in lanes containing NaN
+      return static_cast<T>(NAN); // any NaN lane makes the whole result NaN
+    }
    return simd::max(x);
  };
 };
@@ -342,7 +350,15 @@ struct MinReduce {
  };

  template <int N, typename T>
-  T operator()(simd::Simd<T, N> x) {
+  std::enable_if_t<std::is_integral_v<T>, T> operator()(simd::Simd<T, N> x) {
+    return simd::min(x); // integral overload: reduce the lanes directly
+  };
+
+  template <int N, typename T>
+  std::enable_if_t<!std::is_integral_v<T>, T> operator()(simd::Simd<T, N> x) {
+    if (simd::any(x != x)) { // x != x holds only in lanes containing NaN
+      return static_cast<T>(NAN); // any NaN lane makes the whole result NaN
+    }
    return simd::min(x);
  };
 };
@@ -475,19 +491,27 @@ void Reduce::eval_cpu(const std::vector<array>& inputs, array& out) {
        switch (in.dtype()) {
          case bool_:
          case uint8:
+            reduce_dispatch_sum_prod<uint8_t>(in, out, reduce_type_, axes_);
+            break;
+          case uint16:
+            reduce_dispatch_sum_prod<uint16_t>(in, out, reduce_type_, axes_);
+            break;
+          case uint32:
+            reduce_dispatch_sum_prod<uint32_t>(in, out, reduce_type_, axes_);
+            break;
+          case uint64:
+            reduce_dispatch_sum_prod<uint64_t>(in, out, reduce_type_, axes_);
+            break;
          case int8:
            reduce_dispatch_sum_prod<int8_t>(in, out, reduce_type_, axes_);
            break;
          case int16:
-          case uint16:
            reduce_dispatch_sum_prod<int16_t>(in, out, reduce_type_, axes_);
            break;
          case int32:
-          case uint32:
            reduce_dispatch_sum_prod<int32_t>(in, out, reduce_type_, axes_);
            break;
          case int64:
-          case uint64:
            reduce_dispatch_sum_prod<int64_t>(in, out, reduce_type_, axes_);
            break;
          case float16:
@@ -527,10 +551,10 @@ void Reduce::eval_cpu(const std::vector<array>& inputs, array& out) {
            reduce_dispatch_min_max<uint64_t>(in, out, reduce_type_, axes_);
            break;
          case int8:
-            reduce_dispatch_min_max<uint8_t>(in, out, reduce_type_, axes_);
+            reduce_dispatch_min_max<int8_t>(in, out, reduce_type_, axes_);
            break;
          case int16:
-            reduce_dispatch_min_max<uint16_t>(in, out, reduce_type_, axes_);
+            reduce_dispatch_min_max<int16_t>(in, out, reduce_type_, axes_);
            break;
          case int32:
            reduce_dispatch_min_max<int32_t>(in, out, reduce_type_, axes_);
--- a/mlx/backend/cpu/scan.cpp
+++ b/mlx/backend/cpu/scan.cpp
@@ -250,10 +250,8 @@ void Scan::eval_cpu(const std::vector<array>& inputs, array& out) {
  // Ensure contiguity
  auto in = inputs[0];
  if (!in.flags().row_contiguous) {
-    array arr_copy(in.shape(), in.dtype(), nullptr, {});
-    copy(in, arr_copy, CopyType::General, stream());
-    in = arr_copy;
-    encoder.add_temporary(arr_copy);
+    in = contiguous_copy_cpu(in, stream());
+    encoder.add_temporary(in);
  }
  out.set_data(allocator::malloc(out.nbytes()));

--- a/mlx/backend/cpu/simd/accelerate_simd.h
+++ b/mlx/backend/cpu/simd/accelerate_simd.h
@@ -9,7 +9,7 @@

 #include "mlx/backend/cpu/simd/base_simd.h"

-// There seems to be a bug in sims/base.h
+// There seems to be a bug in simd/base_simd.h
 // __XROS_2_0 is not defined, the expression evaluates
 // to true instead of false setting the SIMD library
 // higher than it should be even on macOS < 15
@@ -234,6 +234,7 @@ Simd<T, N> remainder(Simd<T, N> a, Simd<T, N> b) {

 template <typename MaskT, typename T1, typename T2, int N>
 Simd<T1, N> select(Simd<MaskT, N> mask, Simd<T1, N> x, Simd<T2, N> y) {
+  static_assert(std::is_same_v<MaskT, bool>);
  if constexpr (sizeof(T1) == 1) {
    return asd::bitselect(y.value, x.value, asd::convert<char>(mask.value));
  } else if constexpr (sizeof(T1) == 2) {
@@ -251,9 +252,13 @@ Simd<T, N> pow(Simd<T, N> base, Simd<T, N> exp) {
    return asd::pow(base.value, exp.value);
  } else {
    Simd<T, N> res = 1;
-    while (any(exp)) {
-      res = select(exp & 1, res * base, res);
-      base = select(exp, base * base, base);
+    // Raising an integer to a negative power is undefined
+    if (any(exp < static_cast<T>(0))) {
+      return 0;
+    }
+    while (any(exp > static_cast<T>(0))) {
+      res = select((exp & 1) != 0, res * base, res);
+      base = select(exp > static_cast<T>(0), base * base, base);
      exp = exp >> 1;
    }
    return res;
--- a/mlx/backend/cpu/simd/math.h
+++ b/mlx/backend/cpu/simd/math.h
@@ -79,7 +79,8 @@ Simd<T, N> sincos(Simd<T, N> in) {

  // Get the polynom selection mask. There is one polynom for 0 <= x <= Pi/4
  // and another one for Pi/4<x<=Pi/2. Both branches will be computed.
-  auto poly_mask = (emm2 & 2) != 0;
+  auto poly_mask =
+      (emm2 & static_cast<uint32_t>(2)) != static_cast<uint32_t>(0);

  // The magic pass: "Extended precision modular arithmetic"
  // x = ((x - y * DP1) - y * DP2) - y * DP3
@@ -87,8 +88,8 @@ Simd<T, N> sincos(Simd<T, N> in) {
  x = fma(y, Simd<float, N>(-2.4187564849853515625e-4f), x);
  x = fma(y, Simd<float, N>(-3.77489497744594108e-8f), x);

-  sign_mask_sin = sign_mask_sin ^ ((emm2 & 4) != 0);
-  auto sign_mask_cos = ((emm2 - 2) & 4) != 0;
+  sign_mask_sin = sign_mask_sin ^ ((emm2 & 4) != static_cast<uint32_t>(0));
+  auto sign_mask_cos = ((emm2 - 2) & 4) != static_cast<uint32_t>(0);

  // Evaluate the first polynom  (0 <= x <= Pi/4) in y1,
-  // and the second polynom      (Pi/4 <= x <= 0) in y2
+  // and the second polynom      (Pi/4 < x <= Pi/2) in y2
--- a/mlx/backend/cpu/softmax.cpp
+++ b/mlx/backend/cpu/softmax.cpp
@@ -131,8 +131,7 @@ void Softmax::eval_cpu(const std::vector<array>& inputs, array& out) {
      }
      return x;
    } else {
-      array x_copy(x.shape(), x.dtype(), nullptr, {});
-      copy(x, x_copy, CopyType::General, s);
+      array x_copy = contiguous_copy_cpu(x, s);
      out.copy_shared_buffer(x_copy);
      return x_copy;
    }
--- a/mlx/backend/cpu/sort.cpp
+++ b/mlx/backend/cpu/sort.cpp
@@ -8,13 +8,25 @@
 #include "mlx/backend/common/utils.h"
 #include "mlx/backend/cpu/copy.h"
 #include "mlx/backend/cpu/encoder.h"
-
+#include "mlx/dtype_utils.h"
 #include "mlx/primitives.h"

 namespace mlx::core {

 namespace {

+// NaN-aware "less" comparator: a NaN operand never compares less, so NaNs sort last
+template <typename T>
+bool nan_aware_less(T a, T b) {
+  if constexpr (std::is_floating_point_v<T> || std::is_same_v<T, complex64_t>) {
+    if (std::isnan(a)) // a NaN on the left is never "less"
+      return false;
+    if (std::isnan(b)) // every non-NaN value sorts before a NaN
+      return true;
+  } // NOTE(review): confirm float16_t/bfloat16_t satisfy the type test above
+  return a < b; // ordinary comparison for non-NaN values and all other types
+}
+
 template <typename T>
 struct StridedIterator {
  using iterator_category = std::random_access_iterator_tag;
@@ -27,7 +39,7 @@ struct StridedIterator {
  StridedIterator() = default;

  explicit StridedIterator(T* ptr, int64_t stride, difference_type offset = 0)
-      : ptr_(ptr + offset * stride), stride_(stride) {}
+      : stride_(stride), ptr_(ptr + offset * stride) {}

  explicit StridedIterator(array& arr, int axis, difference_type offset = 0)
      : StridedIterator(arr.data<T>(), arr.strides()[axis], offset) {}
@@ -108,8 +120,8 @@ template <typename T>
 void sort(array& out, int axis) {
  // Get axis, shape and stride info
  axis = axis < 0 ? axis + out.ndim() : axis;
-  size_t in_size = out.size();
-  size_t n_rows = in_size / out.shape(axis);
+  int64_t in_size = out.size();
+  int64_t n_rows = in_size / out.shape(axis);

  auto remaining_shape = out.shape();
  remaining_shape.erase(remaining_shape.begin() + axis);
@@ -124,13 +136,13 @@ void sort(array& out, int axis) {
  ContiguousIterator src_it(
      remaining_shape, remaining_strides, remaining_shape.size());
  auto out_ptr = out.data<T>();
-  for (int i = 0; i < n_rows; i++) {
+  for (int64_t i = 0; i < n_rows; i++) {
    T* data_ptr = out_ptr + src_it.loc;

    StridedIterator st(data_ptr, axis_stride, 0);
    StridedIterator ed(data_ptr, axis_stride, axis_size);

-    std::stable_sort(st, ed);
+    std::stable_sort(st, ed, nan_aware_less<T>);
    src_it.step();
  }
 }
@@ -139,7 +151,7 @@ template <typename T, typename IdxT = uint32_t>
 void argsort(const array& in, array& out, int axis) {
  // Get axis, shape and stride info
  axis = axis < 0 ? axis + in.ndim() : axis;
-  size_t n_rows = in.size() / in.shape(axis);
+  int64_t n_rows = in.size() / in.shape(axis);

  auto in_remaining_shape = in.shape();
  in_remaining_shape.erase(in_remaining_shape.begin() + axis);
@@ -164,7 +176,7 @@ void argsort(const array& in, array& out, int axis) {
      out_remaining_shape, out_remaining_strides, out_remaining_shape.size());
  auto in_ptr = in.data<T>();
  auto out_ptr = out.data<IdxT>();
-  for (int i = 0; i < n_rows; i++) {
+  for (int64_t i = 0; i < n_rows; i++) {
    const T* data_ptr = in_ptr + in_it.loc;
    IdxT* idx_ptr = out_ptr + out_it.loc;

@@ -184,6 +196,16 @@ void argsort(const array& in, array& out, int axis) {
    std::stable_sort(st, ed, [data_ptr, in_stride](IdxT a, IdxT b) {
      auto v1 = data_ptr[a * in_stride];
      auto v2 = data_ptr[b * in_stride];
+
+      // Place NaNs at the end. Uses the same compile-time type test as
+      // nan_aware_less so complex64_t gets the same NaN ordering as sort().
+      if constexpr (
+          std::is_floating_point_v<T> || std::is_same_v<T, complex64_t>) {
+        if (std::isnan(v1))
+          return false;
+        if (std::isnan(v2))
+          return true;
+      }
+
      return v1 < v2 || (v1 == v2 && a < b);
    });
  }
@@ -193,8 +214,8 @@ template <typename T>
 void partition(array& out, int axis, int kth) {
  // Get axis, shape and stride info
  axis = axis < 0 ? axis + out.ndim() : axis;
-  size_t in_size = out.size();
-  size_t n_rows = in_size / out.shape(axis);
+  int64_t in_size = out.size();
+  int64_t n_rows = in_size / out.shape(axis);

  auto remaining_shape = out.shape();
  remaining_shape.erase(remaining_shape.begin() + axis);
@@ -211,7 +232,7 @@ void partition(array& out, int axis, int kth) {
  ContiguousIterator src_it(
      remaining_shape, remaining_strides, remaining_shape.size());
  auto out_ptr = out.data<T>();
-  for (int i = 0; i < n_rows; i++) {
+  for (int64_t i = 0; i < n_rows; i++) {
    T* data_ptr = out_ptr + src_it.loc;
    src_it.step();

@@ -219,7 +240,7 @@ void partition(array& out, int axis, int kth) {
    StridedIterator md(data_ptr, axis_stride, kth);
    StridedIterator ed(data_ptr, axis_stride, axis_size);

-    std::nth_element(st, md, ed);
+    std::nth_element(st, md, ed, nan_aware_less<T>);
  }
 }

@@ -227,7 +248,7 @@ template <typename T, typename IdxT = uint32_t>
 void argpartition(const array& in, array& out, int axis, int kth) {
  // Get axis, shape and stride info
  axis = axis < 0 ? axis + in.ndim() : axis;
-  size_t n_rows = in.size() / in.shape(axis);
+  int64_t n_rows = in.size() / in.shape(axis);

  auto in_remaining_shape = in.shape();
  in_remaining_shape.erase(in_remaining_shape.begin() + axis);
@@ -256,7 +277,7 @@ void argpartition(const array& in, array& out, int axis, int kth) {
  auto in_ptr = in.data<T>();
  auto out_ptr = out.data<IdxT>();

-  for (int i = 0; i < n_rows; i++) {
+  for (int64_t i = 0; i < n_rows; i++) {
    const T* data_ptr = in_ptr + in_it.loc;
    IdxT* idx_ptr = out_ptr + out_it.loc;
    in_it.step();
@@ -276,6 +297,16 @@ void argpartition(const array& in, array& out, int axis, int kth) {
    std::nth_element(st, md, ed, [data_ptr, in_stride](IdxT a, IdxT b) {
      auto v1 = data_ptr[a * in_stride];
      auto v2 = data_ptr[b * in_stride];
+
+      // Place NaNs at the end. Uses the same compile-time type test as
+      // nan_aware_less so complex64_t gets the same NaN ordering as partition().
+      if constexpr (
+          std::is_floating_point_v<T> || std::is_same_v<T, complex64_t>) {
+        if (std::isnan(v1))
+          return false;
+        if (std::isnan(v2))
+          return true;
+      }
+
      return v1 < v2 || (v1 == v2 && a < b);
    });
  }
@@ -333,45 +363,24 @@ void Sort::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 1);
  auto& in = inputs[0];

+  int axis = axis_;
+  if (axis < 0) {
+    axis += in.ndim();
+  }
+
  // Copy input to output
-  CopyType ctype = in.flags().contiguous ? CopyType::Vector : CopyType::General;
-  copy(in, out, ctype, stream());
+  CopyType ctype = (in.flags().contiguous && in.strides()[axis] != 0)
+      ? CopyType::Vector
+      : CopyType::General;
+  copy_cpu(in, out, ctype, stream());

  auto& encoder = cpu::get_command_encoder(stream());
  encoder.set_output_array(out);
-  encoder.dispatch(
-      [out = array::unsafe_weak_copy(out), axis_ = axis_]() mutable {
-        switch (out.dtype()) {
-          case bool_:
-            return sort<bool>(out, axis_);
-          case uint8:
-            return sort<uint8_t>(out, axis_);
-          case uint16:
-            return sort<uint16_t>(out, axis_);
-          case uint32:
-            return sort<uint32_t>(out, axis_);
-          case uint64:
-            return sort<uint64_t>(out, axis_);
-          case int8:
-            return sort<int8_t>(out, axis_);
-          case int16:
-            return sort<int16_t>(out, axis_);
-          case int32:
-            return sort<int32_t>(out, axis_);
-          case int64:
-            return sort<int64_t>(out, axis_);
-          case float32:
-            return sort<float>(out, axis_);
-          case float64:
-            return sort<double>(out, axis_);
-          case float16:
-            return sort<float16_t>(out, axis_);
-          case bfloat16:
-            return sort<bfloat16_t>(out, axis_);
-          case complex64:
-            return sort<complex64_t>(out, axis_);
-        }
-      });
+  encoder.dispatch([out = array::unsafe_weak_copy(out), axis]() mutable {
+    dispatch_all_types(out.dtype(), [&](auto type_tag) {
+      sort<MLX_GET_TYPE(type_tag)>(out, axis);
+    });
+  });
 }

 void ArgPartition::eval_cpu(const std::vector<array>& inputs, array& out) {
@@ -426,8 +435,14 @@ void Partition::eval_cpu(const std::vector<array>& inputs, array& out) {
  auto& in = inputs[0];

  // Copy input to output
-  CopyType ctype = in.flags().contiguous ? CopyType::Vector : CopyType::General;
-  copy(in, out, ctype, stream());
+  // axis_ may be negative (partition() wraps it too), so wrap it before
+  // indexing the strides; a zero stride on that axis selects a General copy.
+  CopyType ctype =
+      (in.flags().contiguous &&
+       in.strides()[axis_ < 0 ? axis_ + in.ndim() : axis_] != 0)
+      ? CopyType::Vector
+      : CopyType::General;
+  copy_cpu(in, out, ctype, stream());

  auto& encoder = cpu::get_command_encoder(stream());
  encoder.set_output_array(out);
--- a/mlx/backend/cpu/svd.cpp
+++ b/mlx/backend/cpu/svd.cpp
@@ -27,11 +27,11 @@ void svd_impl(
  const int N = a.shape(-1);
  const int K = std::min(M, N);

-  size_t num_matrices = a.size() / (M * N);
+  int64_t num_matrices = a.size() / (M * N);

  // lapack clobbers the input, so we have to make a copy.
  array in(a.shape(), a.dtype(), nullptr, {});
-  copy(
+  copy_cpu(
      a,
      in,
      a.flags().row_contiguous ? CopyType::Vector : CopyType::General,
@@ -81,40 +81,26 @@ void svd_impl(
    // Vᵀ of shape N x N. (M x M in lapack).
    const int ldvt = M;

-    auto job_u = (u_ptr) ? "V" : "N";
-    auto job_vt = (u_ptr) ? "V" : "N";
-    static constexpr auto range = "A";
+    auto jobz = (u_ptr) ? "A" : "N";

-    // Will contain the number of singular values after the call has returned.
-    int ns = 0;
    T workspace_dimension = 0;

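-    // Will contain the indices of eigenvectors that failed to converge (not
-    // used here but required by lapack).
+    // Integer workspace that gesdd requires (8 * min(M, N) entries); not used
+    // here but required by lapack.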
-    auto iwork = array::Data{allocator::malloc(sizeof(int) * 12 * K)};
+    auto iwork = array::Data{allocator::malloc(sizeof(int) * 8 * K)};

    static const int lwork_query = -1;

-    static const int ignored_int = 0;
-    static const T ignored_float = 0;
-
    int info;

    // Compute workspace size.
-    gesvdx<T>(
-        /* jobu = */ job_u,
-        /* jobvt = */ job_vt,
-        /* range = */ range,
+    gesdd<T>(
+        /* jobz = */ jobz,
        // M and N are swapped since lapack expects column-major.
        /* m = */ &N,
        /* n = */ &M,
        /* a = */ nullptr,
        /* lda = */ &lda,
-        /* vl = */ &ignored_float,
-        /* vu = */ &ignored_float,
-        /* il = */ &ignored_int,
-        /* iu = */ &ignored_int,
-        /* ns = */ &ns,
        /* s = */ nullptr,
        /* u = */ nullptr,
        /* ldu = */ &ldu,
@@ -135,21 +121,14 @@ void svd_impl(
    auto scratch = array::Data{allocator::malloc(sizeof(T) * lwork)};

    // Loop over matrices.
-    for (int i = 0; i < num_matrices; i++) {
-      gesvdx<T>(
-          /* jobu = */ job_u,
-          /* jobvt = */ job_vt,
-          /* range = */ range,
+    for (int64_t i = 0; i < num_matrices; i++) {
+      gesdd<T>(
+          /* jobz = */ jobz,
          // M and N are swapped since lapack expects column-major.
          /* m = */ &N,
          /* n = */ &M,
          /* a = */ in_ptr + M * N * i,
          /* lda = */ &lda,
-          /* vl = */ &ignored_float,
-          /* vu = */ &ignored_float,
-          /* il = */ &ignored_int,
-          /* iu = */ &ignored_int,
-          /* ns = */ &ns,
          /* s = */ s_ptr + K * i,
          // According to the identity above, lapack will write Vᵀᵀ as U.
          /* u = */ vt_ptr ? vt_ptr + N * N * i : nullptr,
@@ -167,13 +146,6 @@ void svd_impl(
        ss << "svd_impl: sgesvdx_ failed with code " << info;
        throw std::runtime_error(ss.str());
      }
-
-      if (ns != K) {
-        std::stringstream ss;
-        ss << "svd_impl: expected " << K << " singular values, but " << ns
-           << " were computed.";
-        throw std::runtime_error(ss.str());
-      }
    }
  });
  encoder.add_temporary(in);
@@ -181,10 +153,10 @@ void svd_impl(

 template <typename T>
 void compute_svd(
-    const array& a,
-    bool compute_uv,
-    std::vector<array>& outputs,
-    Stream stream) {}
+    const array& /* a */,
+    bool /* compute_uv */,
+    std::vector<array>& /* outputs */,
+    Stream /* stream */) {} // empty body: none of the four arguments is used

 void SVD::eval_cpu(
    const std::vector<array>& inputs,
--- a/mlx/backend/cpu/ternary.h
+++ b/mlx/backend/cpu/ternary.h
@@ -136,7 +136,7 @@ void ternary_op(
  if (topt == TernaryOpType::ScalarScalarScalar) {
    *out_ptr = op(*a_ptr, *b_ptr, *c_ptr);
  } else if (topt == TernaryOpType::VectorVectorVector) {
-    for (size_t i = 0; i < out.size(); ++i) {
+    for (int64_t i = 0; i < out.size(); ++i) {
      *out_ptr = op(*a_ptr, *b_ptr, *c_ptr);
      a_ptr++;
      b_ptr++;
--- a/mlx/backend/cpu/unary.h
+++ b/mlx/backend/cpu/unary.h
@@ -2,35 +2,16 @@

 #pragma once

-#include "mlx/allocator.h"
-#include "mlx/array.h"
-#include "mlx/backend/common/utils.h"
+#include "mlx/backend/common/unary.h"
 #include "mlx/backend/cpu/encoder.h"
 #include "mlx/backend/cpu/simd/simd.h"
 #include "mlx/utils.h"

 namespace mlx::core {

-void set_unary_output_data(const array& in, array& out) {
-  if (in.flags().contiguous) {
-    if (is_donatable(in, out)) {
-      out.copy_shared_buffer(in);
-    } else {
-      auto size = in.data_size();
-      out.set_data(
-          allocator::malloc(size * out.itemsize()),
-          size,
-          in.strides(),
-          in.flags());
-    }
-  } else {
-    out.set_data(allocator::malloc(out.nbytes()));
-  }
-}
-
 template <typename T, typename U = T, typename Op>
-void unary_op(const T* a, U* out, size_t shape, size_t stride) {
-  for (size_t i = 0; i < shape; i += 1) {
+void unary_op(const T* a, U* out, int64_t shape, int64_t stride) {
+  for (int64_t i = 0; i < shape; i += 1) {
    out[i] = Op{}(*a);
    a += stride;
  }
@@ -57,14 +38,14 @@ void unary_op(const array& a, array& out, Op) {
      src++;
    }
  } else {
-    size_t shape = ndim > 0 ? a.shape().back() : 1;
-    size_t stride = ndim > 0 ? a.strides().back() : 1;
+    int64_t shape = ndim > 0 ? a.shape().back() : 1;
+    int64_t stride = ndim > 0 ? a.strides().back() : 1;
    if (ndim <= 1) {
      unary_op<T, U, Op>(src, dst, shape, stride);
      return;
    }
    auto it = ContiguousIterator(a.shape(), a.strides(), ndim - 1);
-    for (size_t elem = 0; elem < a.size(); elem += shape) {
+    for (int64_t elem = 0; elem < a.size(); elem += shape) {
      unary_op<T, U, Op>(src + it.loc, dst + elem, shape, stride);
      it.step();
    }
--- a/mlx/backend/cpu/unary_ops.h
+++ b/mlx/backend/cpu/unary_ops.h
@@ -77,7 +77,8 @@ struct Real {
 struct Sigmoid {
  template <int N, typename T>
  Simd<T, N> operator()(Simd<T, N> x) {
-    return 1.0f / (1.0f + simd::exp(-x));
+    auto y = 1.0f / (1.0f + simd::exp(simd::abs(x))); // y = 1 / (1 + e^|x|)
+    return simd::select(x < Simd<T, N>{0}, y, Simd<T, N>{1} - y); // y for x < 0, else 1 - y
  }
  SINGLE()
 };
--- a/Show More
+++ b/Show More