Fix four-step FFT

Add single-kernel Bluestein FFT
Refactor four-step FFT
2025-12-16 01:49:05 +08:00 · 2025-05-08 14:14:59 -07:00 · 2025-05-08 13:23:11 -07:00 · 2025-05-08 13:23:11 -07:00 · 2025-05-08 13:23:11 -07:00 · 2025-05-08 13:23:11 -07:00
280 changed files with 5463 additions and 19019 deletions
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -7,6 +7,15 @@ parameters:
  nightly_build:
    type: boolean
    default: false
+  weekly_build:
+    type: boolean
+    default: false
+  test_release:
+    type: boolean
+    default: false
+  linux_release:
+    type: boolean
+    default: false

 jobs:
  build_documentation:
@@ -29,7 +38,7 @@ jobs:
            pip install --upgrade pip
            pip install --upgrade cmake
            pip install -r docs/requirements.txt
-            pip install . -v
+            CMAKE_BUILD_PARALLEL_LEVEL=`sysctl -n hw.ncpu` pip install . -v
      - when:
          condition:
            not: << parameters.upload-docs >>
@@ -61,9 +70,9 @@ jobs:
                 git push -f origin gh-pages

  linux_build_and_test:
-    machine:
-      image: ubuntu-2204:current
-      resource_class: large
+    docker:
+      - image: cimg/python:3.9
+
    steps:
      - checkout
      - run:
@@ -75,34 +84,37 @@ jobs:
      - run:
          name: Install dependencies
          command: |
-            export DEBIAN_FRONTEND=noninteractive
-            export NEEDRESTART_MODE=a
-            sudo apt-get update
-            sudo apt-get upgrade -y
            pip install --upgrade cmake
-            sudo apt-get install -y libblas-dev liblapack-dev liblapacke-dev
+            pip install nanobind==2.4.0
+            pip install numpy
+            sudo apt-get update
+            sudo apt-get install libblas-dev liblapack-dev liblapacke-dev
            sudo apt-get install openmpi-bin openmpi-common libopenmpi-dev
      - run:
          name: Install Python package
          command: |
-            pip install -e ".[dev]"
+            CMAKE_ARGS="-DMLX_BUILD_METAL=OFF" \
+              CMAKE_BUILD_PARALLEL_LEVEL=`nproc` \
+              python3 setup.py build_ext --inplace
+            CMAKE_ARGS="-DMLX_BUILD_METAL=OFF" \
+              CMAKE_BUILD_PARALLEL_LEVEL=`nproc` \
+              python3 setup.py develop
      - run:
          name: Generate package stubs
          command: |
            echo "stubs"
            pip install typing_extensions
-            python setup.py generate_stubs
+            python setup.py generate_stubs 
      - run:
          name: Run Python tests
          command: |
-            python -m unittest discover python/tests -v
+            python3 -m unittest discover python/tests -v
            mpirun --bind-to none -host localhost:8 -np 8 python python/tests/mpi_test_distributed.py
-            mlx.launch --verbose -n 8 python/tests/ring_test_distributed.py -v 2> >(tee -a stderr.log >&2)
-            if $(grep "\[WARN\]" stderr.log); then echo "Distributed ring test failed"; exit 1; fi
+            mlx.launch --verbose -n 8 python/tests/ring_test_distributed.py
      - run:
          name: Build CPP only
          command: |
-            mkdir -p build && cd build
+            mkdir -p build && cd build 
            cmake .. -DMLX_BUILD_METAL=OFF -DCMAKE_BUILD_TYPE=DEBUG
            make -j `nproc`
      - run:
@@ -142,14 +154,15 @@ jobs:
          name: Install Python package
          command: |
            source env/bin/activate
-            DEBUG=1 CMAKE_ARGS="-DCMAKE_COMPILE_WARNING_AS_ERROR=ON" \
+            DEBUG=1 CMAKE_BUILD_PARALLEL_LEVEL=`sysctl -n hw.ncpu` \
+            CMAKE_ARGS="-DCMAKE_COMPILE_WARNING_AS_ERROR=ON" \
              pip install -e . -v
      - run:
          name: Generate package stubs
          command: |
            source env/bin/activate
            pip install typing_extensions
-            python setup.py generate_stubs
+            python setup.py generate_stubs 
      - run:
          name: Run Python tests
          command: |
@@ -157,8 +170,7 @@ jobs:
            LOW_MEMORY=1 DEVICE=cpu python -m xmlrunner discover -v python/tests -o test-results/cpu
            LOW_MEMORY=1 DEVICE=gpu METAL_DEVICE_WRAPPER_TYPE=1 METAL_DEBUG_ERROR_MODE=0 python -m xmlrunner discover -v python/tests -o test-results/gpu
            mpirun --bind-to none -host localhost:8 -np 8 -x DYLD_LIBRARY_PATH=/opt/homebrew/lib/ python python/tests/mpi_test_distributed.py
-            mlx.launch --verbose -n 8 python/tests/ring_test_distributed.py -v 2> >(tee -a stderr.log >&2)
-            if $(grep "\[WARN\]" stderr.log); then echo "Distributed ring test failed"; exit 1; fi
+            mlx.launch --verbose -n 8 python/tests/ring_test_distributed.py
      - run:
          name: Build example extension
          command: |
@@ -193,34 +205,13 @@ jobs:
          name: Run Python tests with JIT
          command: |
            source env/bin/activate
-            CMAKE_ARGS="-DMLX_METAL_JIT=ON" \
+            CMAKE_BUILD_PARALLEL_LEVEL=`sysctl -n hw.ncpu` \
+              CMAKE_ARGS="-DMLX_METAL_JIT=ON" \
              pip install -e . -v
            LOW_MEMORY=1 DEVICE=gpu METAL_DEVICE_WRAPPER_TYPE=1 \
              METAL_DEBUG_ERROR_MODE=0 \
              python -m xmlrunner discover -v python/tests -o test-results/gpu_jit

-  cuda_build_and_test:
-    machine:
-      image: linux-cuda-12:default
-      resource_class: gpu.nvidia.small.gen2
-    steps:
-      - checkout
-      - run:
-          name: Install Python package
-          command: |
-            sudo apt-get update
-            sudo apt-get install libblas-dev liblapack-dev liblapacke-dev
-            python -m venv env
-            source env/bin/activate
-            CMAKE_ARGS="-DMLX_BUILD_CUDA=ON -DCMAKE_CUDA_COMPILER=`which nvcc`" \
-              pip install -e ".[dev]"
-      - run:
-          name: Run Python tests
-          command: |
-            source env/bin/activate
-            LOW_MEMORY=1 DEVICE=cpu python -m unittest discover python/tests -v
-            LOW_MEMORY=1 DEVICE=gpu python -m tests discover python/tests -v
-
  build_release:
    parameters:
      python_version:
@@ -261,28 +252,21 @@ jobs:
          command: |
            source env/bin/activate
            env -u MACOSX_DEPLOYMENT_TARGET DEV_RELEASE=1 \
+              CMAKE_BUILD_PARALLEL_LEVEL=`sysctl -n hw.ncpu` \
              pip install . -v
      - run:
          name: Generate package stubs
          command: |
            source env/bin/activate
            pip install typing_extensions
-            python setup.py generate_stubs
+            python setup.py generate_stubs 
      - run:
          name: Build Python package
          command: |
            source env/bin/activate
-            << parameters.build_env >> MLX_BUILD_STAGE=1 python -m build -w
-      - when:
-          condition:
-            equal: ["3.9", << parameters.python_version >>]
-          steps:
-            - run:
-                name: Build common package
-                command: |
-                  source env/bin/activate
-                  python setup.py clean --all
-                  << parameters.build_env >> MLX_BUILD_STAGE=2 python -m build -w
+            << parameters.build_env >> \
+              CMAKE_BUILD_PARALLEL_LEVEL=`sysctl -n hw.ncpu` \
+              python -m build -w
      - when:
          condition: << parameters.build_env >>
          steps:
@@ -299,99 +283,52 @@ jobs:
      python_version:
        type: string
        default: "3.9"
-      build_env:
+      extra_env:
        type: string
-        default: ""
-    machine:
-      image: ubuntu-2204:current
-      resource_class: large
+        default: "DEV_RELEASE=1"
+    docker:
+      - image: ubuntu:20.04
    steps:
      - checkout
      - run:
          name: Build wheel
          command: |
            PYTHON=python<< parameters.python_version >>
-            export DEBIAN_FRONTEND=noninteractive
-            export NEEDRESTART_MODE=a
-            sudo apt-get update
-            sudo apt-get upgrade -y
-            TZ=Etc/UTC sudo apt-get -y install tzdata
-            sudo apt-get install -y apt-utils
-            sudo apt-get install -y software-properties-common
-            sudo add-apt-repository -y ppa:deadsnakes/ppa
-            sudo apt-get install -y $PYTHON $PYTHON-dev $PYTHON-full
-            sudo apt-get install -y libblas-dev liblapack-dev liblapacke-dev
-            sudo apt-get install -y build-essential git
+            apt-get update
+            apt-get upgrade -y
+            DEBIAN_FRONTEND=noninteractive TZ=Etc/UTC apt-get -y install tzdata
+            apt-get install -y apt-utils
+            apt-get install -y software-properties-common
+            add-apt-repository -y ppa:deadsnakes/ppa
+            apt-get install -y $PYTHON $PYTHON-dev $PYTHON-full
+            apt-get install -y libblas-dev liblapack-dev liblapacke-dev
+            apt-get install -y build-essential git
            $PYTHON -m venv env
            source env/bin/activate
            pip install --upgrade pip
            pip install --upgrade cmake
+            pip install nanobind==2.4.0
+            pip install --upgrade setuptools
+            pip install numpy
            pip install auditwheel
            pip install patchelf
            pip install build
            pip install twine
-            << parameters.build_env >> pip install ".[dev]" -v
+            << parameters.extra_env >> \
+              CMAKE_BUILD_PARALLEL_LEVEL=`nproc` \
+              pip install . -v
            pip install typing_extensions
-            python setup.py generate_stubs
-            MLX_BUILD_STAGE=1 << parameters.build_env >> python -m build -w
-            bash python/scripts/repair_linux.sh
-      - when:
-          condition:
-            equal: ["3.9", << parameters.python_version >>]
-          steps:
-            - run:
-                name: Build common package
-                command: |
-                  source env/bin/activate
-                  python setup.py clean --all
-                  << parameters.build_env >> MLX_BUILD_STAGE=2 \
-                    python -m build -w
-                  auditwheel repair dist/mlx_cpu*.whl --plat manylinux_2_35_x86_64
-      - when:
-          condition: << parameters.build_env >>
-          steps:
-            - run:
-                name: Upload packages
-                command: |
-                  source env/bin/activate
-                  twine upload wheelhouse/*.whl
-      - store_artifacts:
-          path: wheelhouse/
-
-  build_cuda_release:
-    parameters:
-      build_env:
-        type: string
-        default: ""
-    machine:
-      image: linux-cuda-12:default
-      resource_class: gpu.nvidia.small.gen2
-    steps:
-      - checkout
+            python setup.py generate_stubs 
+            << parameters.extra_env >> \
+              CMAKE_BUILD_PARALLEL_LEVEL=`nproc` \
+              python -m build --wheel
+            auditwheel show dist/*
+            auditwheel repair dist/* --plat manylinux_2_31_x86_64
      - run:
-          name: Build wheel
+          name: Upload package
          command: |
-            sudo apt-get update
-            sudo apt-get install libblas-dev liblapack-dev liblapacke-dev
-            sudo apt-get install zip
-            python -m venv env
            source env/bin/activate
-            pip install auditwheel
-            pip install patchelf
-            pip install build
-            pip install twine
-            << parameters.build_env >> MLX_BUILD_STAGE=2 \
-              CMAKE_ARGS="-DMLX_BUILD_CUDA=ON -DCMAKE_CUDA_COMPILER=`which nvcc`" \
-              python -m build -w
-            bash python/scripts/repair_cuda.sh
-      - when:
-          condition: << parameters.build_env >>
-          steps:
-            - run:
-                name: Upload package
-                command: |
-                  source env/bin/activate
-                  twine upload wheelhouse/*.whl
+            twine upload wheelhouse/*
      - store_artifacts:
          path: wheelhouse/

@@ -403,19 +340,22 @@ workflows:
            pattern: "^(?!pull/)[-\\w]+$"
            value: << pipeline.git.branch >>
        - not: << pipeline.parameters.nightly_build >>
+        - not: << pipeline.parameters.weekly_build >>
+        - not: << pipeline.parameters.test_release >>
    jobs:
      - mac_build_and_test:
          matrix:
            parameters:
              macosx_deployment_target: ["13.5", "14.0"]
      - linux_build_and_test
-      - cuda_build_and_test 
      - build_documentation 

  build_pypi_release:
    when:
      and:
        - not: << pipeline.parameters.nightly_build >>
+        - not: << pipeline.parameters.weekly_build >>
+        - not: << pipeline.parameters.test_release >>
    jobs:
      - build_release:
          filters:
@@ -497,25 +437,6 @@ workflows:
            branches:
              ignore: /.*/
          upload-docs: true
-      - build_linux_release:
-          filters:
-            tags:
-              only: /^v.*/
-            branches:
-              ignore: /.*/
-          matrix:
-            parameters:
-              python_version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
-              build_env: ["PYPI_RELEASE=1"]
-      - build_cuda_release:
-          filters:
-            tags:
-              only: /^v.*/
-            branches:
-              ignore: /.*/
-          matrix:
-            parameters:
-              build_env: ["PYPI_RELEASE=1"]

  prb:
    when:
@@ -534,8 +455,6 @@ workflows:
              macosx_deployment_target: ["13.5", "14.0"]
      - linux_build_and_test:
          requires: [ hold ]
-      - cuda_build_and_test:
-          requires: [ hold ]
  nightly_build:
    when:
      and:
@@ -594,8 +513,88 @@ workflows:
              - macosx_deployment_target: "15.0"
                xcode_version: "15.0.0"
                python_version: "3.13"
+  weekly_build:
+    when:
+      and:
+        - equal: [ main, << pipeline.git.branch >> ]
+        - << pipeline.parameters.weekly_build >>
+    jobs:
+      - build_release:
+          matrix:
+            parameters:
+              python_version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
+              macosx_deployment_target: ["13.5", "14.0", "15.0"]
+              build_env: ["DEV_RELEASE=1"]
+              xcode_version: ["16.2.0", "15.0.0"]
+            exclude:
+              - macosx_deployment_target: "13.5"
+                xcode_version: "16.2.0"
+                python_version: "3.9"
+                build_env: "DEV_RELEASE=1"
+              - macosx_deployment_target: "13.5"
+                xcode_version: "16.2.0"
+                python_version: "3.10"
+                build_env: "DEV_RELEASE=1"
+              - macosx_deployment_target: "13.5"
+                xcode_version: "16.2.0"
+                python_version: "3.11"
+                build_env: "DEV_RELEASE=1"
+              - macosx_deployment_target: "13.5"
+                xcode_version: "16.2.0"
+                python_version: "3.12"
+                build_env: "DEV_RELEASE=1"
+              - macosx_deployment_target: "13.5"
+                xcode_version: "16.2.0"
+                python_version: "3.13"
+                build_env: "DEV_RELEASE=1"
+              - macosx_deployment_target: "14.0"
+                xcode_version: "15.0.0"
+                python_version: "3.9"
+                build_env: "DEV_RELEASE=1"
+              - macosx_deployment_target: "14.0"
+                xcode_version: "15.0.0"
+                python_version: "3.10"
+                build_env: "DEV_RELEASE=1"
+              - macosx_deployment_target: "14.0"
+                xcode_version: "15.0.0"
+                python_version: "3.11"
+                build_env: "DEV_RELEASE=1"
+              - macosx_deployment_target: "14.0"
+                xcode_version: "15.0.0"
+                python_version: "3.12"
+                build_env: "DEV_RELEASE=1"
+              - macosx_deployment_target: "14.0"
+                xcode_version: "15.0.0"
+                python_version: "3.13"
+                build_env: "DEV_RELEASE=1"
+              - macosx_deployment_target: "15.0"
+                xcode_version: "15.0.0"
+                python_version: "3.9"
+                build_env: "DEV_RELEASE=1"
+              - macosx_deployment_target: "15.0"
+                xcode_version: "15.0.0"
+                python_version: "3.10"
+                build_env: "DEV_RELEASE=1"
+              - macosx_deployment_target: "15.0"
+                xcode_version: "15.0.0"
+                python_version: "3.11"
+                build_env: "DEV_RELEASE=1"
+              - macosx_deployment_target: "15.0"
+                xcode_version: "15.0.0"
+                python_version: "3.12"
+                build_env: "DEV_RELEASE=1"
+              - macosx_deployment_target: "15.0"
+                xcode_version: "15.0.0"
+                python_version: "3.13"
+                build_env: "DEV_RELEASE=1"
+  linux_test_release:
+    when:
+      and:
+        - equal: [ main, << pipeline.git.branch >> ]
+        - << pipeline.parameters.linux_release >>
+    jobs:
      - build_linux_release:
          matrix:
            parameters:
              python_version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
-      - build_cuda_release
+              extra_env: ["PYPI_RELEASE=1"]
--- a/ACKNOWLEDGMENTS.md
+++ b/ACKNOWLEDGMENTS.md
@@ -19,7 +19,6 @@ MLX was developed with contributions from the following individuals:
 - Gleb Pobudzey: Added the `where` primitive, and groups in 1D and 2D convolutions.
 - Paul Paczuski: Improved stability of BCE loss calculation
 - Max-Heinrich Laves: Added `conv_transpose1d`, `conv_transpose2d`, and `conv_transpose3d` ops.
-- Gökdeniz Gülmez: Added the `Muon (MomentUm Orthogonalized by Newton-schulz)` optimizer.

 <a href="https://github.com/ml-explore/mlx/graphs/contributors">
  <img class="dark-light" src="https://contrib.rocks/image?repo=ml-explore/mlx&anon=0&columns=20&max=100&r=true" />
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -64,8 +64,10 @@ if(${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
      message(WARNING "Building for x86_64 arch is not officially supported.")
    endif()
  endif()
+
 else()
  set(MLX_BUILD_METAL OFF)
+  message(WARNING "MLX is prioritised for Apple silicon systems using macOS.")
 endif()

 # ----------------------------- Lib -----------------------------
@@ -229,9 +231,6 @@ target_include_directories(
  mlx PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_LIST_DIR}>
             $<INSTALL_INTERFACE:include>)

-# Do not add mlx_EXPORTS define for shared library.
-set_target_properties(mlx PROPERTIES DEFINE_SYMBOL "")
-
 FetchContent_Declare(
  fmt
  GIT_REPOSITORY https://github.com/fmtlib/fmt.git
--- a/benchmarks/cpp/irregular_strides.cpp
+++ b/benchmarks/cpp/irregular_strides.cpp
@@ -1,6 +1,5 @@
 // Copyright © 2023 Apple Inc.

-#include <cstring>
 #include <iostream>
 #include <sstream>

--- a/benchmarks/cpp/single_ops.cpp
+++ b/benchmarks/cpp/single_ops.cpp
@@ -192,22 +192,6 @@ void time_reductions() {

  auto argmin_along_1 = [&a]() { return mx::argmin(a, 1, false); };
  TIME(argmin_along_1);
-
-  auto indices = mx::array({1});
-  auto updates = mx::reshape(mx::array({NAN}), {1, 1, 1});
-  std::vector<int> axes{0};
-  auto b = scatter(a, {indices}, updates, axes);
-  mx::eval(b);
-
-  auto max_along_0 = [&b]() { return mx::max(b, 0, false); };
-  TIME(max_along_0);
-  auto max_along_1 = [&b]() { return mx::max(b, 1, false); };
-  TIME(max_along_1);
-
-  auto min_along_0 = [&b]() { return mx::min(b, 0, false); };
-  TIME(min_along_0);
-  auto min_along_1 = [&b]() { return mx::min(b, 1, false); };
-  TIME(min_along_1);
 }

 void time_gather_scatter() {
--- a/benchmarks/python/comparative/bench_torch.py
+++ b/benchmarks/python/comparative/bench_torch.py
@@ -5,7 +5,6 @@ import os
 import time

 import torch
-import torch.cuda
 import torch.mps


@@ -45,10 +44,8 @@ def bench(f, *args):


 def sync_if_needed(x):
-    if x.device == torch.device("mps"):
+    if x.device != torch.device("cpu"):
        torch.mps.synchronize()
-    elif x.device == torch.device("cuda"):
-        torch.cuda.synchronize()


 @torch.no_grad()
@@ -102,14 +99,6 @@ def reduction(op, axis, x):
    sync_if_needed(x)


-@torch.no_grad()
-def sum_and_add(axis, x, y):
-    z = x.sum(axis=axis, keepdims=True)
-    for i in range(50):
-        z = (z + y).sum(axis=axis, keepdims=True)
-    sync_if_needed(x)
-
-
 @torch.no_grad()
 def softmax(axis, x):
    ys = []
@@ -351,11 +340,7 @@ if __name__ == "__main__":
        args.axis.pop(0)

    torch.set_num_threads(1)
-    device = "mps"
-    if torch.cuda.is_available():
-        device = "cuda"
-    if args.cpu:
-        device = "cpu"
+    device = "cpu" if args.cpu else "mps"

    types = args.dtype
    if not types:
@@ -475,8 +460,5 @@ if __name__ == "__main__":
    elif args.benchmark == "selu":
        print(bench(selu, x))

-    elif args.benchmark == "sum_and_add":
-        print(bench(sum_and_add, axis, *xs))
-
    else:
        raise ValueError(f"Unknown benchmark `{args.benchmark}`.")
--- a/benchmarks/python/conv_unaligned_bench.py
+++ b/benchmarks/python/conv_unaligned_bench.py
@@ -1,107 +0,0 @@
-import math
-import time
-
-import mlx.core as mx
-import numpy as np
-import torch
-
-N_warmup = 10
-N_iter_bench = 100
-N_iter_func = 5
-
-
-def bench(f, a, b):
-    for i in range(N_warmup):
-        f(a, b)
-    torch.mps.synchronize()
-
-    s = time.perf_counter_ns()
-    for i in range(N_iter_bench):
-        f(a, b)
-    e = time.perf_counter_ns()
-    return (e - s) * 1e-9
-
-
-def make_mx_conv_2D(strides=(1, 1), padding=(0, 0), groups=1):
-    def mx_conv_2D(a, b):
-        ys = []
-        for i in range(N_iter_func):
-            y = mx.conv2d(a, b, stride=strides, padding=padding, groups=groups)
-            ys.append(y)
-        mx.eval(ys)
-        return ys
-
-    return mx_conv_2D
-
-
-def make_pt_conv_2D(strides=(1, 1), padding=(0, 0), groups=1):
-    @torch.no_grad()
-    def pt_conv_2D(a, b):
-        ys = []
-        for i in range(N_iter_func):
-            y = torch.conv2d(a, b, stride=strides, padding=padding, groups=groups)
-            ys.append(y)
-        torch.mps.synchronize()
-        return ys
-
-    return pt_conv_2D
-
-
-def bench_shape(N, H, W, C, kH, kW, O, strides, padding, groups, np_dtype):
-    scale = 1.0 / math.sqrt(kH * kH * C)
-    a_np = np.random.uniform(0, 0.5, (N, H, W, C)).astype(np_dtype)
-    b_np = np.random.uniform(-scale, scale, (O, kH, kW, int(C / groups))).astype(
-        np_dtype
-    )
-
-    a_mx = mx.array(a_np)
-    b_mx = mx.array(b_np)
-
-    a_pt = torch.from_numpy(a_np.transpose((0, 3, 1, 2))).to("mps")
-    b_pt = torch.from_numpy(b_np.transpose((0, 3, 1, 2))).to("mps")
-
-    torch.mps.synchronize()
-
-    f_mx = make_mx_conv_2D(strides, padding, groups)
-    f_pt = make_pt_conv_2D(strides, padding, groups)
-
-    time_torch = bench(f_pt, a_pt, b_pt)
-    time_mlx = bench(f_mx, a_mx, b_mx)
-
-    out_mx = mx.conv2d(a_mx, b_mx, stride=strides, padding=padding, groups=groups)
-    out_pt = torch.conv2d(
-        a_pt.to("cpu"), b_pt.to("cpu"), stride=strides, padding=padding, groups=groups
-    )
-    out_pt = torch.permute(out_pt, (0, 2, 3, 1))
-    out_pt = out_pt.numpy(force=True)
-
-    atol = 2e-5 if np_dtype == np.float32 else 1e-4
-
-    if not np.allclose(out_pt, out_mx, atol=atol):
-        print(
-            f"Failed at {(N, H, W, C)}, {(O, kH, kW, C)} [strides = {strides}, padding = {padding}, groups = {groups}] with max(|a - b|) = {np.max(np.abs(out_pt - out_mx))}"
-        )
-
-    return time_mlx, time_torch
-
-
-if __name__ == "__main__":
-    dtype = "float32"
-    shapes = (
-        (4, 32, 32, 21, 3, 3, 128),
-        (4, 32, 32, 21, 3, 3, 37),
-        (4, 32, 32, 370, 3, 3, 370),
-        (4, 32, 32, 370, 7, 7, 128),
-        (2, 320, 640, 21, 7, 7, 21),
-    )
-    for N, H, W, C, kh, kw, O in shapes:
-        time_mlx, time_torch = bench_shape(
-            N, H, W, C, kh, kw, O, (1, 1), (0, 0), 1, dtype
-        )
-        diff = time_torch / time_mlx - 1.0
-
-        print(
-            f"({N}, {H:3d}, {W:3d}, {C:3d}), ({O:3d}, {kh:2d}, {kw:2d}, {C:3d}), {dtype}, {100. * diff:+5.2f}%"
-        )
-        if time_mlx >= 2.0 * time_torch:
-            print("ATTENTION ^^^^^^^")
--- a/benchmarks/python/layer_norm_bench.py
+++ b/benchmarks/python/layer_norm_bench.py
@@ -1,7 +1,5 @@
 # Copyright © 2023-2024 Apple Inc.

-from functools import partial
-
 import mlx.core as mx
 import mlx.nn as nn
 from time_utils import time_fn
@@ -20,63 +18,51 @@ def layer_norm(x, w, b, eps):
    return y


-def time_layer_norm(N, dt):
-    L = 1024
+def time_layer_norm():
    f1 = lambda x, w, b, y: (layer_norm(x, w, b, 1e-5) * y).sum()
    f2 = lambda x, w, b, y: (mx.fast.layer_norm(x, w, b, 1e-5) * y).sum()
    g1 = mx.grad(f1, argnums=(0, 1, 2))
    g2 = mx.grad(f2, argnums=(0, 1, 2))

-    x = mx.random.uniform(shape=(8, L, N)).astype(dt)
-    w = mx.random.uniform(shape=(N,)).astype(dt)
-    b = mx.random.uniform(shape=(N,)).astype(dt)
-    y = mx.random.uniform(shape=(8, L, N)).astype(dt)
+    x = mx.random.uniform(shape=(8, 1024, 4096)).astype(mx.float16)
+    w = mx.random.uniform(shape=(4096,)).astype(mx.float16)
+    b = mx.random.uniform(shape=(4096,)).astype(mx.float16)
+    y = mx.random.uniform(shape=(8, 1024, 4096)).astype(mx.float16)
    mx.eval(x, w, b, y)

-    def layer_norm_loop(f, x, w, b):
-        for _ in range(32):
-            x = f(x, w, b)
-        return x
-
-    time_fn(layer_norm_loop, partial(layer_norm, eps=1e-5), x, w, b)
-    time_fn(layer_norm_loop, partial(mx.fast.layer_norm, eps=1e-5), x, w, b)
-
-    def layer_norm_grad_loop(g, x, w, b):
+    def layer_norm_loop(g, x, w, b):
        gx, gw, gb = x, w, b
        for _ in range(32):
            gx, gw, gb = g(gx, gw, gb, y)
        return gx, gw, gb

-    time_fn(layer_norm_grad_loop, g1, x, w, b)
-    time_fn(layer_norm_grad_loop, g2, x, w, b)
-    time_fn(layer_norm_grad_loop, mx.compile(g1), x, w, b)
-    time_fn(layer_norm_grad_loop, mx.compile(g2), x, w, b)
+    time_fn(layer_norm_loop, g1, x, w, b)
+    time_fn(layer_norm_loop, g2, x, w, b)
+    time_fn(layer_norm_loop, mx.compile(g1), x, w, b)
+    time_fn(layer_norm_loop, mx.compile(g2), x, w, b)

    f1 = lambda x, y: (layer_norm(x, None, None, 1e-5) * y).sum()
    f2 = lambda x, y: (mx.fast.layer_norm(x, None, None, 1e-5) * y).sum()
    g1 = mx.grad(f1, argnums=(0,))
    g2 = mx.grad(f2, argnums=(0,))

-    x = mx.random.uniform(shape=(8, L, N)).astype(dt)
-    w = mx.random.uniform(shape=(N,)).astype(dt)
-    b = mx.random.uniform(shape=(N,)).astype(dt)
-    y = mx.random.uniform(shape=(8, L, N)).astype(dt)
+    x = mx.random.uniform(shape=(8, 1024, 4096)).astype(mx.float16)
+    w = mx.random.uniform(shape=(4096,)).astype(mx.float16)
+    b = mx.random.uniform(shape=(4096,)).astype(mx.float16)
+    y = mx.random.uniform(shape=(8, 1024, 4096)).astype(mx.float16)
    mx.eval(x, w, b, y)

-    def layer_norm_grad_x_loop(g, x):
+    def layer_norm_loop(g, x):
        gx = x
        for _ in range(32):
            gx = g(gx, y)
        return gx

-    time_fn(layer_norm_grad_x_loop, g1, x)
-    time_fn(layer_norm_grad_x_loop, g2, x)
-    time_fn(layer_norm_grad_x_loop, mx.compile(g1), x)
-    time_fn(layer_norm_grad_x_loop, mx.compile(g2), x)
+    time_fn(layer_norm_loop, g1, x)
+    time_fn(layer_norm_loop, g2, x)
+    time_fn(layer_norm_loop, mx.compile(g1), x)
+    time_fn(layer_norm_loop, mx.compile(g2), x)


 if __name__ == "__main__":
-    for dt in [mx.float32, mx.float16, mx.bfloat16]:
-        for n in [1024, 2048, 4096, 8192, 8192 + 1024]:
-            print(dt, n)
-            time_layer_norm(n, dt)
+    time_layer_norm()
--- a/benchmarks/python/single_ops.py
+++ b/benchmarks/python/single_ops.py
@@ -51,20 +51,6 @@ def time_maximum():
    time_fn(mx.maximum, a, b)


-def time_max():
-    a = mx.random.uniform(shape=(32, 1024, 1024))
-    a[1, 1] = mx.nan
-    mx.eval(a)
-    time_fn(mx.max, a, 0)
-
-
-def time_min():
-    a = mx.random.uniform(shape=(32, 1024, 1024))
-    a[1, 1] = mx.nan
-    mx.eval(a)
-    time_fn(mx.min, a, 0)
-
-
 def time_negative():
    a = mx.random.uniform(shape=(10000, 1000))
    mx.eval(a)
@@ -122,8 +108,6 @@ if __name__ == "__main__":

    time_add()
    time_matmul()
-    time_min()
-    time_max()
    time_maximum()
    time_exp()
    time_negative()
--- a/cmake/extension.cmake
+++ b/cmake/extension.cmake
@@ -11,14 +11,13 @@ include(CMakeParseArguments)
 # Args: TARGET: Custom target to be added for the metal library TITLE: Name of
 # the .metallib OUTPUT_DIRECTORY: Where to place ${TITLE}.metallib SOURCES: List
 # of source files INCLUDE_DIRS: List of include dirs DEPS: List of dependency
-# files (like headers) DEBUG: Boolean, if true, enables debug compile options
-# for this specific library. If not provided, uses global MLX_METAL_DEBUG.
+# files (like headers)
 #
 # clang format on

 macro(mlx_build_metallib)
  # Parse args
-  set(oneValueArgs TARGET TITLE OUTPUT_DIRECTORY DEBUG)
+  set(oneValueArgs TARGET TITLE OUTPUT_DIRECTORY)
  set(multiValueArgs SOURCES INCLUDE_DIRS DEPS)
  cmake_parse_arguments(MTLLIB "" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})

@@ -27,10 +26,6 @@ macro(mlx_build_metallib)

  # Collect compile options
  set(MTLLIB_COMPILE_OPTIONS -Wall -Wextra -fno-fast-math -Wno-c++17-extensions)
-  if(MLX_METAL_DEBUG OR MTLLIB_DEBUG)
-    set(MTLLIB_COMPILE_OPTIONS ${MTLLIB_COMPILE_OPTIONS} -gline-tables-only
-                               -frecord-sources)
-  endif()

  # Prepare metallib build command
  add_custom_command(
--- a/docs/src/conf.py
+++ b/docs/src/conf.py
@@ -10,7 +10,7 @@ import mlx.core as mx
 # -- Project information -----------------------------------------------------

 project = "MLX"
-copyright = "2023, Apple"
+copyright = "2023, MLX Contributors"
 author = "MLX Contributors"
 version = ".".join(mx.__version__.split(".")[:3])
 release = version
--- a/docs/src/dev/custom_metal_kernels.rst
+++ b/docs/src/dev/custom_metal_kernels.rst
@@ -8,26 +8,23 @@ MLX supports writing custom Metal kernels through the Python and C++ APIs.
 Simple Example
 --------------

-.. currentmodule:: mlx.core
-
 Let's write a custom kernel that computes ``exp`` elementwise:

 .. code-block:: python

-  source = """
-      uint elem = thread_position_in_grid.x;
-      T tmp = inp[elem];
-      out[elem] = metal::exp(tmp);
-  """
-
-  kernel = mx.fast.metal_kernel(
-      name="myexp",
-      input_names=["inp"],
-      output_names=["out"],
-      source=source,
-  )
-
  def exp_elementwise(a: mx.array):
+      source = """
+          uint elem = thread_position_in_grid.x;
+          T tmp = inp[elem];
+          out[elem] = metal::exp(tmp);
+      """
+
+      kernel = mx.fast.metal_kernel(
+          name="myexp",
+          input_names=["inp"],
+          output_names=["out"],
+          source=source,
+      )
      outputs = kernel(
          inputs=[a],
          template=[("T", mx.float32)],
@@ -42,13 +39,8 @@ Let's write a custom kernel that computes ``exp`` elementwise:
  b = exp_elementwise(a)
  assert mx.allclose(b, mx.exp(a))

-Every time you make a kernel, a new Metal library is created and possibly
-JIT compiled. To reduce the overhead from that, build the kernel once with
-:func:`fast.metal_kernel` and then use it many times.
-
 .. note::
-   Only pass the body of the Metal kernel in ``source``. The function
-   signature is generated automatically.
+    We are only required to pass the body of the Metal kernel in ``source``.

 The full function signature will be generated using:

@@ -86,51 +78,44 @@ Putting this all together, the generated function signature for ``myexp`` is as

  template [[host_name("custom_kernel_myexp_float")]] [[kernel]] decltype(custom_kernel_myexp_float<float>) custom_kernel_myexp_float<float>;

-Note: ``grid`` and ``threadgroup`` are parameters to the Metal `dispatchThreads
-<https://developer.apple.com/documentation/metal/mtlcomputecommandencoder/2866532-dispatchthreads>`_
-function. This means we will launch ``mx.prod(grid)`` threads, subdivided into
-``threadgroup`` size threadgroups.  For optimal performance, each thread group
-dimension should be less than or equal to the corresponding grid dimension.
+Note: ``grid`` and ``threadgroup`` are parameters to the Metal `dispatchThreads <https://developer.apple.com/documentation/metal/mtlcomputecommandencoder/2866532-dispatchthreads>`_ function.
+This means we will launch ``mx.prod(grid)`` threads, subdivided into ``threadgroup`` size threadgroups.
+For optimal performance, each thread group dimension should be less than or equal to the corresponding grid dimension.

-Passing ``verbose=True`` to :func:`ast.metal_kernel.__call__` will print the
-generated code for debugging purposes.
+Passing ``verbose=True`` to ``mx.fast.metal_kernel.__call__`` will print the generated code for debugging purposes.

 Using Shape/Strides
 -------------------

-:func:`fast.metal_kernel` supports an argument ``ensure_row_contiguous`` which
-is ``True`` by default. This will copy the array inputs if needed
-before the kernel is launched to ensure that the memory layout is row
-contiguous.  Generally this makes writing the kernel easier, since we don't
-have to worry about gaps or the ordering of the dims when indexing.
+``mx.fast.metal_kernel`` supports an argument ``ensure_row_contiguous`` which is ``True`` by default.
+This will copy the ``mx.array`` inputs if needed before the kernel is launched to ensure that the memory layout is row contiguous.
+Generally this makes writing the kernel easier, since we don't have to worry about gaps or the ordering of the dims
+when indexing.

-If we want to avoid this copy, :func:`fast.metal_kernel` automatically passes
-``a_shape``, ``a_strides`` and ``a_ndim`` for each input array ``a`` if any are
-present in ``source``. We can then use MLX's built in indexing utils to fetch
-the right elements for each thread.
+If we want to avoid this copy, ``mx.fast.metal_kernel`` automatically passes ``a_shape``, ``a_strides`` and ``a_ndim`` for each
+input array ``a`` if any are present in ``source``.
+We can then use MLX's built-in indexing utils to fetch the right elements for each thread.

-Let's convert ``myexp`` above to support arbitrarily strided arrays without
-relying on a copy from ``ensure_row_contiguous``:
+Let's convert ``myexp`` above to support arbitrarily strided arrays without relying on a copy from ``ensure_row_contiguous``:

 .. code-block:: python
-   
-  source = """
-      uint elem = thread_position_in_grid.x;
-      // Utils from `mlx/backend/metal/kernels/utils.h` are automatically included
-      uint loc = elem_to_loc(elem, inp_shape, inp_strides, inp_ndim);
-      T tmp = inp[loc];
-      // Output arrays are always row contiguous
-      out[elem] = metal::exp(tmp);
-  """
-
-  kernel = mx.fast.metal_kernel(
-      name="myexp_strided",
-      input_names=["inp"],
-      output_names=["out"],
-      source=source
-  )

  def exp_elementwise(a: mx.array):
+      source = """
+          uint elem = thread_position_in_grid.x;
+          // Utils from `mlx/backend/metal/kernels/utils.h` are automatically included
+          uint loc = elem_to_loc(elem, inp_shape, inp_strides, inp_ndim);
+          T tmp = inp[loc];
+          // Output arrays are always row contiguous
+          out[elem] = metal::exp(tmp);
+      """
+
+      kernel = mx.fast.metal_kernel(
+          name="myexp_strided",
+          input_names=["inp"],
+          output_names=["out"],
+          source=source
+      )
      outputs = kernel(
          inputs=[a],
          template=[("T", mx.float32)],
@@ -157,139 +142,137 @@ We'll start with the following MLX implementation using standard ops:

 .. code-block:: python

-  def grid_sample_ref(x, grid):
-      N, H_in, W_in, _ = x.shape
-      ix = ((grid[..., 0] + 1) * W_in - 1) / 2
-      iy = ((grid[..., 1] + 1) * H_in - 1) / 2
+    def grid_sample_ref(x, grid):
+        N, H_in, W_in, _ = x.shape
+        ix = ((grid[..., 0] + 1) * W_in - 1) / 2
+        iy = ((grid[..., 1] + 1) * H_in - 1) / 2

-      ix_nw = mx.floor(ix).astype(mx.int32)
-      iy_nw = mx.floor(iy).astype(mx.int32)
+        ix_nw = mx.floor(ix).astype(mx.int32)
+        iy_nw = mx.floor(iy).astype(mx.int32)

-      ix_ne = ix_nw + 1
-      iy_ne = iy_nw
+        ix_ne = ix_nw + 1
+        iy_ne = iy_nw

-      ix_sw = ix_nw
-      iy_sw = iy_nw + 1
+        ix_sw = ix_nw
+        iy_sw = iy_nw + 1

-      ix_se = ix_nw + 1
-      iy_se = iy_nw + 1
+        ix_se = ix_nw + 1
+        iy_se = iy_nw + 1

-      nw = (ix_se - ix)    * (iy_se - iy)
-      ne = (ix    - ix_sw) * (iy_sw - iy)
-      sw = (ix_ne - ix)    * (iy    - iy_ne)
-      se = (ix    - ix_nw) * (iy    - iy_nw)
+        nw = (ix_se - ix)    * (iy_se - iy)
+        ne = (ix    - ix_sw) * (iy_sw - iy)
+        sw = (ix_ne - ix)    * (iy    - iy_ne)
+        se = (ix    - ix_nw) * (iy    - iy_nw)

-      I_nw = x[mx.arange(N)[:, None, None], iy_nw, ix_nw, :]
-      I_ne = x[mx.arange(N)[:, None, None], iy_ne, ix_ne, :]
-      I_sw = x[mx.arange(N)[:, None, None], iy_sw, ix_sw, :]
-      I_se = x[mx.arange(N)[:, None, None], iy_se, ix_se, :]
+        I_nw = x[mx.arange(N)[:, None, None], iy_nw, ix_nw, :]
+        I_ne = x[mx.arange(N)[:, None, None], iy_ne, ix_ne, :]
+        I_sw = x[mx.arange(N)[:, None, None], iy_sw, ix_sw, :]
+        I_se = x[mx.arange(N)[:, None, None], iy_se, ix_se, :]

-      mask_nw = (iy_nw >= 0) & (iy_nw <= H_in - 1) & (ix_nw >= 0) & (ix_nw <= W_in - 1)
-      mask_ne = (iy_ne >= 0) & (iy_ne <= H_in - 1) & (ix_ne >= 0) & (ix_ne <= W_in - 1)
-      mask_sw = (iy_sw >= 0) & (iy_sw <= H_in - 1) & (ix_sw >= 0) & (ix_sw <= W_in - 1)
-      mask_se = (iy_se >= 0) & (iy_se <= H_in - 1) & (ix_se >= 0) & (ix_se <= W_in - 1)
+        mask_nw = (iy_nw >= 0) & (iy_nw <= H_in - 1) & (ix_nw >= 0) & (ix_nw <= W_in - 1)
+        mask_ne = (iy_ne >= 0) & (iy_ne <= H_in - 1) & (ix_ne >= 0) & (ix_ne <= W_in - 1)
+        mask_sw = (iy_sw >= 0) & (iy_sw <= H_in - 1) & (ix_sw >= 0) & (ix_sw <= W_in - 1)
+        mask_se = (iy_se >= 0) & (iy_se <= H_in - 1) & (ix_se >= 0) & (ix_se <= W_in - 1)

-      I_nw *= mask_nw[..., None]
-      I_ne *= mask_ne[..., None]
-      I_sw *= mask_sw[..., None]
-      I_se *= mask_se[..., None]
+        I_nw *= mask_nw[..., None]
+        I_ne *= mask_ne[..., None]
+        I_sw *= mask_sw[..., None]
+        I_se *= mask_se[..., None]

-      output = nw[..., None] * I_nw + ne[..., None] * I_ne + sw[..., None] * I_sw + se[..., None] * I_se
+        output = nw[..., None] * I_nw + ne[..., None] * I_ne + sw[..., None] * I_sw + se[..., None] * I_se

-      return output
+        return output

-Now let's use :func:`custom_function` together with :func:`fast.metal_kernel`
+Now let's use ``mx.custom_function`` together with ``mx.fast.metal_kernel``
 to write a fast GPU kernel for both the forward and backward passes.

 First we'll implement the forward pass as a fused kernel:

 .. code-block:: python

-  source = """
-      uint elem = thread_position_in_grid.x;
-      int H = x_shape[1];
-      int W = x_shape[2];
-      int C = x_shape[3];
-      int gH = grid_shape[1];
-      int gW = grid_shape[2];
+    @mx.custom_function
+    def grid_sample(x, grid):

-      int w_stride = C;
-      int h_stride = W * w_stride;
-      int b_stride = H * h_stride;
+        assert x.ndim == 4, "`x` must be 4D."
+        assert grid.ndim == 4, "`grid` must be 4D."

-      uint grid_idx = elem / C * 2;
-      float ix = ((grid[grid_idx] + 1) * W - 1) / 2;
-      float iy = ((grid[grid_idx + 1] + 1) * H - 1) / 2;
+        B, _, _, C = x.shape
+        _, gN, gM, D = grid.shape
+        out_shape = (B, gN, gM, C)

-      int ix_nw = floor(ix);
-      int iy_nw = floor(iy);
+        assert D == 2, "Last dim of `grid` must be size 2."

-      int ix_ne = ix_nw + 1;
-      int iy_ne = iy_nw;
+        source = """
+            uint elem = thread_position_in_grid.x;
+            int H = x_shape[1];
+            int W = x_shape[2];
+            int C = x_shape[3];
+            int gH = grid_shape[1];
+            int gW = grid_shape[2];

-      int ix_sw = ix_nw;
-      int iy_sw = iy_nw + 1;
+            int w_stride = C;
+            int h_stride = W * w_stride;
+            int b_stride = H * h_stride;

-      int ix_se = ix_nw + 1;
-      int iy_se = iy_nw + 1;
+            uint grid_idx = elem / C * 2;
+            float ix = ((grid[grid_idx] + 1) * W - 1) / 2;
+            float iy = ((grid[grid_idx + 1] + 1) * H - 1) / 2;

-      T nw = (ix_se - ix)    * (iy_se - iy);
-      T ne = (ix    - ix_sw) * (iy_sw - iy);
-      T sw = (ix_ne - ix)    * (iy    - iy_ne);
-      T se = (ix    - ix_nw) * (iy    - iy_nw);
+            int ix_nw = floor(ix);
+            int iy_nw = floor(iy);

-      int batch_idx = elem / C / gH / gW * b_stride;
-      int channel_idx = elem % C;
-      int base_idx = batch_idx + channel_idx;
+            int ix_ne = ix_nw + 1;
+            int iy_ne = iy_nw;

-      T I_nw = x[base_idx + iy_nw * h_stride + ix_nw * w_stride];
-      T I_ne = x[base_idx + iy_ne * h_stride + ix_ne * w_stride];
-      T I_sw = x[base_idx + iy_sw * h_stride + ix_sw * w_stride];
-      T I_se = x[base_idx + iy_se * h_stride + ix_se * w_stride];
+            int ix_sw = ix_nw;
+            int iy_sw = iy_nw + 1;

-      I_nw = iy_nw >= 0 && iy_nw <= H - 1 && ix_nw >= 0 && ix_nw <= W - 1 ? I_nw : 0;
-      I_ne = iy_ne >= 0 && iy_ne <= H - 1 && ix_ne >= 0 && ix_ne <= W - 1 ? I_ne : 0;
-      I_sw = iy_sw >= 0 && iy_sw <= H - 1 && ix_sw >= 0 && ix_sw <= W - 1 ? I_sw : 0;
-      I_se = iy_se >= 0 && iy_se <= H - 1 && ix_se >= 0 && ix_se <= W - 1 ? I_se : 0;
+            int ix_se = ix_nw + 1;
+            int iy_se = iy_nw + 1;

-      out[elem] = nw * I_nw + ne * I_ne + sw * I_sw + se * I_se;
-  """
+            T nw = (ix_se - ix)    * (iy_se - iy);
+            T ne = (ix    - ix_sw) * (iy_sw - iy);
+            T sw = (ix_ne - ix)    * (iy    - iy_ne);
+            T se = (ix    - ix_nw) * (iy    - iy_nw);

-  kernel = mx.fast.metal_kernel(
-      name="grid_sample",
-      input_names=["x", "grid"],
-      output_names=["out"],
-      source=source,
-  )
+            int batch_idx = elem / C / gH / gW * b_stride;
+            int channel_idx = elem % C;
+            int base_idx = batch_idx + channel_idx;

-  @mx.custom_function
-  def grid_sample(x, grid):
+            T I_nw = x[base_idx + iy_nw * h_stride + ix_nw * w_stride];
+            T I_ne = x[base_idx + iy_ne * h_stride + ix_ne * w_stride];
+            T I_sw = x[base_idx + iy_sw * h_stride + ix_sw * w_stride];
+            T I_se = x[base_idx + iy_se * h_stride + ix_se * w_stride];

-      assert x.ndim == 4, "`x` must be 4D."
-      assert grid.ndim == 4, "`grid` must be 4D."
+            I_nw = iy_nw >= 0 && iy_nw <= H - 1 && ix_nw >= 0 && ix_nw <= W - 1 ? I_nw : 0;
+            I_ne = iy_ne >= 0 && iy_ne <= H - 1 && ix_ne >= 0 && ix_ne <= W - 1 ? I_ne : 0;
+            I_sw = iy_sw >= 0 && iy_sw <= H - 1 && ix_sw >= 0 && ix_sw <= W - 1 ? I_sw : 0;
+            I_se = iy_se >= 0 && iy_se <= H - 1 && ix_se >= 0 && ix_se <= W - 1 ? I_se : 0;

-      B, _, _, C = x.shape
-      _, gN, gM, D = grid.shape
-      out_shape = (B, gN, gM, C)
-
-      assert D == 2, "Last dim of `grid` must be size 2."
-
-      outputs = kernel(
-          inputs=[x, grid],
-          template=[("T", x.dtype)],
-          output_shapes=[out_shape],
-          output_dtypes=[x.dtype],
-          grid=(np.prod(out_shape), 1, 1),
-          threadgroup=(256, 1, 1),
-      )
-      return outputs[0]
+            out[elem] = nw * I_nw + ne * I_ne + sw * I_sw + se * I_se;
+        """
+        kernel = mx.fast.metal_kernel(
+            name="grid_sample",
+            input_names=["x", "grid"],
+            output_names=["out"],
+            source=source,
+        )
+        outputs = kernel(
+            inputs=[x, grid],
+            template=[("T", x.dtype)],
+            output_shapes=[out_shape],
+            output_dtypes=[x.dtype],
+            grid=(np.prod(out_shape), 1, 1),
+            threadgroup=(256, 1, 1),
+        )
+        return outputs[0]

 For a reasonably sized input such as:

 .. code-block:: python

-  x.shape = (8, 1024, 1024, 64)
-  grid.shape = (8, 256, 256, 2)
+    x.shape = (8, 1024, 1024, 64)
+    grid.shape = (8, 256, 256, 2)

 On an M1 Max, we see a big performance improvement:

@@ -298,11 +281,11 @@ On an M1 Max, we see a big performance improvement:
 Grid Sample VJP
 ---------------

-Since we decorated ``grid_sample`` with :func:`custom_function`, we can now
-define its custom vjp transform so MLX can differentiate it.
+Since we decorated ``grid_sample`` with ``mx.custom_function``, we can now define
+its custom vjp transform so MLX can differentiate it.

 The backwards pass requires atomically updating ``x_grad``/``grid_grad`` and so
-requires a few extra :func:`fast.metal_kernel` features:
+requires a few extra ``mx.fast.metal_kernel`` features:

 * ``init_value=0``
    Initialize all of the kernel's outputs to this value before it runs. This allows us to update only part of the output arrays with the kernel.
@@ -316,129 +299,128 @@ We can then implement the backwards pass as follows:

 .. code-block:: python

-  source = """
-      uint elem = thread_position_in_grid.x;
-      int H = x_shape[1];
-      int W = x_shape[2];
-      int C = x_shape[3];
-      // Pad C to the nearest larger simdgroup size multiple
-      int C_padded = ceildiv(C, threads_per_simdgroup) * threads_per_simdgroup;
+    @grid_sample.vjp
+    def grid_sample_vjp(primals, cotangent, _):
+        x, grid = primals
+        B, _, _, C = x.shape
+        _, gN, gM, D = grid.shape

-      int gH = grid_shape[1];
-      int gW = grid_shape[2];
+        assert D == 2, "Last dim of `grid` must be size 2."

-      int w_stride = C;
-      int h_stride = W * w_stride;
-      int b_stride = H * h_stride;
+        source = """
+            uint elem = thread_position_in_grid.x;
+            int H = x_shape[1];
+            int W = x_shape[2];
+            int C = x_shape[3];
+            // Pad C to the nearest larger simdgroup size multiple
+            int C_padded = ceildiv(C, threads_per_simdgroup) * threads_per_simdgroup;

-      uint grid_idx = elem / C_padded * 2;
-      float ix = ((grid[grid_idx] + 1) * W - 1) / 2;
-      float iy = ((grid[grid_idx + 1] + 1) * H - 1) / 2;
+            int gH = grid_shape[1];
+            int gW = grid_shape[2];

-      int ix_nw = floor(ix);
-      int iy_nw = floor(iy);
+            int w_stride = C;
+            int h_stride = W * w_stride;
+            int b_stride = H * h_stride;

-      int ix_ne = ix_nw + 1;
-      int iy_ne = iy_nw;
+            uint grid_idx = elem / C_padded * 2;
+            float ix = ((grid[grid_idx] + 1) * W - 1) / 2;
+            float iy = ((grid[grid_idx + 1] + 1) * H - 1) / 2;

-      int ix_sw = ix_nw;
-      int iy_sw = iy_nw + 1;
+            int ix_nw = floor(ix);
+            int iy_nw = floor(iy);

-      int ix_se = ix_nw + 1;
-      int iy_se = iy_nw + 1;
+            int ix_ne = ix_nw + 1;
+            int iy_ne = iy_nw;

-      T nw = (ix_se - ix)    * (iy_se - iy);
-      T ne = (ix    - ix_sw) * (iy_sw - iy);
-      T sw = (ix_ne - ix)    * (iy    - iy_ne);
-      T se = (ix    - ix_nw) * (iy    - iy_nw);
+            int ix_sw = ix_nw;
+            int iy_sw = iy_nw + 1;

-      int batch_idx = elem / C_padded / gH / gW * b_stride;
-      int channel_idx = elem % C_padded;
-      int base_idx = batch_idx + channel_idx;
+            int ix_se = ix_nw + 1;
+            int iy_se = iy_nw + 1;

-      T gix = T(0);
-      T giy = T(0);
-      if (channel_idx < C) {
-          int cot_index = elem / C_padded * C + channel_idx;
-          T cot = cotangent[cot_index];
-          if (iy_nw >= 0 && iy_nw <= H - 1 && ix_nw >= 0 && ix_nw <= W - 1) {
-              int offset = base_idx + iy_nw * h_stride + ix_nw * w_stride;
-              atomic_fetch_add_explicit(&x_grad[offset], nw * cot, memory_order_relaxed);
+            T nw = (ix_se - ix)    * (iy_se - iy);
+            T ne = (ix    - ix_sw) * (iy_sw - iy);
+            T sw = (ix_ne - ix)    * (iy    - iy_ne);
+            T se = (ix    - ix_nw) * (iy    - iy_nw);

-              T I_nw = x[offset];
-              gix -= I_nw * (iy_se - iy) * cot;
-              giy -= I_nw * (ix_se - ix) * cot;
-          }
-          if (iy_ne >= 0 && iy_ne <= H - 1 && ix_ne >= 0 && ix_ne <= W - 1) {
-              int offset = base_idx + iy_ne * h_stride + ix_ne * w_stride;
-              atomic_fetch_add_explicit(&x_grad[offset], ne * cot, memory_order_relaxed);
+            int batch_idx = elem / C_padded / gH / gW * b_stride;
+            int channel_idx = elem % C_padded;
+            int base_idx = batch_idx + channel_idx;

-              T I_ne = x[offset];
-              gix += I_ne * (iy_sw - iy) * cot;
-              giy -= I_ne * (ix - ix_sw) * cot;
-          }
-          if (iy_sw >= 0 && iy_sw <= H - 1 && ix_sw >= 0 && ix_sw <= W - 1) {
-              int offset = base_idx + iy_sw * h_stride + ix_sw * w_stride;
-              atomic_fetch_add_explicit(&x_grad[offset], sw * cot, memory_order_relaxed);
+            T gix = T(0);
+            T giy = T(0);
+            if (channel_idx < C) {
+                int cot_index = elem / C_padded * C + channel_idx;
+                T cot = cotangent[cot_index];
+                if (iy_nw >= 0 && iy_nw <= H - 1 && ix_nw >= 0 && ix_nw <= W - 1) {
+                    int offset = base_idx + iy_nw * h_stride + ix_nw * w_stride;
+                    atomic_fetch_add_explicit(&x_grad[offset], nw * cot, memory_order_relaxed);

-              T I_sw = x[offset];
-              gix -= I_sw * (iy - iy_ne) * cot;
-              giy += I_sw * (ix_ne - ix) * cot;
-          }
-          if (iy_se >= 0 && iy_se <= H - 1 && ix_se >= 0 && ix_se <= W - 1) {
-              int offset = base_idx + iy_se * h_stride + ix_se * w_stride;
-              atomic_fetch_add_explicit(&x_grad[offset], se * cot, memory_order_relaxed);
+                    T I_nw = x[offset];
+                    gix -= I_nw * (iy_se - iy) * cot;
+                    giy -= I_nw * (ix_se - ix) * cot;
+                }
+                if (iy_ne >= 0 && iy_ne <= H - 1 && ix_ne >= 0 && ix_ne <= W - 1) {
+                    int offset = base_idx + iy_ne * h_stride + ix_ne * w_stride;
+                    atomic_fetch_add_explicit(&x_grad[offset], ne * cot, memory_order_relaxed);

-              T I_se = x[offset];
-              gix += I_se * (iy - iy_nw) * cot;
-              giy += I_se * (ix - ix_nw) * cot;
-          }
-      }
+                    T I_ne = x[offset];
+                    gix += I_ne * (iy_sw - iy) * cot;
+                    giy -= I_ne * (ix - ix_sw) * cot;
+                }
+                if (iy_sw >= 0 && iy_sw <= H - 1 && ix_sw >= 0 && ix_sw <= W - 1) {
+                    int offset = base_idx + iy_sw * h_stride + ix_sw * w_stride;
+                    atomic_fetch_add_explicit(&x_grad[offset], sw * cot, memory_order_relaxed);

-      T gix_mult = W / 2;
-      T giy_mult = H / 2;
+                    T I_sw = x[offset];
+                    gix -= I_sw * (iy - iy_ne) * cot;
+                    giy += I_sw * (ix_ne - ix) * cot;
+                }
+                if (iy_se >= 0 && iy_se <= H - 1 && ix_se >= 0 && ix_se <= W - 1) {
+                    int offset = base_idx + iy_se * h_stride + ix_se * w_stride;
+                    atomic_fetch_add_explicit(&x_grad[offset], se * cot, memory_order_relaxed);

-      // Reduce across each simdgroup first.
-      // This is much faster than relying purely on atomics.
-      gix = simd_sum(gix);
-      giy = simd_sum(giy);
+                    T I_se = x[offset];
+                    gix += I_se * (iy - iy_nw) * cot;
+                    giy += I_se * (ix - ix_nw) * cot;
+                }
+            }

-      if (thread_index_in_simdgroup == 0) {
-          atomic_fetch_add_explicit(&grid_grad[grid_idx], gix * gix_mult, memory_order_relaxed);
-          atomic_fetch_add_explicit(&grid_grad[grid_idx + 1], giy * giy_mult, memory_order_relaxed);
-      }
-  """
-  kernel = mx.fast.metal_kernel(
-      name="grid_sample_grad",
-      input_names=["x", "grid", "cotangent"],
-      output_names=["x_grad", "grid_grad"],
-      source=source,
-      atomic_outputs=True,
-  )
+            T gix_mult = W / 2;
+            T giy_mult = H / 2;

-  @grid_sample.vjp
-  def grid_sample_vjp(primals, cotangent, _):
-      x, grid = primals
-      B, _, _, C = x.shape
-      _, gN, gM, D = grid.shape
+            // Reduce across each simdgroup first.
+            // This is much faster than relying purely on atomics.
+            gix = simd_sum(gix);
+            giy = simd_sum(giy);

-      assert D == 2, "Last dim of `grid` must be size 2."
-
-      # pad the output channels to simd group size
-      # so that our `simd_sum`s don't overlap.
-      simdgroup_size = 32
-      C_padded = (C + simdgroup_size - 1) // simdgroup_size * simdgroup_size
-      grid_size = B * gN * gM * C_padded
-      outputs = kernel(
-          inputs=[x, grid, cotangent],
-          template=[("T", x.dtype)],
-          output_shapes=[x.shape, grid.shape],
-          output_dtypes=[x.dtype, x.dtype],
-          grid=(grid_size, 1, 1),
-          threadgroup=(256, 1, 1),
-          init_value=0,
-      )
-      return outputs[0], outputs[1]
+            if (thread_index_in_simdgroup == 0) {
+                atomic_fetch_add_explicit(&grid_grad[grid_idx], gix * gix_mult, memory_order_relaxed);
+                atomic_fetch_add_explicit(&grid_grad[grid_idx + 1], giy * giy_mult, memory_order_relaxed);
+            }
+        """
+        kernel = mx.fast.metal_kernel(
+            name="grid_sample_grad",
+            input_names=["x", "grid", "cotangent"],
+            output_names=["x_grad", "grid_grad"],
+            source=source,
+            atomic_outputs=True,
+        )
+        # pad the output channels to simd group size
+        # so that our `simd_sum`s don't overlap.
+        simdgroup_size = 32
+        C_padded = (C + simdgroup_size - 1) // simdgroup_size * simdgroup_size
+        grid_size = B * gN * gM * C_padded
+        outputs = kernel(
+            inputs=[x, grid, cotangent],
+            template=[("T", x.dtype)],
+            output_shapes=[x.shape, grid.shape],
+            output_dtypes=[x.dtype, x.dtype],
+            grid=(grid_size, 1, 1),
+            threadgroup=(256, 1, 1),
+            init_value=0,
+        )
+        return outputs[0], outputs[1]

 There's an even larger speed up for the vjp:

--- a/docs/src/dev/extensions.rst
+++ b/docs/src/dev/extensions.rst
@@ -138,13 +138,13 @@ more concrete:
        * representing the vectorized computation and the axis which
        * corresponds to the output vectorized dimension.
        */
-        std::pair<std::vector<array>, std::vector<int>> vmap(
+        virtual std::pair<std::vector<array>, std::vector<int>> vmap(
            const std::vector<array>& inputs,
            const std::vector<int>& axes) override;

-        /** The name of primitive. */
-        const char* name() const override {
-          return "Axpby";
+        /** Print the primitive. */
+        void print(std::ostream& os) override {
+            os << "Axpby";
        }

        /** Equivalence check **/
@@ -397,11 +397,11 @@ below.
        std::ostringstream kname;
        kname << "axpby_" << "general_" << type_to_name(out);

-        // Load the metal library
-        auto lib = d.get_library("mlx_ext");
+        // Make sure the metal library is available
+        d.register_library("mlx_ext");

        // Make a kernel from this metal library
-        auto kernel = d.get_kernel(kname.str(), lib);
+        auto kernel = d.get_kernel(kname.str(), "mlx_ext");

        // Prepare to encode kernel
        auto& compute_encoder = d.get_command_encoder(s.index);
--- a/docs/src/install.rst
+++ b/docs/src/install.rst
@@ -23,24 +23,13 @@ To install from PyPI you must meet the following requirements:
    MLX is only available on devices running macOS >= 13.5
    It is highly recommended to use macOS 14 (Sonoma)

-CUDA
-^^^^

-MLX has a CUDA backend which you can use on any Linux platform with CUDA 12
-and SM 7.0 (Volta) and up. To install MLX with CUDA support, run:
+MLX is also available on conda-forge. To install MLX with conda, run:

 .. code-block:: shell

-    pip install "mlx[cuda]"
+   conda install conda-forge::mlx

-CPU-only (Linux)
-^^^^^^^^^^^^^^^^
-
-For a CPU-only version of MLX that runs on Linux use:
-
-.. code-block:: shell
-
-    pip install "mlx[cpu]"

 Troubleshooting
 ^^^^^^^^^^^^^^^
@@ -76,8 +65,6 @@ Build Requirements
 Python API
 ^^^^^^^^^^

-.. _python install:
-
 To build and install the MLX python library from source, first, clone MLX from
 `its GitHub repo <https://github.com/ml-explore/mlx>`_:

@@ -89,20 +76,20 @@ Then simply build and install MLX using pip:

 .. code-block:: shell

-  pip install .
+  CMAKE_BUILD_PARALLEL_LEVEL=8 pip install .

 For developing, install the package with development dependencies, and use an
 editable install:

 .. code-block:: shell

-  pip install -e ".[dev]"
+  CMAKE_BUILD_PARALLEL_LEVEL=8 pip install -e ".[dev]"

 Once the development dependencies are installed, you can build faster with:

 .. code-block:: shell

- python setup.py build_ext --inplace
+ CMAKE_BUILD_PARALLEL_LEVEL=8 python setup.py build_ext --inplace

 Run the tests with:

@@ -120,8 +107,6 @@ IDE:
 C++ API
 ^^^^^^^

-.. _cpp install:
-
 Currently, MLX must be built and installed from source.

 Similarly to the python library, to build and install the MLX C++ library start
@@ -200,7 +185,6 @@ should point to the path to the built metal library.

      xcrun -sdk macosx --show-sdk-version

-
 Binary Size Minimization
 ~~~~~~~~~~~~~~~~~~~~~~~~

@@ -229,50 +213,6 @@ be anwywhere from a few hundred millisecond to a few seconds depending on the
 application. Once a kernel is compiled, it will be cached by the system. The
 Metal kernel cache persists across reboots.

-Linux
-^^^^^
-
-To build from source on Linux (CPU only), install the BLAS and LAPACK headers.
-For example on Ubuntu, run the following:
-
-.. code-block:: shell
-
-   apt-get update -y
-   apt-get install libblas-dev liblapack-dev liblapacke-dev -y
-
-From here follow the instructions to install either the :ref:`Python <python
-install>` or :ref:`C++ <cpp install>` APIs.
-
-CUDA
-^^^^
-
-To build from source on Linux with CUDA, install the BLAS and LAPACK headers
-and the CUDA toolkit. For example on Ubuntu, run the following:
-
-.. code-block:: shell
-
-   wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
-   dpkg -i cuda-keyring_1.1-1_all.deb
-   apt-get update -y
-   apt-get -y install cuda-toolkit-12-9
-   apt-get install libblas-dev liblapack-dev liblapacke-dev -y
-
-
-When building either the Python or C++ APIs make sure to pass the cmake flag
-``MLX_BUILD_CUDA=ON``. For example, to build the Python API run:
-
-.. code-block:: shell
-
-  CMAKE_ARGS="-DMLX_BUILD_CUDA=ON" pip install -e ".[dev]"
-
-To build the C++ package run:
-
-.. code-block:: shell
-
-   mkdir -p build && cd build
-   cmake .. -DMLX_BUILD_CUDA=ON && make -j
-
-
 Troubleshooting
 ^^^^^^^^^^^^^^^

--- a/docs/src/python/array.rst
+++ b/docs/src/python/array.rst
@@ -19,8 +19,6 @@ Array
    array.ndim
    array.shape
    array.size
-    array.real
-    array.imag
    array.abs
    array.all
    array.any
--- a/docs/src/python/linalg.rst
+++ b/docs/src/python/linalg.rst
@@ -16,8 +16,6 @@ Linear Algebra
    cross
    qr
    svd
-    eigvals
-    eig
    eigvalsh
    eigh
    lu
--- a/docs/src/usage/indexing.rst
+++ b/docs/src/usage/indexing.rst
@@ -107,16 +107,6 @@ same array:
  >>> a
  array([1, 2, 0], dtype=int32)

-
-Note, unlike NumPy, updates to the same location are nondeterministic:
-
-.. code-block:: shell
-
-  >>> a = mx.array([1, 2, 3])
-  >>> a[[0, 0]] = mx.array([4, 5])
-
-The first element of ``a`` could be ``4`` or ``5``.
-
 Transformations of functions which use in-place updates are allowed and work as
 expected. For example:

--- a/examples/extensions/axpby/axpby.cpp
+++ b/examples/extensions/axpby/axpby.cpp
@@ -172,11 +172,11 @@ void Axpby::eval_gpu(
  kname << (contiguous_kernel ? "contiguous_" : "general_");
  kname << type_to_name(out);

-  // Load the metal library
-  auto lib = d.get_library("mlx_ext");
+  // Make sure the metal library is available
+  d.register_library("mlx_ext");

  // Make a kernel from this metal library
-  auto kernel = d.get_kernel(kname.str(), lib);
+  auto kernel = d.get_kernel(kname.str(), "mlx_ext");

  // Prepare to encode kernel
  auto& compute_encoder = d.get_command_encoder(s.index);
--- a/examples/extensions/axpby/axpby.h
+++ b/examples/extensions/axpby/axpby.h
@@ -74,9 +74,9 @@ class Axpby : public mx::Primitive {
      const std::vector<mx::array>& inputs,
      const std::vector<int>& axes) override;

-  /** The name of primitive. */
-  const char* name() const override {
-    return "Axpby";
+  /** Print the primitive. */
+  void print(std::ostream& os) override {
+    os << "Axpby";
  }

  /** Equivalence check **/
--- a/mlx/CMakeLists.txt
+++ b/mlx/CMakeLists.txt
@@ -21,7 +21,7 @@ target_sources(
          ${CMAKE_CURRENT_SOURCE_DIR}/backend/metal/metal.h)

 # Define MLX_VERSION only in the version.cpp file.
-add_library(mlx_version OBJECT ${CMAKE_CURRENT_SOURCE_DIR}/version.cpp)
+add_library(mlx_version STATIC ${CMAKE_CURRENT_SOURCE_DIR}/version.cpp)
 target_compile_definitions(mlx_version PRIVATE MLX_VERSION="${MLX_VERSION}")
 target_link_libraries(mlx PRIVATE $<BUILD_INTERFACE:mlx_version>)

@@ -55,9 +55,6 @@ endif()

 if(MLX_BUILD_CUDA)
  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backend/cuda)
-else()
-  target_sources(mlx
-                 PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/backend/cuda/no_cuda.cpp)
 endif()

 if(MLX_BUILD_METAL OR MLX_BUILD_CUDA)
--- a/mlx/array.h
+++ b/mlx/array.h
@@ -224,10 +224,6 @@ class array {
    // Not copyable
    Data(const Data& d) = delete;
    Data& operator=(const Data& d) = delete;
-    Data(Data&& o) : buffer(o.buffer), d(o.d) {
-      o.buffer = allocator::Buffer(nullptr);
-      o.d = [](allocator::Buffer) {};
-    }
    ~Data() {
      d(buffer);
    }
--- a/mlx/backend/common/CMakeLists.txt
+++ b/mlx/backend/common/CMakeLists.txt
@@ -6,4 +6,5 @@ target_sources(
          ${CMAKE_CURRENT_SOURCE_DIR}/load.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/reduce.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/slicing.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/transpose.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/utils.cpp)
--- a/mlx/backend/common/buffer_cache.h
+++ b/mlx/backend/common/buffer_cache.h
@@ -1,157 +0,0 @@
-// Copyright © 2025 Apple Inc.
-
-#pragma once
-
-#include <cassert>
-#include <functional>
-#include <map>
-
-namespace mlx::core {
-
-template <typename T>
-class BufferCache {
- public:
-  BufferCache(
-      size_t page_size,
-      std::function<size_t(T*)> get_size,
-      std::function<void(T*)> free)
-      : page_size_(page_size),
-        get_size_(std::move(get_size)),
-        free_(std::move(free)) {}
-
-  ~BufferCache() {
-    clear();
-  }
-
-  BufferCache(const BufferCache&) = delete;
-  BufferCache& operator=(const BufferCache&) = delete;
-
-  T* reuse_from_cache(size_t size) {
-    // Find the closest buffer in pool.
-    auto it = buffer_pool_.lower_bound(size);
-    if (it == buffer_pool_.end() ||
-        it->first >= std::min(2 * size, size + 2 * page_size_)) {
-      return nullptr;
-    }
-
-    // Collect from the cache.
-    T* buf = it->second->buf;
-    pool_size_ -= it->first;
-
-    // Remove from record.
-    remove_from_list(it->second);
-    buffer_pool_.erase(it);
-    return buf;
-  }
-
-  void recycle_to_cache(T* buf) {
-    assert(buf);
-    // Add to cache.
-    BufferHolder* bh = new BufferHolder(buf);
-    add_at_head(bh);
-    size_t size = get_size_(buf);
-    pool_size_ += size;
-    buffer_pool_.emplace(size, bh);
-  }
-
-  int release_cached_buffers(size_t min_bytes_to_free) {
-    if (min_bytes_to_free >= 0.9 * pool_size_) {
-      return clear();
-    } else {
-      int n_release = 0;
-      size_t total_bytes_freed = 0;
-
-      while (tail_ && (total_bytes_freed < min_bytes_to_free)) {
-        // Release buffer.
-        size_t size = get_size_(tail_->buf);
-        total_bytes_freed += size;
-        free_(tail_->buf);
-        n_release++;
-
-        // Remove from record.
-        auto its = buffer_pool_.equal_range(size);
-        auto it = std::find_if(its.first, its.second, [this](const auto& el) {
-          return el.second == tail_;
-        });
-        assert(it != buffer_pool_.end());
-        buffer_pool_.erase(it);
-        remove_from_list(tail_);
-      }
-
-      pool_size_ -= total_bytes_freed;
-      return n_release;
-    }
-  }
-
-  int clear() {
-    int n_release = 0;
-    for (auto& [size, holder] : buffer_pool_) {
-      free_(holder->buf);
-      n_release++;
-      delete holder;
-    }
-    buffer_pool_.clear();
-    pool_size_ = 0;
-    head_ = nullptr;
-    tail_ = nullptr;
-    return n_release;
-  }
-
-  size_t cache_size() const {
-    return pool_size_;
-  }
-
-  size_t page_size() const {
-    return page_size_;
-  }
-
- private:
-  struct BufferHolder {
-   public:
-    explicit BufferHolder(T* buf_) : buf(buf_) {}
-
-    BufferHolder* prev{nullptr};
-    BufferHolder* next{nullptr};
-    T* buf;
-  };
-
-  void add_at_head(BufferHolder* to_add) {
-    if (!head_) {
-      head_ = to_add;
-      tail_ = to_add;
-    } else {
-      head_->prev = to_add;
-      to_add->next = head_;
-      head_ = to_add;
-    }
-  }
-
-  void remove_from_list(BufferHolder* to_remove) {
-    if (to_remove->prev && to_remove->next) { // if middle
-      to_remove->prev->next = to_remove->next;
-      to_remove->next->prev = to_remove->prev;
-    } else if (to_remove->prev && to_remove == tail_) { // if tail
-      tail_ = to_remove->prev;
-      tail_->next = nullptr;
-    } else if (to_remove == head_ && to_remove->next) { // if head
-      head_ = to_remove->next;
-      head_->prev = nullptr;
-    } else if (to_remove == head_ && to_remove == tail_) { // if only element
-      head_ = nullptr;
-      tail_ = nullptr;
-    }
-
-    delete to_remove;
-  }
-
-  std::multimap<size_t, BufferHolder*> buffer_pool_;
-  BufferHolder* head_{nullptr};
-  BufferHolder* tail_{nullptr};
-  size_t pool_size_{0};
-
-  const size_t page_size_;
-  std::function<size_t(T*)> get_size_;
-  std::function<void(T*)> free_;
-};
-
-} // namespace mlx::core
--- a/mlx/backend/common/common.cpp
+++ b/mlx/backend/common/common.cpp
@@ -2,6 +2,7 @@
 #include <cassert>

 #include "mlx/backend/common/broadcasting.h"
+#include "mlx/backend/common/transpose.h"
 #include "mlx/backend/common/utils.h"
 #include "mlx/primitives.h"

@@ -19,26 +20,19 @@ void AsStrided::eval(const std::vector<array>& inputs, array& out) {
        "AsStrided must be used with row contiguous arrays only.");
  }

-  // Compute the flags given the shape and strides
-  bool row_contiguous = true, col_contiguous = true;
-  size_t r = 1, c = 1;
-  for (int i = strides_.size() - 1, j = 0; i >= 0; i--, j++) {
-    row_contiguous &= (r == strides_[i]) || (shape_[i] == 1);
-    col_contiguous &= (c == strides_[j]) || (shape_[j] == 1);
-    r *= shape_[i];
-    c *= shape_[j];
-  }
+  // Calculate the contiguity based on the given shape and strides
+  auto [ds, rc, cc] = check_contiguity(shape_, strides_);
  auto flags = in.flags();
+
  // TODO: Compute the contiguous flag in a better way cause now we are
  //       unnecessarily strict.
-  flags.contiguous = row_contiguous || col_contiguous;
-  flags.row_contiguous = row_contiguous;
-  flags.col_contiguous = col_contiguous;
+  flags.contiguous = rc || cc;
+  flags.row_contiguous = rc;
+  flags.col_contiguous = cc;

-  // There is no easy way to compute the actual data size so we use out.size().
-  // The contiguous flag will almost certainly not be set so no code should
-  // rely on data_size anyway.
-  size_t data_size = out.size();
+  // There is no easy way to compute the actual data size so we use out.size()
+  // when the array is not contiguous.
+  size_t data_size = flags.contiguous ? ds : out.size();

  return out.copy_shared_buffer(in, strides_, flags, data_size, offset_);
 }
@@ -270,36 +264,7 @@ void StopGradient::eval(const std::vector<array>& inputs, array& out) {

 void Transpose::eval(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 1);
-  Strides out_strides(out.ndim());
-  auto& in = inputs[0];
-  for (int ax = 0; ax < axes_.size(); ++ax) {
-    out_strides[ax] = in.strides()[axes_[ax]];
-  }
-
-  // Conditions for {row/col}_contiguous
-  // - array must be contiguous (no gaps)
-  // - underlying buffer size should have the same size as the array
-  // - cumulative product of shapes is equal to the strides (we can ignore axes
-  //   with size == 1)
-  //   - in the forward direction (column contiguous)
-  //   - in the reverse direction (row contiguous)
-  // - vectors are both row and col contiguous (hence if both row/col are
-  //   true, they stay true)
-  auto flags = in.flags();
-  if (flags.contiguous && in.data_size() == in.size()) {
-    int64_t f_stride = 1;
-    int64_t b_stride = 1;
-    flags.col_contiguous = true;
-    flags.row_contiguous = true;
-    for (int i = 0, ri = out.ndim() - 1; i < out.ndim(); ++i, --ri) {
-      flags.col_contiguous &= (out_strides[i] == f_stride || out.shape(i) == 1);
-      f_stride *= out.shape(i);
-      flags.row_contiguous &=
-          (out_strides[ri] == b_stride || out.shape(ri) == 1);
-      b_stride *= out.shape(ri);
-    }
-  }
-  out.copy_shared_buffer(in, out_strides, flags, in.data_size());
+  transpose(inputs[0], out, axes_);
 }

 } // namespace mlx::core
--- a/mlx/backend/common/compiled.cpp
+++ b/mlx/backend/common/compiled.cpp
@@ -1,7 +1,8 @@
 // Copyright © 2023-2024 Apple Inc.

 #include "mlx/backend/common/compiled.h"
-#include "mlx/backend/common/utils.h"
+#include "mlx/graph_utils.h"
+#include "mlx/primitives.h"
 #include "mlx/utils.h"

 namespace mlx::core {
@@ -14,8 +15,6 @@ void print_constant(std::ostream& os, const array& x) {
      return print_float_constant<float16_t>(os, x);
    case bfloat16:
      return print_float_constant<bfloat16_t>(os, x);
-    case float64:
-      return print_float_constant<double>(os, x);
    case complex64:
      return print_complex_constant<complex64_t>(os, x);
    case int8:
@@ -52,8 +51,6 @@ std::string get_type_string(Dtype d) {
      return "float16_t";
    case bfloat16:
      return "bfloat16_t";
-    case float64:
-      return "double";
    case complex64:
      return "complex64_t";
    case bool_:
@@ -82,6 +79,55 @@ std::string get_type_string(Dtype d) {
  }
 }

+std::string build_lib_name(
+    const std::vector<array>& inputs,
+    const std::vector<array>& outputs,
+    const std::vector<array>& tape,
+    const std::unordered_set<uintptr_t>& constant_ids) {
+  NodeNamer namer;
+  std::ostringstream os;
+  std::ostringstream constant_hasher;
+
+  // Fill the input names. This is not really necessary, I just like having A,
+  // B, C, ... as the inputs.
+  for (auto& x : inputs) {
+    namer.get_name(x);
+  }
+
+  // The primitives describing the tape. For unary and binary primitives this
+  // must be enough to describe the full computation.
+  for (auto& a : tape) {
+    // name and type of output
+    os << namer.get_name(a) << kindof(a.dtype()) << a.itemsize();
+    // computation performed
+    a.primitive().print(os);
+    // name of inputs to the function
+    for (auto& inp : a.inputs()) {
+      os << namer.get_name(inp);
+    }
+  }
+  os << "_";
+
+  for (auto& x : inputs) {
+    if (constant_ids.find(x.id()) != constant_ids.end()) {
+      os << "C";
+      print_constant(constant_hasher, x);
+    } else {
+      os << (is_scalar(x) ? "S" : "V");
+    }
+  }
+  os << "_";
+  for (auto& x : inputs) {
+    if (constant_ids.find(x.id()) != constant_ids.end()) {
+      continue;
+    }
+    os << kindof(x.dtype()) << x.itemsize();
+  }
+  os << "_" << std::hash<std::string>{}(constant_hasher.str());
+
+  return os.str();
+}
+
 bool compiled_check_contiguity(
    const std::vector<array>& inputs,
    const Shape& shape) {
@@ -113,7 +159,8 @@ bool compiled_check_contiguity(
 void compiled_allocate_outputs(
    const std::vector<array>& inputs,
    std::vector<array>& outputs,
-    const std::function<bool(size_t)>& is_constant,
+    const std::vector<array>& inputs_,
+    const std::unordered_set<uintptr_t>& constant_ids_,
    bool contiguous) {
  if (contiguous) {
    int o = 0;
@@ -128,7 +175,8 @@ void compiled_allocate_outputs(
      // - Donatable
      // - Not a constant
      if (in.itemsize() == outputs[o].itemsize() && !is_scalar(in) &&
-          in.is_donatable() && is_constant(i)) {
+          in.is_donatable() &&
+          constant_ids_.find(inputs_[i].id()) == constant_ids_.end()) {
        outputs[o++].copy_shared_buffer(in);
      }
      // Get representative input flags to properly set non-donated outputs
@@ -156,7 +204,7 @@ void compiled_allocate_outputs(
      // - Not a constant
      if (in.flags().row_contiguous && in.size() == outputs[o].size() &&
          in.itemsize() == outputs[o].itemsize() && in.is_donatable() &&
-          is_constant(i)) {
+          constant_ids_.find(inputs_[i].id()) == constant_ids_.end()) {
        outputs[o].copy_shared_buffer(
            in, outputs[o].strides(), in.flags(), in.data_size());
        o++;
@@ -168,74 +216,4 @@ void compiled_allocate_outputs(
  }
 }

-std::tuple<bool, Shape, std::vector<Strides>> compiled_collapse_contiguous_dims(
-    const std::vector<array>& inputs,
-    const array& out,
-    const std::function<bool(size_t)>& is_constant) {
-  const Shape& shape = out.shape();
-  bool contiguous = compiled_check_contiguity(inputs, shape);
-  if (contiguous) {
-    return {true, shape, {}};
-  }
-
-  std::vector<Strides> strides_vec{out.strides()};
-  for (size_t i = 0; i < inputs.size(); ++i) {
-    // Skip constants.
-    if (is_constant(i)) {
-      continue;
-    }
-
-    // Skip scalar inputs.
-    const auto& x = inputs[i];
-    if (is_scalar(x)) {
-      continue;
-    }
-
-    // Broadcast the inputs to the output shape.
-    Strides xstrides;
-    size_t j = 0;
-    for (; j < shape.size() - x.ndim(); ++j) {
-      if (shape[j] == 1) {
-        xstrides.push_back(out.strides()[j]);
-      } else {
-        xstrides.push_back(0);
-      }
-    }
-    for (size_t i = 0; i < x.ndim(); ++i, ++j) {
-      if (x.shape(i) == 1) {
-        if (shape[j] == 1) {
-          xstrides.push_back(out.strides()[j]);
-        } else {
-          xstrides.push_back(0);
-        }
-      } else {
-        xstrides.push_back(x.strides()[i]);
-      }
-    }
-    strides_vec.push_back(std::move(xstrides));
-  }
-
-  auto tup = collapse_contiguous_dims(shape, strides_vec, INT32_MAX);
-  return {false, std::move(std::get<0>(tup)), std::move(std::get<1>(tup))};
-}
-
-bool compiled_use_large_index(
-    const std::vector<array>& inputs,
-    const std::vector<array>& outputs,
-    bool contiguous) {
-  if (contiguous) {
-    size_t max_size = 0;
-    for (const auto& in : inputs) {
-      max_size = std::max(max_size, in.data_size());
-    }
-    return max_size > UINT32_MAX;
-  } else {
-    size_t max_size = 0;
-    for (const auto& o : outputs) {
-      max_size = std::max(max_size, o.size());
-    }
-    return max_size > UINT32_MAX;
-  }
-}
-
 } // namespace mlx::core
--- a/mlx/backend/common/compiled.h
+++ b/mlx/backend/common/compiled.h
@@ -1,8 +1,9 @@
 // Copyright © 2023-2024 Apple Inc.
 #pragma once

-#include <functional>
 #include <iomanip>
+#include <sstream>
+#include <unordered_set>

 #include "mlx/array.h"
 #include "mlx/primitives.h"
@@ -13,17 +14,19 @@ inline bool is_static_cast(const Primitive& p) {
  return (typeid(p) == typeid(Broadcast) || typeid(p) == typeid(AsType));
 }

+std::string build_lib_name(
+    const std::vector<array>& inputs,
+    const std::vector<array>& outputs,
+    const std::vector<array>& tape,
+    const std::unordered_set<uintptr_t>& constant_ids);
+
 std::string get_type_string(Dtype d);

 template <typename T>
 void print_float_constant(std::ostream& os, const array& x) {
  auto old_precision = os.precision();
-  if constexpr (std::is_same_v<T, double>) {
-    os << std::setprecision(std::numeric_limits<double>::digits10 + 1);
-  } else {
-    os << std::setprecision(std::numeric_limits<float>::digits10 + 1);
-  }
-  os << x.item<T>() << std::setprecision(old_precision);
+  os << std::setprecision(std::numeric_limits<float>::digits10 + 1)
+     << x.item<T>() << std::setprecision(old_precision);
 }

 template <typename T>
@@ -57,19 +60,8 @@ bool compiled_check_contiguity(
 void compiled_allocate_outputs(
    const std::vector<array>& inputs,
    std::vector<array>& outputs,
-    const std::function<bool(size_t)>& is_constant,
-    bool contiguous);
-
-// Collapse contiguous dims ignoring scalars and constants.
-std::tuple<bool, Shape, std::vector<Strides>> compiled_collapse_contiguous_dims(
-    const std::vector<array>& inputs,
-    const array& out,
-    const std::function<bool(size_t)>& is_constant);
-
-// Return whether the kernel should use large index.
-bool compiled_use_large_index(
-    const std::vector<array>& inputs,
-    const std::vector<array>& outputs,
+    const std::vector<array>& inputs_,
+    const std::unordered_set<uintptr_t>& constant_ids_,
    bool contiguous);

 } // namespace mlx::core
--- a/mlx/backend/common/copy.h
+++ b/mlx/backend/common/copy.h
@@ -2,7 +2,7 @@

 #pragma once

-#include "mlx/backend/common/utils.h"
+#include "mlx/array.h"

 namespace mlx::core {

@@ -26,7 +26,7 @@ inline bool set_copy_output_data(const array& in, array& out, CopyType ctype) {
  if (ctype == CopyType::Vector) {
    // If the input is donateable, we are doing a vector copy and the types
    // have the same size, then the input buffer can hold the output.
-    if (is_donatable(in, out)) {
+    if (in.is_donatable() && in.itemsize() == out.itemsize()) {
      out.copy_shared_buffer(in);
      return true;
    } else {
--- a/mlx/backend/common/matmul.h
+++ b/mlx/backend/common/matmul.h
@@ -1,67 +0,0 @@
-// Copyright © 2025 Apple Inc.
-
-#pragma once
-
-#include "mlx/backend/common/utils.h"
-#include "mlx/utils.h"
-
-#include <sstream>
-
-namespace mlx::core {
-
-inline std::tuple<Shape, Strides, Strides> collapse_batches(
-    const array& a,
-    const array& b) {
-  if (a.ndim() == 2) {
-    return {{1}, {0}, {0}};
-  }
-
-  Shape A_bshape{a.shape().begin(), a.shape().end() - 2};
-  Strides A_bstride{a.strides().begin(), a.strides().end() - 2};
-  Strides B_bstride{b.strides().begin(), b.strides().end() - 2};
-
-  auto [batch_shape, batch_strides] =
-      collapse_contiguous_dims(A_bshape, std::vector{A_bstride, B_bstride});
-
-  auto a_batch_strides = batch_strides[0];
-  auto b_batch_strides = batch_strides[1];
-
-  if (batch_shape.empty()) {
-    batch_shape.push_back(1);
-    a_batch_strides.push_back(0);
-    b_batch_strides.push_back(0);
-  }
-
-  return std::make_tuple(batch_shape, a_batch_strides, b_batch_strides);
-}
-
-inline std::tuple<Shape, Strides, Strides, Strides>
-collapse_batches(const array& a, const array& b, const array& c) {
-  if (a.ndim() == 2) {
-    return {{1}, {0}, {0}, {0}};
-  }
-
-  Shape A_bshape{a.shape().begin(), a.shape().end() - 2};
-  Strides A_bstride{a.strides().begin(), a.strides().end() - 2};
-  Strides B_bstride{b.strides().begin(), b.strides().end() - 2};
-  Strides C_bstride{c.strides().begin(), c.strides().end() - 2};
-
-  auto [batch_shape, batch_strides] = collapse_contiguous_dims(
-      A_bshape, std::vector{A_bstride, B_bstride, C_bstride});
-
-  auto A_batch_stride = batch_strides[0];
-  auto B_batch_stride = batch_strides[1];
-  auto C_batch_stride = batch_strides[2];
-
-  if (batch_shape.empty()) {
-    batch_shape.push_back(1);
-    A_batch_stride.push_back(0);
-    B_batch_stride.push_back(0);
-    C_batch_stride.push_back(0);
-  }
-
-  return std::make_tuple(
-      batch_shape, A_batch_stride, B_batch_stride, C_batch_stride);
-}
-
-} // namespace mlx::core
--- a/mlx/backend/common/reduce.cpp
+++ b/mlx/backend/common/reduce.cpp
@@ -5,9 +5,11 @@
 namespace mlx::core {

 std::pair<Shape, Strides> shapes_without_reduction_axes(
-    Shape shape,
-    Strides strides,
+    const array& x,
    const std::vector<int>& axes) {
+  auto shape = x.shape();
+  auto strides = x.strides();
+
  for (int i = axes.size() - 1; i >= 0; i--) {
    int a = axes[i];
    shape.erase(shape.begin() + a);
@@ -17,15 +19,6 @@ std::pair<Shape, Strides> shapes_without_reduction_axes(
  return std::make_pair(shape, strides);
 }

-std::pair<Shape, Strides> shapes_without_reduction_axes(
-    const array& x,
-    const std::vector<int>& axes) {
-  auto shape = x.shape();
-  auto strides = x.strides();
-  return shapes_without_reduction_axes(
-      std::move(shape), std::move(strides), axes);
-}
-
 ReductionPlan get_reduction_plan(const array& x, const std::vector<int>& axes) {
  // The data is all there and we are reducing over everything
  if (x.size() == x.data_size() && axes.size() == x.ndim() &&
--- a/mlx/backend/common/reduce.h
+++ b/mlx/backend/common/reduce.h
@@ -51,9 +51,5 @@ ReductionPlan get_reduction_plan(const array& x, const std::vector<int>& axes);
 std::pair<Shape, Strides> shapes_without_reduction_axes(
    const array& x,
    const std::vector<int>& axes);
-std::pair<Shape, Strides> shapes_without_reduction_axes(
-    Shape shape,
-    Strides strides,
-    const std::vector<int>& axes);

 } // namespace mlx::core
--- a/mlx/backend/common/transpose.cpp
+++ b/mlx/backend/common/transpose.cpp
@@ -0,0 +1,57 @@
+// Copyright © 2024 Apple Inc.
+
+#include <cassert>
+
+#include "mlx/backend/common/utils.h"
+
+namespace mlx::core {
+
+void transpose(const array& in, array& out, const std::vector<int>& axes) {
+  Strides out_strides(out.ndim());
+  for (int ax = 0; ax < axes.size(); ++ax) {
+    out_strides[ax] = in.strides()[axes[ax]];
+  }
+
+  // Conditions for {row/col}_contiguous
+  // - array must be contiguous (no gaps)
+  // - underlying buffer size should have the same size as the array
+  // - cumulative product of shapes is equal to the strides (we can ignore axes
+  //   with size == 1)
+  //   - in the forward direction (column contiguous)
+  //   - in the reverse direction (row contiguous)
+  // - vectors are both row and col contiguous (hence if both row/col are
+  //   true, they stay true)
+  auto flags = in.flags();
+  if (flags.contiguous && in.data_size() == in.size()) {
+    auto [_, rc, cc] = check_contiguity(out.shape(), out_strides);
+    flags.row_contiguous = rc;
+    flags.col_contiguous = cc;
+  }
+  out.copy_shared_buffer(in, out_strides, flags, in.data_size());
+}
+
+void as_transposed(array& out, const std::vector<int>& axes) {
+  assert(out.data_size() == out.size() && out.flags().contiguous);
+
+  // Calculate the contiguous strides.
+  Strides strides(out.ndim(), 1);
+  for (int i = out.ndim() - 2; i >= 0; i--) {
+    strides[i] = strides[i + 1] * out.shape(i);
+  }
+
+  // Calculate the new strides for transposing.
+  Strides new_strides;
+  new_strides.reserve(out.ndim());
+  for (auto ax : axes) {
+    new_strides.push_back(strides[ax]);
+  }
+
+  auto [ds, rc, cc] = check_contiguity(out.shape(), new_strides);
+  auto flags = out.flags();
+  flags.row_contiguous = rc;
+  flags.col_contiguous = cc;
+
+  out.copy_shared_buffer(out, new_strides, flags, ds);
+}
+
+} // namespace mlx::core
--- a/mlx/backend/common/transpose.h
+++ b/mlx/backend/common/transpose.h
@@ -0,0 +1,12 @@
+// Copyright © 2024 Apple Inc.
+
+#pragma once
+
+#include "mlx/array.h"
+
+namespace mlx::core {
+
+void transpose(const array& in, array& out, const std::vector<int>& axes);
+void as_transposed(array& out, const std::vector<int>& axes);
+
+} // namespace mlx::core
--- a/mlx/backend/common/unary.h
+++ b/mlx/backend/common/unary.h
@@ -1,26 +0,0 @@
-// Copyright © 2025 Apple Inc.
-
-#pragma once
-
-#include "mlx/allocator.h"
-#include "mlx/backend/common/utils.h"
-
-namespace mlx::core {
-
-inline void set_unary_output_data(const array& in, array& out) {
-  if (in.flags().contiguous) {
-    if (is_donatable(in, out)) {
-      out.copy_shared_buffer(in);
-    } else {
-      out.set_data(
-          allocator::malloc(in.data_size() * out.itemsize()),
-          in.data_size(),
-          in.strides(),
-          in.flags());
-    }
-  } else {
-    out.set_data(allocator::malloc(out.nbytes()));
-  }
-}
-
-} // namespace mlx::core
--- a/mlx/backend/common/utils.cpp
+++ b/mlx/backend/common/utils.cpp
@@ -1,22 +1,9 @@
 // Copyright © 2023-2024 Apple Inc.

-#include <dlfcn.h>
-
 #include "mlx/backend/common/utils.h"

 namespace mlx::core {

-std::filesystem::path current_binary_dir() {
-  static std::filesystem::path binary_dir = []() {
-    Dl_info info;
-    if (!dladdr(reinterpret_cast<void*>(&current_binary_dir), &info)) {
-      throw std::runtime_error("Unable to get current binary dir.");
-    }
-    return std::filesystem::path(info.dli_fname).parent_path();
-  }();
-  return binary_dir;
-}
-
 std::tuple<Shape, std::vector<Strides>> collapse_contiguous_dims(
    const Shape& shape,
    const std::vector<Strides>& strides,
@@ -114,118 +101,4 @@ std::pair<Shape, Strides> collapse_contiguous_dims(
  return collapse_contiguous_dims(a.shape(), a.strides(), size_cap);
 }

-Dims get_block_dims_common(int dim0, int dim1, int dim2, int pow2 /* = 10 */) {
-  int pows[3] = {0, 0, 0};
-  int sum = 0;
-  while (true) {
-    int presum = sum;
-    // Check all the pows
-    if (dim0 >= (1 << (pows[0] + 1))) {
-      pows[0]++;
-      sum++;
-    }
-    if (sum == 10) {
-      break;
-    }
-    if (dim1 >= (1 << (pows[1] + 1))) {
-      pows[1]++;
-      sum++;
-    }
-    if (sum == 10) {
-      break;
-    }
-    if (dim2 >= (1 << (pows[2] + 1))) {
-      pows[2]++;
-      sum++;
-    }
-    if (sum == presum || sum == pow2) {
-      break;
-    }
-  }
-  return std::make_tuple(1ul << pows[0], 1ul << pows[1], 1ul << pows[2]);
-}
-
-Dims get_2d_grid_dims_common(const Shape& shape, const Strides& strides) {
-  // Dims with strides of 0 are ignored as they
-  // correspond to broadcasted dimensions
-  size_t grid_x = 1;
-  size_t grid_y = 1;
-  for (int i = 0; i < shape.size(); ++i) {
-    if (strides[i] == 0) {
-      continue;
-    }
-    if (grid_x * shape[i] < UINT32_MAX) {
-      grid_x *= shape[i];
-    } else {
-      grid_y *= shape[i];
-    }
-  }
-  if (grid_y > UINT32_MAX || grid_x > UINT32_MAX) {
-    throw std::runtime_error("Unable to safely factor shape.");
-  }
-  if (grid_y > grid_x) {
-    std::swap(grid_x, grid_y);
-  }
-  return std::make_tuple(
-      static_cast<uint32_t>(grid_x), static_cast<uint32_t>(grid_y), 1);
-}
-
-Dims get_2d_grid_dims_common(
-    const Shape& shape,
-    const Strides& strides,
-    size_t divisor) {
-  // Compute the 2d grid dimensions such that the total size of the grid is
-  // divided by divisor.
-  size_t grid_x = 1;
-  size_t grid_y = 1;
-  for (int i = 0; i < shape.size(); ++i) {
-    if (strides[i] == 0) {
-      continue;
-    }
-
-    // No need to add this shape we can just remove it from the divisor.
-    if (divisor % shape[i] == 0) {
-      divisor /= shape[i];
-      continue;
-    }
-
-    if (grid_x * shape[i] < UINT32_MAX) {
-      grid_x *= shape[i];
-    } else {
-      grid_y *= shape[i];
-    }
-
-    if (divisor > 1) {
-      if (grid_x % divisor == 0) {
-        grid_x /= divisor;
-        divisor = 1;
-      } else if (grid_y % divisor == 0) {
-        grid_y /= divisor;
-        divisor = 1;
-      }
-    }
-  }
-  if (grid_y > UINT32_MAX || grid_x > UINT32_MAX) {
-    throw std::runtime_error("Unable to safely factor shape.");
-  }
-  if (grid_y > grid_x) {
-    std::swap(grid_x, grid_y);
-  }
-  if (divisor > 1) {
-    grid_x = ((grid_x + divisor - 1) / divisor) * divisor;
-  }
-  return std::make_tuple(
-      static_cast<uint32_t>(grid_x), static_cast<uint32_t>(grid_y), 1);
-}
-
-std::pair<Dims, Dims> get_grid_and_block_common(int dim0, int dim1, int dim2) {
-  auto [bx, by, bz] = get_block_dims_common(dim0, dim1, dim2);
-  auto gx = (dim0 + bx - 1) / bx;
-  auto gy = (dim1 + by - 1) / by;
-  auto gz = (dim2 + bz - 1) / bz;
-
-  return std::make_pair(
-      std::make_tuple(gx, gy, gz), std::make_tuple(bx, by, bz));
-}
-
 } // namespace mlx::core
--- a/mlx/backend/common/utils.h
+++ b/mlx/backend/common/utils.h
@@ -2,17 +2,12 @@

 #pragma once

-#include <filesystem>
-#include <tuple>
 #include <vector>

 #include "mlx/array.h"

 namespace mlx::core {

-// Return the directory that contains current shared library.
-std::filesystem::path current_binary_dir();
-
 inline int64_t
 elem_to_loc(int elem, const Shape& shape, const Strides& strides) {
  int64_t loc = 0;
@@ -75,31 +70,6 @@ std::pair<Shape, Strides> collapse_contiguous_dims(
    const array& a,
    int64_t size_cap = std::numeric_limits<int32_t>::max());

-// Compute the thread block dimensions which fit the given
-// input dimensions.
-// - The thread block dimensions will be powers of two
-// - The thread block size will be less than 2^pow2
-using Dims = std::tuple<uint32_t, uint32_t, uint32_t>;
-Dims get_block_dims_common(int dim0, int dim1, int dim2, int pow2 = 10);
-
-// Computes a 2D grid where each element is < UINT_MAX
-// Assumes:
-// - overall size (product of non-broadcasted dimensions) is < UINT_MAX^2
-// - shape and strides correspond to a contiguous (no holes) but
-//   possibly broadcasted array
-Dims get_2d_grid_dims_common(const Shape& shape, const Strides& strides);
-
-// Same as above but we do an implicit division with divisor.
-// Basically, equivalent to factorizing
-//    Prod(s \forall s in shape if strides[s] > 0) / divisor.
-Dims get_2d_grid_dims_common(
-    const Shape& shape,
-    const Strides& strides,
-    size_t divisor);
-
-// Get both the block and a grid of blocks that covers dim0, dim1 and dim2.
-std::pair<Dims, Dims> get_grid_and_block_common(int dim0, int dim1, int dim2);
-
 struct ContiguousIterator {
  inline void step() {
    int dims = shape_.size();
@@ -162,6 +132,11 @@ struct ContiguousIterator {
 };

 inline auto check_contiguity(const Shape& shape, const Strides& strides) {
+  // Conditions for {row/col}_contiguous
+  // - cumulative product of shapes is equal to the strides (we can ignore axes
+  //   with size == 1)
+  //   - in the forward direction (column contiguous)
+  //   - in the reverse direction (row contiguous)
  size_t no_broadcast_data_size = 1;
  int64_t f_stride = 1;
  int64_t b_stride = 1;
@@ -195,11 +170,4 @@ void shared_buffer_reshape(
    const array& in,
    const Strides& out_strides,
    array& out);
-
-template <typename T>
-inline std::vector<T> remove_index(std::vector<T> vec, size_t index) {
-  vec.erase(std::next(vec.begin(), index));
-  return vec;
-}
-
 } // namespace mlx::core
--- a/mlx/backend/cpu/CMakeLists.txt
+++ b/mlx/backend/cpu/CMakeLists.txt
@@ -46,7 +46,6 @@ target_sources(
          ${CMAKE_CURRENT_SOURCE_DIR}/conv.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/copy.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/distributed.cpp
-          ${CMAKE_CURRENT_SOURCE_DIR}/eig.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/eigh.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/encoder.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/fft.cpp
--- a/mlx/backend/cpu/arg_reduce.cpp
+++ b/mlx/backend/cpu/arg_reduce.cpp
@@ -14,8 +14,10 @@ template <typename InT, typename OpT>
 void arg_reduce(const array& in, array& out, const OpT& op, int axis) {
  auto axis_size = in.shape()[axis];
  auto axis_stride = in.strides()[axis];
-  Strides strides = remove_index(in.strides(), axis);
-  Shape shape = remove_index(in.shape(), axis);
+  Strides strides = in.strides();
+  Shape shape = in.shape();
+  strides.erase(strides.begin() + axis);
+  shape.erase(shape.begin() + axis);
  auto in_ptr = in.data<InT>();
  auto out_ptr = out.data<uint32_t>();

--- a/mlx/backend/cpu/cholesky.cpp
+++ b/mlx/backend/cpu/cholesky.cpp
@@ -20,7 +20,7 @@ void cholesky_impl(const array& a, array& factor, bool upper, Stream stream) {

  // The decomposition is computed in place, so just copy the input to the
  // output.
-  copy_cpu(
+  copy(
      a,
      factor,
      a.flags().row_contiguous ? CopyType::Vector : CopyType::General,
--- a/mlx/backend/cpu/compiled.cpp
+++ b/mlx/backend/cpu/compiled.cpp
@@ -146,9 +146,18 @@ inline void build_kernel(
    const std::vector<array>& inputs,
    const std::vector<array>& outputs,
    const std::vector<array>& tape,
-    const std::function<bool(size_t)>& is_constant,
+    const std::unordered_set<uintptr_t>& constant_ids,
    bool contiguous,
    int ndim) {
+  // All outputs should have the exact same shape and will be row contiguous
+  auto output_shape = outputs[0].shape();
+  auto output_strides = outputs[0].strides();
+
+  // Constants are scalars that are captured by value and cannot change
+  auto is_constant = [&constant_ids](const array& x) {
+    return constant_ids.find(x.id()) != constant_ids.end();
+  };
+
  NodeNamer namer;

 #ifdef _MSC_VER
@@ -161,15 +170,14 @@ inline void build_kernel(

  // Add the input arguments
  int cnt = 0;
-  for (size_t i = 0; i < inputs.size(); ++i) {
+  for (auto& x : inputs) {
+    auto& xname = namer.get_name(x);
+
    // Skip constants from the input list
-    if (is_constant(i)) {
+    if (is_constant(x)) {
      continue;
    }

-    const auto& x = inputs[i];
-    auto& xname = namer.get_name(x);
-
    auto tstr = get_type_string(x.dtype());
    os << "  " << tstr << "* " << xname << " = (" << tstr << "*)args[" << cnt++
       << "];" << std::endl;
@@ -203,11 +211,10 @@ inline void build_kernel(
  }

  // Read the inputs in tmps
-  for (size_t i = 0; i < inputs.size(); ++i) {
-    const auto& x = inputs[i];
+  for (auto& x : inputs) {
    auto& xname = namer.get_name(x);

-    if (is_constant(i)) {
+    if (is_constant(x)) {
      os << "  " << get_type_string(x.dtype()) << " tmp_" << xname << " = ";
      print_constant(os, x);
      os << ";" << std::endl;
@@ -231,7 +238,7 @@ inline void build_kernel(
      os << "static_cast<" << get_type_string(x.dtype()) << ">(tmp_"
         << namer.get_name(x.inputs()[0]) << ");" << std::endl;
    } else {
-      os << x.primitive().name();
+      x.primitive().print(os);
      os << "()(";
      for (int i = 0; i < x.inputs().size() - 1; i++) {
        os << "tmp_" << namer.get_name(x.inputs()[i]) << ", ";
@@ -257,9 +264,8 @@ inline void build_kernel(
  } else {
    for (int d = ndim - 1; d >= 0; --d) {
      // Update pointers
-      for (size_t i = 0; i < inputs.size(); ++i) {
-        const auto& x = inputs[i];
-        if (is_constant(i) || is_scalar(x)) {
+      for (auto& x : inputs) {
+        if (is_constant(x) || is_scalar(x)) {
          continue;
        }
        auto& xname = namer.get_name(x);
@@ -281,37 +287,65 @@ inline void build_kernel(
 void Compiled::eval_cpu(
    const std::vector<array>& inputs,
    std::vector<array>& outputs) {
+  if (kernel_lib_.empty()) {
+    kernel_lib_ = build_lib_name(inputs_, outputs_, tape_, constant_ids_);
+  }
+
+  // Figure out which kernel we are using
+  auto& shape = outputs[0].shape();
+  auto contiguous = compiled_check_contiguity(inputs, shape);
  auto& encoder = cpu::get_command_encoder(stream());

-  // Collapse contiguous dims to route to a faster kernel if possible. Also
-  // handle all broadcasting.
-  auto [contiguous, shape, strides] =
-      compiled_collapse_contiguous_dims(inputs, outputs[0], is_constant_);
-
-  // Collect function input arguments.
+  // Handle all broadcasting and collect function input arguments
  std::vector<void*> args;
-  int strides_index = 1;
-  for (size_t i = 0; i < inputs.size(); ++i) {
-    if (is_constant_(i)) {
+  std::vector<std::vector<size_t>> strides;
+  for (int i = 0; i < inputs.size(); i++) {
+    // Skip constants.
+    if (constant_ids_.find(inputs_[i].id()) != constant_ids_.end()) {
      continue;
    }
-    const auto& x = inputs[i];
+    auto& x = inputs[i];
    encoder.set_input_array(x);
    args.push_back((void*)x.data<void>());
-    if (!contiguous && !is_scalar(x)) {
-      args.push_back(strides[strides_index++].data());
+
+    if (contiguous || is_scalar(x)) {
+      continue;
    }
+
+    // Broadcast the input to the output shape.
+    std::vector<size_t> xstrides;
+    int j = 0;
+    for (; j < shape.size() - x.ndim(); j++) {
+      if (shape[j] == 1) {
+        xstrides.push_back(outputs[0].strides()[j]);
+      } else {
+        xstrides.push_back(0);
+      }
+    }
+    for (int i = 0; i < x.ndim(); i++, j++) {
+      if (x.shape(i) == 1) {
+        if (shape[j] == 1) {
+          xstrides.push_back(outputs[0].strides()[j]);
+        } else {
+          xstrides.push_back(0);
+        }
+      } else {
+        xstrides.push_back(x.strides()[i]);
+      }
+    }
+    strides.push_back(std::move(xstrides));
+    args.push_back(strides.back().data());
  }

  // Get the kernel name from the lib
  int ndim = shape.size();
  auto kernel_name = kernel_lib_ + (contiguous ? "_contiguous" : "_strided_");
  if (!contiguous) {
-    kernel_name += std::to_string(ndim);
+    kernel_name += std::to_string(shape.size());
  }

  // Get the function
-  auto fn_ptr = compile(kernel_name, [&, contiguous = contiguous]() {
+  auto fn_ptr = compile(kernel_name, [&]() {
    std::ostringstream kernel;
    kernel << get_kernel_preamble() << std::endl;
    kernel << "extern \"C\"  {" << std::endl;
@@ -321,7 +355,7 @@ void Compiled::eval_cpu(
        inputs_,
        outputs_,
        tape_,
-        is_constant_,
+        constant_ids_,
        contiguous,
        ndim);
    // Close extern "C"
@@ -329,22 +363,26 @@ void Compiled::eval_cpu(
    return kernel.str();
  });

-  compiled_allocate_outputs(inputs, outputs, is_constant_, contiguous);
+  compiled_allocate_outputs(
+      inputs, outputs, inputs_, constant_ids_, contiguous);

  for (auto& x : outputs) {
    args.push_back(x.data<void>());
    encoder.set_output_array(x);
  }
+  Shape out_shape;
  if (!contiguous) {
-    args.push_back((void*)shape.data());
+    out_shape = outputs[0].shape();
+    args.push_back((void*)out_shape.data());
  } else {
    args.push_back((void*)outputs[0].data_size());
  }
  auto fun = (void (*)(void**))fn_ptr;
-  encoder.dispatch([fun,
-                    args = std::move(args),
-                    strides = std::move(strides),
-                    shape = std::move(shape)]() mutable { fun(args.data()); });
+  encoder.dispatch(
+      [fun,
+       args = std::move(args),
+       strides = std::move(strides),
+       out_shape = std::move(out_shape)]() mutable { fun(args.data()); });
 }

 } // namespace mlx::core
--- a/mlx/backend/cpu/conv.cpp
+++ b/mlx/backend/cpu/conv.cpp
--- a/mlx/backend/cpu/copy.cpp
+++ b/mlx/backend/cpu/copy.cpp
@@ -295,11 +295,7 @@ inline void copy_inplace_dispatch(

 } // namespace

-void copy_cpu_inplace(
-    const array& src,
-    array& dst,
-    CopyType ctype,
-    Stream stream) {
+void copy_inplace(const array& src, array& dst, CopyType ctype, Stream stream) {
  auto& encoder = cpu::get_command_encoder(stream);
  encoder.set_input_array(src);
  encoder.set_output_array(dst);
@@ -309,7 +305,7 @@ void copy_cpu_inplace(
       ctype]() mutable { copy_inplace_dispatch(src, dst, ctype); });
 }

-void copy_cpu(const array& src, array& dst, CopyType ctype, Stream stream) {
+void copy(const array& src, array& dst, CopyType ctype, Stream stream) {
  bool donated = set_copy_output_data(src, dst, ctype);
  if (donated && src.dtype() == dst.dtype()) {
    // If the output has the same type as the input then there is nothing to
@@ -319,10 +315,10 @@ void copy_cpu(const array& src, array& dst, CopyType ctype, Stream stream) {
  if (ctype == CopyType::GeneralGeneral) {
    ctype = CopyType::General;
  }
-  copy_cpu_inplace(src, dst, ctype, stream);
+  copy_inplace(src, dst, ctype, stream);
 }

-void copy_cpu_inplace(
+void copy_inplace(
    const array& src,
    array& dst,
    const Shape& data_shape,
--- a/mlx/backend/cpu/copy.h
+++ b/mlx/backend/cpu/copy.h
@@ -10,14 +10,10 @@

 namespace mlx::core {

-void copy_cpu(const array& src, array& dst, CopyType ctype, Stream stream);
-void copy_cpu_inplace(
-    const array& src,
-    array& dst,
-    CopyType ctype,
-    Stream stream);
+void copy(const array& src, array& dst, CopyType ctype, Stream stream);
+void copy_inplace(const array& src, array& dst, CopyType ctype, Stream stream);

-void copy_cpu_inplace(
+void copy_inplace(
    const array& src,
    array& dst,
    const Shape& data_shape,
--- a/mlx/backend/cpu/distributed.cpp
+++ b/mlx/backend/cpu/distributed.cpp
@@ -14,7 +14,7 @@ std::pair<array, bool> ensure_row_contiguous(const array& arr, Stream stream) {
    return {arr, false};
  } else {
    array arr_copy(arr.shape(), arr.dtype(), nullptr, {});
-    copy_cpu(arr, arr_copy, CopyType::General, stream);
+    copy(arr, arr_copy, CopyType::General, stream);
    return {arr_copy, true};
  }
 };
@@ -35,7 +35,7 @@ void AllReduce::eval_cpu(
      return in;
    } else {
      array arr_copy(in.shape(), in.dtype(), nullptr, {});
-      copy_cpu(in, arr_copy, CopyType::General, s);
+      copy(in, arr_copy, CopyType::General, s);
      out.copy_shared_buffer(arr_copy);
      return arr_copy;
    }
--- a/mlx/backend/cpu/eig.cpp
+++ b/mlx/backend/cpu/eig.cpp
@@ -1,174 +0,0 @@
-// Copyright © 2025 Apple Inc.
-
-#include "mlx/allocator.h"
-#include "mlx/array.h"
-#include "mlx/backend/cpu/copy.h"
-#include "mlx/backend/cpu/encoder.h"
-#include "mlx/backend/cpu/lapack.h"
-#include "mlx/linalg.h"
-#include "mlx/primitives.h"
-
-namespace mlx::core {
-
-namespace {
-
-template <typename T>
-void eig_impl(
-    array& a,
-    array& vectors,
-    array& values,
-    bool compute_eigenvectors,
-    Stream stream) {
-  using OT = std::complex<T>;
-  auto a_ptr = a.data<T>();
-  auto eig_ptr = values.data<OT>();
-
-  auto& encoder = cpu::get_command_encoder(stream);
-  encoder.set_input_array(a);
-  encoder.set_output_array(values);
-  OT* vec_ptr = nullptr;
-  if (compute_eigenvectors) {
-    encoder.set_output_array(vectors);
-    vec_ptr = vectors.data<OT>();
-  }
-  encoder.dispatch([a_ptr,
-                    vec_ptr,
-                    eig_ptr,
-                    compute_eigenvectors,
-                    N = vectors.shape(-1),
-                    size = vectors.size()]() mutable {
-    // Work query
-    char jobr = 'N';
-    char jobl = compute_eigenvectors ? 'V' : 'N';
-    int n_vecs_r = 1;
-    int n_vecs_l = compute_eigenvectors ? N : 1;
-    int lwork = -1;
-    int info;
-    {
-      T work;
-      int iwork;
-      geev<T>(
-          &jobl,
-          &jobr,
-          &N,
-          nullptr,
-          &N,
-          nullptr,
-          nullptr,
-          nullptr,
-          &n_vecs_l,
-          nullptr,
-          &n_vecs_r,
-          &work,
-          &lwork,
-          &info);
-      lwork = static_cast<int>(work);
-    }
-
-    auto eig_tmp_data = array::Data{allocator::malloc(sizeof(T) * N * 2)};
-    auto vec_tmp_data =
-        array::Data{allocator::malloc(vec_ptr ? sizeof(T) * N * N * 2 : 0)};
-    auto eig_tmp = static_cast<T*>(eig_tmp_data.buffer.raw_ptr());
-    auto vec_tmp = static_cast<T*>(vec_tmp_data.buffer.raw_ptr());
-    auto work_buf = array::Data{allocator::malloc(sizeof(T) * lwork)};
-    for (size_t i = 0; i < size / (N * N); ++i) {
-      geev<T>(
-          &jobl,
-          &jobr,
-          &N,
-          a_ptr,
-          &N,
-          eig_tmp,
-          eig_tmp + N,
-          vec_tmp,
-          &n_vecs_l,
-          nullptr,
-          &n_vecs_r,
-          static_cast<T*>(work_buf.buffer.raw_ptr()),
-          &lwork,
-          &info);
-      for (int i = 0; i < N; ++i) {
-        eig_ptr[i] = {eig_tmp[i], eig_tmp[N + i]};
-      }
-      if (vec_ptr) {
-        for (int i = 0; i < N; ++i) {
-          if (eig_ptr[i].imag() != 0) {
-            // This vector and the next are a pair
-            for (int j = 0; j < N; ++j) {
-              vec_ptr[i * N + j] = {
-                  vec_tmp[i * N + j], -vec_tmp[(i + 1) * N + j]};
-              vec_ptr[(i + 1) * N + j] = {
-                  vec_tmp[i * N + j], vec_tmp[(i + 1) * N + j]};
-            }
-            i += 1;
-          } else {
-            for (int j = 0; j < N; ++j) {
-              vec_ptr[i * N + j] = {vec_tmp[i * N + j], 0};
-            }
-          }
-        }
-        vec_ptr += N * N;
-      }
-      a_ptr += N * N;
-      eig_ptr += N;
-      if (info != 0) {
-        std::stringstream msg;
-        msg << "[Eig::eval_cpu] Eigenvalue decomposition failed with error code "
-            << info;
-        throw std::runtime_error(msg.str());
-      }
-    }
-  });
-  encoder.add_temporary(a);
-}
-
-} // namespace
-
-void Eig::eval_cpu(
-    const std::vector<array>& inputs,
-    std::vector<array>& outputs) {
-  const auto& a = inputs[0];
-  auto& values = outputs[0];
-
-  auto vectors = compute_eigenvectors_
-      ? outputs[1]
-      : array(a.shape(), complex64, nullptr, {});
-
-  auto a_copy = array(a.shape(), a.dtype(), nullptr, {});
-  copy_cpu(
-      a,
-      a_copy,
-      a.flags().row_contiguous ? CopyType::Vector : CopyType::General,
-      stream());
-
-  values.set_data(allocator::malloc(values.nbytes()));
-
-  if (compute_eigenvectors_) {
-    // Set the strides and flags so the eigenvectors
-    // are in the columns of the output
-    auto flags = vectors.flags();
-    auto strides = vectors.strides();
-    auto ndim = a.ndim();
-    std::swap(strides[ndim - 1], strides[ndim - 2]);
-
-    if (a.size() > 1) {
-      flags.row_contiguous = false;
-      if (ndim > 2) {
-        flags.col_contiguous = false;
-      } else {
-        flags.col_contiguous = true;
-      }
-    }
-    vectors.set_data(
-        allocator::malloc(vectors.nbytes()), vectors.size(), strides, flags);
-  }
-  switch (a.dtype()) {
-    case float32:
-      eig_impl<float>(a_copy, vectors, values, compute_eigenvectors_, stream());
-      break;
-    default:
-      throw std::runtime_error("[Eig::eval_cpu] only supports float32.");
-  }
-}
-
-} // namespace mlx::core
--- a/mlx/backend/cpu/eigh.cpp
+++ b/mlx/backend/cpu/eigh.cpp
@@ -12,133 +12,6 @@ namespace mlx::core {

 namespace {

-template <typename T, class Enable = void>
-struct EighWork {};
-
-template <typename T>
-struct EighWork<
-    T,
-    typename std::enable_if<std::is_floating_point<T>::value>::type> {
-  using R = T;
-
-  char jobz;
-  char uplo;
-  int N;
-  int lwork;
-  int liwork;
-  int info;
-  std::vector<array::Data> buffers;
-
-  EighWork(char jobz_, char uplo_, int N_)
-      : jobz(jobz_), uplo(uplo_), N(N_), lwork(-1), liwork(-1) {
-    T work;
-    int iwork;
-    syevd<T>(
-        &jobz,
-        &uplo,
-        &N,
-        nullptr,
-        &N,
-        nullptr,
-        &work,
-        &lwork,
-        &iwork,
-        &liwork,
-        &info);
-    lwork = static_cast<int>(work);
-    liwork = iwork;
-    buffers.emplace_back(allocator::malloc(sizeof(T) * lwork));
-    buffers.emplace_back(allocator::malloc(sizeof(int) * liwork));
-  }
-
-  void run(T* vectors, T* values) {
-    syevd<T>(
-        &jobz,
-        &uplo,
-        &N,
-        vectors,
-        &N,
-        values,
-        static_cast<T*>(buffers[0].buffer.raw_ptr()),
-        &lwork,
-        static_cast<int*>(buffers[1].buffer.raw_ptr()),
-        &liwork,
-        &info);
-  }
-};
-
-template <>
-struct EighWork<std::complex<float>> {
-  using T = std::complex<float>;
-  using R = float;
-
-  char jobz;
-  char uplo;
-  int N;
-  int lwork;
-  int lrwork;
-  int liwork;
-  int info;
-  std::vector<array::Data> buffers;
-
-  EighWork(char jobz_, char uplo_, int N_)
-      : jobz(jobz_), uplo(uplo_), N(N_), lwork(-1), lrwork(-1), liwork(-1) {
-    T work;
-    R rwork;
-    int iwork;
-    heevd<T>(
-        &jobz,
-        &uplo,
-        &N,
-        nullptr,
-        &N,
-        nullptr,
-        &work,
-        &lwork,
-        &rwork,
-        &lrwork,
-        &iwork,
-        &liwork,
-        &info);
-    lwork = static_cast<int>(work.real());
-    lrwork = static_cast<int>(rwork);
-    liwork = iwork;
-    buffers.emplace_back(allocator::malloc(sizeof(T) * lwork));
-    buffers.emplace_back(allocator::malloc(sizeof(R) * lrwork));
-    buffers.emplace_back(allocator::malloc(sizeof(int) * liwork));
-  }
-
-  void run(T* vectors, R* values) {
-    heevd<T>(
-        &jobz,
-        &uplo,
-        &N,
-        vectors,
-        &N,
-        values,
-        static_cast<T*>(buffers[0].buffer.raw_ptr()),
-        &lwork,
-        static_cast<R*>(buffers[1].buffer.raw_ptr()),
-        &lrwork,
-        static_cast<int*>(buffers[2].buffer.raw_ptr()),
-        &liwork,
-        &info);
-    if (jobz == 'V') {
-      // We have pre-transposed the vectors but we also must conjugate them
-      // when they are complex.
-      //
-      // We could vectorize this but it is so fast in comparison to heevd that
-      // it doesn't really matter.
-      for (int i = 0; i < N; i++) {
-        for (int j = 0; j < N; j++) {
-          *vectors = std::conj(*vectors);
-          vectors++;
-        }
-      }
-    }
-  }
-};
-
 template <typename T>
 void eigh_impl(
    array& vectors,
@@ -146,10 +19,8 @@ void eigh_impl(
    const std::string& uplo,
    bool compute_eigenvectors,
    Stream stream) {
-  using R = typename EighWork<T>::R;
-
  auto vec_ptr = vectors.data<T>();
-  auto eig_ptr = values.data<R>();
+  auto eig_ptr = values.data<T>();
  char jobz = compute_eigenvectors ? 'V' : 'N';

  auto& encoder = cpu::get_command_encoder(stream);
@@ -162,17 +33,49 @@ void eigh_impl(
                    N = vectors.shape(-1),
                    size = vectors.size()]() mutable {
    // Work query
-    EighWork<T> work(jobz, uplo, N);
+    int lwork = -1;
+    int liwork = -1;
+    int info;
+    {
+      T work;
+      int iwork;
+      syevd<T>(
+          &jobz,
+          &uplo,
+          &N,
+          nullptr,
+          &N,
+          nullptr,
+          &work,
+          &lwork,
+          &iwork,
+          &liwork,
+          &info);
+      lwork = static_cast<int>(work);
+      liwork = iwork;
+    }

-    // Work loop
+    auto work_buf = array::Data{allocator::malloc(sizeof(T) * lwork)};
+    auto iwork_buf = array::Data{allocator::malloc(sizeof(int) * liwork)};
    for (size_t i = 0; i < size / (N * N); ++i) {
-      work.run(vec_ptr, eig_ptr);
+      syevd<T>(
+          &jobz,
+          &uplo,
+          &N,
+          vec_ptr,
+          &N,
+          eig_ptr,
+          static_cast<T*>(work_buf.buffer.raw_ptr()),
+          &lwork,
+          static_cast<int*>(iwork_buf.buffer.raw_ptr()),
+          &liwork,
+          &info);
      vec_ptr += N * N;
      eig_ptr += N;
-      if (work.info != 0) {
+      if (info != 0) {
        std::stringstream msg;
        msg << "[Eigh::eval_cpu] Eigenvalue decomposition failed with error code "
-            << work.info;
+            << info;
        throw std::runtime_error(msg.str());
      }
    }
@@ -196,7 +99,7 @@ void Eigh::eval_cpu(

  values.set_data(allocator::malloc(values.nbytes()));

-  copy_cpu(
+  copy(
      a,
      vectors,
      a.flags().row_contiguous ? CopyType::Vector : CopyType::General,
@@ -228,10 +131,6 @@ void Eigh::eval_cpu(
      eigh_impl<double>(
          vectors, values, uplo_, compute_eigenvectors_, stream());
      break;
-    case complex64:
-      eigh_impl<std::complex<float>>(
-          vectors, values, uplo_, compute_eigenvectors_, stream());
-      break;
    default:
      throw std::runtime_error(
          "[Eigh::eval_cpu] only supports float32 or float64.");
--- a/mlx/backend/cpu/hadamard.cpp
+++ b/mlx/backend/cpu/hadamard.cpp
@@ -96,7 +96,7 @@ void Hadamard::eval_cpu(const std::vector<array>& inputs, array& out) {
  if (in.flags().row_contiguous && in.is_donatable()) {
    out.copy_shared_buffer(in);
  } else {
-    copy_cpu(
+    copy(
        in,
        out,
        in.flags().row_contiguous ? CopyType::Vector : CopyType::General,
--- a/mlx/backend/cpu/indexing.cpp
+++ b/mlx/backend/cpu/indexing.cpp
@@ -257,11 +257,15 @@ void gather_axis(
    const array& ind,
    array& out,
    const int axis) {
-  auto shape = remove_index(ind.shape(), axis);
-  ContiguousIterator ind_it(
-      shape, remove_index(ind.strides(), axis), src.ndim() - 1);
-  ContiguousIterator src_it(
-      shape, remove_index(src.strides(), axis), src.ndim() - 1);
+  auto strides = ind.strides();
+  strides.erase(strides.begin() + axis);
+  auto shape = ind.shape();
+  shape.erase(shape.begin() + axis);
+  ContiguousIterator ind_it(shape, strides, src.ndim() - 1);
+
+  strides = src.strides();
+  strides.erase(strides.begin() + axis);
+  ContiguousIterator src_it(shape, strides, src.ndim() - 1);

  auto ind_ptr = ind.data<IdxT>();
  auto src_ptr = src.data<T>();
@@ -517,7 +521,7 @@ void Scatter::eval_cpu(const std::vector<array>& inputs, array& out) {
  // Copy src into out (copy allocates memory for out)
  auto ctype =
      src.flags().row_contiguous ? CopyType::Vector : CopyType::General;
-  copy_cpu(src, out, ctype, stream());
+  copy(src, out, ctype, stream());

  auto& encoder = cpu::get_command_encoder(stream());
  std::vector<array> inds;
@@ -581,11 +585,15 @@ void Scatter::eval_cpu(const std::vector<array>& inputs, array& out) {

 template <typename T, typename IdxT, typename OpT>
 void scatter_axis(array& out, const array idx, const array& upd, int axis) {
-  auto shape = remove_index(idx.shape(), axis);
-  ContiguousIterator idx_it(
-      shape, remove_index(idx.strides(), axis), upd.ndim() - 1);
-  ContiguousIterator upd_it(
-      shape, remove_index(upd.strides(), axis), upd.ndim() - 1);
+  auto strides = idx.strides();
+  strides.erase(strides.begin() + axis);
+  auto shape = idx.shape();
+  shape.erase(shape.begin() + axis);
+  ContiguousIterator idx_it(shape, strides, upd.ndim() - 1);
+
+  strides = upd.strides();
+  strides.erase(strides.begin() + axis);
+  ContiguousIterator upd_it(shape, strides, upd.ndim() - 1);

  auto idx_ptr = idx.data<IdxT>();
  auto upd_ptr = upd.data<T>();
@@ -686,7 +694,7 @@ void ScatterAxis::eval_cpu(const std::vector<array>& inputs, array& out) {
  // Copy src into out (copy allocates memory for out)
  auto ctype =
      src.flags().row_contiguous ? CopyType::Vector : CopyType::General;
-  copy_cpu(src, out, ctype, stream());
+  copy(src, out, ctype, stream());

  auto& encoder = cpu::get_command_encoder(stream());
  encoder.set_input_array(idx);
--- a/mlx/backend/cpu/inverse.cpp
+++ b/mlx/backend/cpu/inverse.cpp
@@ -115,7 +115,7 @@ void inverse_impl(
  //   (A⁻¹)ᵀ = (Aᵀ)⁻¹

  // The inverse is computed in place, so just copy the input to the output.
-  copy_cpu(
+  copy(
      a,
      inv,
      a.flags().row_contiguous ? CopyType::Vector : CopyType::General,
--- a/mlx/backend/cpu/lapack.h
+++ b/mlx/backend/cpu/lapack.h
@@ -2,14 +2,14 @@

 #pragma once

+// Required for Visual Studio.
+// https://github.com/OpenMathLib/OpenBLAS/blob/develop/docs/install.md
+#ifdef _MSC_VER
 #include <complex>
 #define LAPACK_COMPLEX_CUSTOM
 #define lapack_complex_float std::complex<float>
 #define lapack_complex_double std::complex<double>
-#define lapack_complex_float_real(z) ((z).real())
-#define lapack_complex_float_imag(z) ((z).imag())
-#define lapack_complex_double_real(z) ((z).real())
-#define lapack_complex_double_imag(z) ((z).imag())
+#endif

 #ifdef MLX_USE_ACCELERATE
 #include <Accelerate/Accelerate.h>
@@ -32,7 +32,7 @@

 #endif

-#define INSTANTIATE_LAPACK_REAL(FUNC)                        \
+#define INSTANTIATE_LAPACK_TYPES(FUNC)                       \
  template <typename T, typename... Args>                    \
  void FUNC(Args... args) {                                  \
    if constexpr (std::is_same_v<T, float>) {                \
@@ -42,24 +42,11 @@
    }                                                        \
  }

-INSTANTIATE_LAPACK_REAL(geqrf)
-INSTANTIATE_LAPACK_REAL(orgqr)
-INSTANTIATE_LAPACK_REAL(syevd)
-INSTANTIATE_LAPACK_REAL(geev)
-INSTANTIATE_LAPACK_REAL(potrf)
-INSTANTIATE_LAPACK_REAL(gesvdx)
-INSTANTIATE_LAPACK_REAL(getrf)
-INSTANTIATE_LAPACK_REAL(getri)
-INSTANTIATE_LAPACK_REAL(trtri)
-
-#define INSTANTIATE_LAPACK_COMPLEX(FUNC)                            \
-  template <typename T, typename... Args>                           \
-  void FUNC(Args... args) {                                         \
-    if constexpr (std::is_same_v<T, std::complex<float>>) {         \
-      MLX_LAPACK_FUNC(c##FUNC)(std::forward<Args>(args)...);        \
-    } else if constexpr (std::is_same_v<T, std::complex<double>>) { \
-      MLX_LAPACK_FUNC(z##FUNC)(std::forward<Args>(args)...);        \
-    }                                                               \
-  }
-
-INSTANTIATE_LAPACK_COMPLEX(heevd)
+INSTANTIATE_LAPACK_TYPES(geqrf)
+INSTANTIATE_LAPACK_TYPES(orgqr)
+INSTANTIATE_LAPACK_TYPES(syevd)
+INSTANTIATE_LAPACK_TYPES(potrf)
+INSTANTIATE_LAPACK_TYPES(gesvdx)
+INSTANTIATE_LAPACK_TYPES(getrf)
+INSTANTIATE_LAPACK_TYPES(getri)
+INSTANTIATE_LAPACK_TYPES(trtri)
--- a/mlx/backend/cpu/logsumexp.cpp
+++ b/mlx/backend/cpu/logsumexp.cpp
@@ -88,7 +88,7 @@ void LogSumExp::eval_cpu(const std::vector<array>& inputs, array& out) {
      return x;
    } else {
      auto x_copy = array(x.shape(), x.dtype(), nullptr, {});
-      copy_cpu(x, x_copy, CopyType::General, s);
+      copy(x, x_copy, CopyType::General, s);
      encoder.add_temporary(x_copy);
      return x_copy;
    }
--- a/mlx/backend/cpu/luf.cpp
+++ b/mlx/backend/cpu/luf.cpp
@@ -31,7 +31,7 @@ void luf_impl(
  strides[ndim - 1] = M;
  strides[ndim - 2] = 1;
  lu.set_data(allocator::malloc(lu.nbytes()), lu.nbytes(), strides, flags);
-  copy_cpu_inplace(
+  copy_inplace(
      a,
      lu,
      a.shape(),
--- a/mlx/backend/cpu/masked_mm.cpp
+++ b/mlx/backend/cpu/masked_mm.cpp
@@ -6,7 +6,6 @@
 #include "mlx/backend/common/utils.h"
 #include "mlx/backend/cpu/copy.h"
 #include "mlx/backend/cpu/encoder.h"
-#include "mlx/backend/cpu/gemm.h"
 #include "mlx/backend/cpu/lapack.h"
 #include "mlx/primitives.h"

@@ -53,58 +52,6 @@ inline void mask_matrix(
  }
 }

-template <typename T>
-inline void segmented_mm(
-    const T* a,
-    const T* b,
-    const uint32_t* segments,
-    T* out,
-    bool a_transposed,
-    bool b_transposed,
-    size_t lda,
-    size_t ldb,
-    const Shape& a_shape,
-    const Strides& a_strides,
-    const Shape& b_shape,
-    const Strides& b_strides,
-    size_t num_segments,
-    const Shape& segments_shape,
-    const Strides& segments_strides) {
-  int ndim = a_shape.size();
-  Shape a_copy = a_shape;
-  Shape b_copy = b_shape;
-  int32_t M = a_copy[ndim - 2];
-  int32_t N = b_copy[ndim - 1];
-  for (int i = 0; i < num_segments; i++) {
-    uint32_t k_start =
-        segments[elem_to_loc(2 * i, segments_shape, segments_strides)];
-    uint32_t k_end =
-        segments[elem_to_loc(2 * i + 1, segments_shape, segments_strides)];
-    if (k_end <= k_start) {
-      std::fill_n(out + i * M * N, M * N, T(0));
-      continue;
-    }
-    a_copy[ndim - 1] = k_end - k_start;
-    b_copy[ndim - 2] = k_end - k_start;
-    matmul<T>(
-        a + k_start * a_strides[ndim - 1],
-        b + k_start * b_strides[ndim - 2],
-        out + i * M * N,
-        a_transposed,
-        b_transposed,
-        lda,
-        ldb,
-        N,
-        1.0,
-        0.0,
-        1,
-        a_copy,
-        a_strides,
-        b_copy,
-        b_strides);
-  }
-}
-
 } // namespace

 void BlockMaskedMM::eval_cpu(const std::vector<array>& inputs, array& out) {
@@ -124,20 +71,20 @@ void BlockMaskedMM::eval_cpu(const std::vector<array>& inputs, array& out) {
        if (!expand_all && stx == arr.shape(-1) && sty == 1) {
          if (do_copy) {
            array arr_copy(arr.shape(), arr.dtype(), nullptr, {});
-            copy_cpu(arr, arr_copy, CopyType::Vector, s);
+            copy(arr, arr_copy, CopyType::Vector, s);
            return std::make_tuple(false, stx, arr_copy, true);
          }
          return std::make_tuple(false, stx, arr, false);
        } else if (!expand_all && stx == 1 && sty == arr.shape(-2)) {
          if (do_copy) {
            array arr_copy(arr.shape(), arr.dtype(), nullptr, {});
-            copy_cpu(arr, arr_copy, CopyType::Vector, s);
+            copy(arr, arr_copy, CopyType::Vector, s);
            return std::make_tuple(true, sty, arr_copy, true);
          }
          return std::make_tuple(true, sty, arr, false);
        } else {
          array arr_copy(arr.shape(), arr.dtype(), nullptr, {});
-          copy_cpu(arr, arr_copy, CopyType::General, s);
+          copy(arr, arr_copy, CopyType::General, s);
          int64_t stx = arr.shape(-1);
          return std::make_tuple(false, stx, arr_copy, true);
        }
@@ -386,7 +333,7 @@ void GatherMM::eval_cpu(const std::vector<array>& inputs, array& out) {
      return std::make_tuple(true, sty, arr);
    } else {
      temps.push_back(array(arr.shape(), arr.dtype(), nullptr, {}));
-      copy_cpu(arr, temps.back(), CopyType::General, s);
+      copy(arr, temps.back(), CopyType::General, s);
      int64_t stx = arr.shape(-1);
      return std::make_tuple(false, stx, temps.back());
    }
@@ -490,121 +437,4 @@ void GatherMM::eval_cpu(const std::vector<array>& inputs, array& out) {
  encoder.add_temporaries(std::move(temps));
 }

-void SegmentedMM::eval_cpu(const std::vector<array>& inputs, array& out) {
-  out.set_data(allocator::malloc(out.nbytes()));
-
-  auto& s = stream();
-  auto& encoder = cpu::get_command_encoder(stream());
-  auto check_transpose = [&s, &encoder](const array& x) {
-    auto stx = x.strides()[x.ndim() - 2];
-    auto sty = x.strides()[x.ndim() - 1];
-    if (stx == x.shape(-1) && sty == 1) {
-      return std::make_tuple(false, stx, x);
-    } else if (stx == 1 && sty == x.shape(-2)) {
-      return std::make_tuple(true, sty, x);
-    } else {
-      array xc(x.shape(), x.dtype(), nullptr, {});
-      copy_cpu(x, xc, CopyType::General, s);
-      encoder.add_temporary(xc);
-      int64_t stx = x.shape(-1);
-      return std::make_tuple(false, stx, xc);
-    }
-  };
-
-  auto [a_transposed, lda, a] = check_transpose(inputs[0]);
-  auto [b_transposed, ldb, b] = check_transpose(inputs[1]);
-  auto& segments = inputs[2];
-
-  encoder.set_input_array(a);
-  encoder.set_input_array(b);
-  encoder.set_input_array(segments);
-  encoder.set_output_array(out);
-  encoder.dispatch([a = array::unsafe_weak_copy(a),
-                    b = array::unsafe_weak_copy(b),
-                    segments = array::unsafe_weak_copy(segments),
-                    out_ptr = out.data<void>(),
-                    a_transposed = a_transposed,
-                    b_transposed = b_transposed,
-                    lda = lda,
-                    ldb = ldb]() {
-    switch (a.dtype()) {
-      case float64:
-        segmented_mm<double>(
-            a.data<double>(),
-            b.data<double>(),
-            segments.data<uint32_t>(),
-            static_cast<double*>(out_ptr),
-            a_transposed,
-            b_transposed,
-            lda,
-            ldb,
-            a.shape(),
-            a.strides(),
-            b.shape(),
-            b.strides(),
-            segments.size() / 2,
-            segments.shape(),
-            segments.strides());
-        break;
-      case float32:
-        segmented_mm<float>(
-            a.data<float>(),
-            b.data<float>(),
-            segments.data<uint32_t>(),
-            static_cast<float*>(out_ptr),
-            a_transposed,
-            b_transposed,
-            lda,
-            ldb,
-            a.shape(),
-            a.strides(),
-            b.shape(),
-            b.strides(),
-            segments.size() / 2,
-            segments.shape(),
-            segments.strides());
-        break;
-      case float16:
-        segmented_mm<float16_t>(
-            a.data<float16_t>(),
-            b.data<float16_t>(),
-            segments.data<uint32_t>(),
-            static_cast<float16_t*>(out_ptr),
-            a_transposed,
-            b_transposed,
-            lda,
-            ldb,
-            a.shape(),
-            a.strides(),
-            b.shape(),
-            b.strides(),
-            segments.size() / 2,
-            segments.shape(),
-            segments.strides());
-        break;
-      case bfloat16:
-        segmented_mm<bfloat16_t>(
-            a.data<bfloat16_t>(),
-            b.data<bfloat16_t>(),
-            segments.data<uint32_t>(),
-            static_cast<bfloat16_t*>(out_ptr),
-            a_transposed,
-            b_transposed,
-            lda,
-            ldb,
-            a.shape(),
-            a.strides(),
-            b.shape(),
-            b.strides(),
-            segments.size() / 2,
-            segments.shape(),
-            segments.strides());
-        break;
-      default:
-        throw std::invalid_argument(
-            "Segmented mm supports only real float types.");
-    }
-  });
-}
-
 } // namespace mlx::core
--- a/mlx/backend/cpu/matmul.cpp
+++ b/mlx/backend/cpu/matmul.cpp
@@ -81,7 +81,7 @@ void matmul_general(
      return std::make_tuple(true, sty, arr);
    } else {
      temps.push_back(array(arr.shape(), arr.dtype(), nullptr, {}));
-      copy_cpu(arr, temps.back(), CopyType::General, stream);
+      copy(arr, temps.back(), CopyType::General, stream);
      stx = arr.shape(-1);
      return std::make_tuple(false, stx, temps.back());
    }
@@ -132,20 +132,14 @@ void AddMM::eval_cpu(const std::vector<array>& inputs, array& out) {
    throw std::runtime_error(
        "[AddMM::eval_cpu] Currently only supports float32.");
  }
-  if (out.size() == 0) {
-    out.set_data(allocator::malloc(out.nbytes()));
-    return;
-  }

  // Fill output with C
  auto& c = inputs[2];
  CopyType ctype = c.data_size() == 1
      ? CopyType::Scalar
      : (c.flags().row_contiguous ? CopyType::Vector : CopyType::General);
-  copy_cpu(c, out, ctype, stream());
-  if (inputs[0].shape(-1) == 0) {
-    return;
-  }
+  copy(c, out, ctype, stream());
+
  matmul_general(inputs[0], inputs[1], out, stream(), alpha_, beta_);
 }

--- a/mlx/backend/cpu/primitives.cpp
+++ b/mlx/backend/cpu/primitives.cpp
@@ -22,7 +22,7 @@ void reshape(const array& in, array& out) {
  auto [copy_necessary, out_strides] = prepare_reshape(in, out);
  if (copy_necessary) {
    out.set_data(allocator::malloc(out.nbytes()));
-    copy_cpu_inplace(in, out, CopyType::General, out.primitive().stream());
+    copy_inplace(in, out, CopyType::General, out.primitive().stream());
  } else {
    shared_buffer_reshape(in, out_strides, out);
  }
@@ -175,7 +175,7 @@ void AsType::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 1);
  auto& in = inputs[0];
  CopyType ctype = in.flags().contiguous ? CopyType::Vector : CopyType::General;
-  copy_cpu(in, out, ctype, stream());
+  copy(in, out, ctype, stream());
 }

 void Concatenate::eval_cpu(const std::vector<array>& inputs, array& out) {
@@ -198,7 +198,7 @@ void Concatenate::eval_cpu(const std::vector<array>& inputs, array& out) {
    size_t data_offset = strides[axis_] * sizes[i];
    out_slice.copy_shared_buffer(
        out, strides, flags, out_slice.size(), data_offset);
-    copy_cpu_inplace(inputs[i], out_slice, CopyType::GeneralGeneral, stream());
+    copy_inplace(inputs[i], out_slice, CopyType::GeneralGeneral, stream());
  }
 }

@@ -211,7 +211,7 @@ void Contiguous::eval_cpu(const std::vector<array>& inputs, array& out) {
       (allow_col_major_ && in.flags().col_contiguous))) {
    out.copy_shared_buffer(in);
  } else {
-    copy_cpu(in, out, CopyType::General, stream());
+    copy(in, out, CopyType::General, stream());
  }
 }

@@ -235,7 +235,7 @@ void Full::eval_cpu(const std::vector<array>& inputs, array& out) {
  } else {
    ctype = CopyType::General;
  }
-  copy_cpu(in, out, ctype, stream());
+  copy(in, out, ctype, stream());
 }

 void Pad::eval_cpu(const std::vector<array>& inputs, array& out) {
@@ -251,7 +251,7 @@ void Pad::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(val.dtype() == in.dtype() && in.dtype() == out.dtype());

  // Fill output with val
-  copy_cpu(val, out, CopyType::Scalar, stream());
+  copy(val, out, CopyType::Scalar, stream());

  // Find offset for start of input values
  size_t data_offset = 0;
@@ -266,7 +266,7 @@ void Pad::eval_cpu(const std::vector<array>& inputs, array& out) {
      out, out.strides(), out.flags(), out_slice.size(), data_offset);

  // Copy input values into the slice
-  copy_cpu_inplace(in, out_slice, CopyType::GeneralGeneral, stream());
+  copy_inplace(in, out_slice, CopyType::GeneralGeneral, stream());
 }

 void RandomBits::eval_cpu(const std::vector<array>& inputs, array& out) {
@@ -340,7 +340,7 @@ void DynamicSlice::eval_cpu(const std::vector<array>& inputs, array& out) {
  out.set_data(allocator::malloc(out.nbytes()));
  auto [in_offset, donated] =
      compute_dynamic_offset(inputs[1], in.strides(), axes_, stream());
-  copy_cpu_inplace(
+  copy_inplace(
      /* const array& src = */ in,
      /* array& dst = */ out,
      /* const Shape& data_shape = */ out.shape(),
@@ -372,11 +372,11 @@ void DynamicSliceUpdate::eval_cpu(
  auto ctype = in.flags().contiguous && in.size() == in.data_size()
      ? CopyType::Vector
      : CopyType::General;
-  copy_cpu(in, out, in.data_size() == 1 ? CopyType::Scalar : ctype, stream());
+  copy(in, out, in.data_size() == 1 ? CopyType::Scalar : ctype, stream());

  auto [out_offset, donated] =
      compute_dynamic_offset(inputs[2], out.strides(), axes_, stream());
-  copy_cpu_inplace(
+  copy_inplace(
      /* const array& src = */ upd,
      /* array& dst = */ out,
      /* const std::vector<int>& data_shape = */ upd.shape(),
@@ -412,14 +412,14 @@ void SliceUpdate::eval_cpu(const std::vector<array>& inputs, array& out) {
  auto ctype = in.flags().contiguous && in.size() == in.data_size()
      ? CopyType::Vector
      : CopyType::General;
-  copy_cpu(in, out, in.data_size() == 1 ? CopyType::Scalar : ctype, stream());
+  copy(in, out, in.data_size() == 1 ? CopyType::Scalar : ctype, stream());

  // Calculate out strides, initial offset and if copy needs to be made
  auto [data_offset, out_strides] =
      prepare_slice(out, start_indices_, strides_);

  // Do copy
-  copy_cpu_inplace(
+  copy_inplace(
      /* const array& src = */ upd,
      /* array& dst = */ out,
      /* const std::vector<int>& data_shape = */ upd.shape(),
@@ -456,9 +456,9 @@ void View::eval_cpu(const std::vector<array>& inputs, array& out) {
    if (in.dtype() == bool_) {
      auto in_tmp = array(in.shape(), uint8, nullptr, {});
      in_tmp.copy_shared_buffer(in);
-      copy_cpu_inplace(in_tmp, tmp, CopyType::General, stream());
+      copy_inplace(in_tmp, tmp, CopyType::General, stream());
    } else {
-      copy_cpu_inplace(in, tmp, CopyType::General, stream());
+      copy_inplace(in, tmp, CopyType::General, stream());
    }

    auto flags = out.flags();
--- a/mlx/backend/cpu/qrf.cpp
+++ b/mlx/backend/cpu/qrf.cpp
@@ -26,7 +26,7 @@ void qrf_impl(const array& a, array& q, array& r, Stream stream) {
  strides[in.ndim() - 2] = 1;
  strides[in.ndim() - 1] = M;
  in.set_data(allocator::malloc(in.nbytes()), in.nbytes(), strides, flags);
-  copy_cpu_inplace(a, in, CopyType::GeneralGeneral, stream);
+  copy_inplace(a, in, CopyType::GeneralGeneral, stream);
  auto& encoder = cpu::get_command_encoder(stream);
  q.set_data(allocator::malloc(q.nbytes()));
  r.set_data(allocator::malloc(r.nbytes()));
--- a/mlx/backend/cpu/quantized.cpp
+++ b/mlx/backend/cpu/quantized.cpp
@@ -13,18 +13,9 @@ namespace mlx::core {

 namespace {

-inline constexpr short get_pack_factor(int bits, int wsize = 8) {
-  return (bits == 3 || bits == 5) ? 8 : (bits == 6 ? 4 : wsize / bits);
-}
-
-inline constexpr short get_bytes_per_pack(int bits, int wsize = 8) {
-  auto power_of_2_bits = (bits & (bits - 1)) == 0;
-  return power_of_2_bits ? (wsize / 8) : (bits == 5 ? 5 : 3);
-}
-
 template <typename T, int bits>
 void extract_bits(const uint8_t* w_in, T* w_out) {
-  static_assert(bits == 3 || bits == 5 || bits == 6);
+  assert(bits == 3 || bits == 6);
  if (bits == 3) {
    w_out[0] = static_cast<T>(w_in[0] & 0x7);
    w_out[1] = static_cast<T>((w_in[0] & 0x38) >> 3);
@@ -34,16 +25,6 @@ void extract_bits(const uint8_t* w_in, T* w_out) {
    w_out[5] = static_cast<T>(((w_in[1] & 0x80) >> 7) + ((w_in[2] & 0x3) << 1));
    w_out[6] = static_cast<T>((w_in[2] & 0x1c) >> 2);
    w_out[7] = static_cast<T>((w_in[2] & 0xe0) >> 5);
-  } else if (bits == 5) {
-    w_out[0] = static_cast<T>(w_in[0] & 0x1f);
-    w_out[1] = static_cast<T>(((w_in[0] & 0xe0) >> 5) + ((w_in[1] & 0x3) << 3));
-    w_out[2] = static_cast<T>((w_in[1] & 0x7c) >> 2);
-    w_out[3] = static_cast<T>(((w_in[1] & 0x80) >> 7) + ((w_in[2] & 0xf) << 1));
-    w_out[4] = static_cast<T>(((w_in[2] & 0xf0) >> 4) + ((w_in[3] & 0x1) << 4));
-    w_out[5] = static_cast<T>((w_in[3] & 0x3e) >> 1);
-    w_out[6] = static_cast<T>(((w_in[3] & 0xc0) >> 6) + ((w_in[4] & 0x7) << 2));
-    w_out[7] = static_cast<T>((w_in[4] & 0xf8) >> 3);
-
  } else if (bits == 6) {
    w_out[0] = static_cast<T>(w_in[0] & 0x3f);
    w_out[1] =
@@ -65,8 +46,8 @@ void _qmm(
    int N,
    int K) {
  constexpr int bitmask = (1 << bits) - 1;
-  constexpr int pack_factor = get_pack_factor(bits, 8);
-  constexpr int bytes_per_pack = get_bytes_per_pack(bits);
+  constexpr int pack_factor = bits == 3 ? 8 : bits == 6 ? 4 : 8 / bits;
+  constexpr int bytes_per_pack = (bits == 3 || bits == 6) ? 3 : 1;
  constexpr int packs_in_group = group_size / pack_factor;

  for (int m = 0; m < M; m++) {
@@ -84,7 +65,7 @@ void _qmm(
        T scale = *scales_local++;
        T bias = *biases_local++;
        for (int ng = 0; ng < packs_in_group; ng++) {
-          if constexpr (bits == 3 || bits == 5 || bits == 6) {
+          if (bits == 3 || bits == 6) {
            T wl[pack_factor];
            extract_bits<T, bits>(w_local, wl);
 #pragma clang loop unroll(full)
@@ -123,9 +104,8 @@ void _qmm_t(
    int N,
    int K) {
  constexpr int bitmask = (1 << bits) - 1;
-
-  constexpr int pack_factor = get_pack_factor(bits, 8);
-  constexpr int bytes_per_pack = get_bytes_per_pack(bits);
+  constexpr int pack_factor = bits == 3 ? 8 : bits == 6 ? 4 : 8 / bits;
+  constexpr int bytes_per_pack = (bits == 3 || bits == 6) ? 3 : 1;
  constexpr int packs_in_group = group_size / pack_factor;

  for (int m = 0; m < M; m++) {
@@ -141,7 +121,7 @@ void _qmm_t(
        T bias = *biases_local++;

        for (int kw = 0; kw < packs_in_group; kw++) {
-          if constexpr (bits == 3 || bits == 5 || bits == 6) {
+          if (bits == 3 || bits == 6) {
            T wl[pack_factor];
            extract_bits<T, bits>(w_local, wl);
 #pragma clang loop unroll(full)
@@ -324,10 +304,6 @@ void _qmm_dispatch_typed(
      _qmm_dispatch_group<T, 4>(
          result, x, w, scales, biases, M, N, K, group_size, transposed_w);
      break;
-    case 5:
-      _qmm_dispatch_group<T, 5>(
-          result, x, w, scales, biases, M, N, K, group_size, transposed_w);
-      break;
    case 6:
      _qmm_dispatch_group<T, 6>(
          result, x, w, scales, biases, M, N, K, group_size, transposed_w);
@@ -529,7 +505,7 @@ void QuantizedMatmul::eval_cpu(const std::vector<array>& inputs, array& out) {
      return arr;
    } else {
      temps.push_back(array(arr.shape(), arr.dtype(), nullptr, {}));
-      copy_cpu(arr, temps.back(), CopyType::General, s);
+      copy(arr, temps.back(), CopyType::General, s);
      return temps.back();
    }
  };
@@ -579,7 +555,7 @@ void GatherQMM::eval_cpu(const std::vector<array>& inputs, array& out) {
      return arr;
    } else {
      temps.push_back(array(arr.shape(), arr.dtype(), nullptr, {}));
-      copy_cpu(arr, temps.back(), CopyType::General, s);
+      copy(arr, temps.back(), CopyType::General, s);
      return temps.back();
    }
  };
@@ -637,8 +613,9 @@ void quantize(
  float eps = 1e-7;

  bool power_of_2_bits = is_power_of_2(bits);
-  int el_per_int = get_pack_factor(bits, 32);
-  int bytes_per_pack = get_bytes_per_pack(bits);
+  int el_per_int = bits == 3 ? 8 : bits == 6 ? 4 : 32 / bits;
+  // For 3/6 bits we read 3 uint8s at a time instead of 1 uint32
+  int bytes_per_pack = power_of_2_bits ? 1 : 3;
  int int_per_group = group_size * bytes_per_pack / el_per_int;
  size_t n_groups = w_size / group_size;

@@ -663,21 +640,15 @@ void quantize(
    }
    size_t out_idx = i * int_per_group;
    for (int j = 0; j < int_per_group / bytes_per_pack; ++j) {
-      uint64_t out_el = 0;
+      uint32_t out_el = 0;
      for (int k = 0; k < el_per_int; ++k) {
        float w_el = w[w_idx + j * el_per_int + k];
        w_el = std::rint((w_el - bias) / scale);
        w_el = std::min(std::max(w_el, 0.0f), n_bins);
-        out_el |= static_cast<uint64_t>(w_el) << (k * bits);
+        out_el |= static_cast<uint32_t>(w_el) << (k * bits);
      }
      if (power_of_2_bits) {
        out[out_idx + j] = out_el;
-      } else if (bits == 5) {
-        out[out_idx + bytes_per_pack * j] = out_el & 0xff;
-        out[out_idx + bytes_per_pack * j + 1] = (out_el & 0xff00) >> 8;
-        out[out_idx + bytes_per_pack * j + 2] = (out_el & 0xff0000) >> 16;
-        out[out_idx + bytes_per_pack * j + 3] = (out_el & 0xff000000) >> 24;
-        out[out_idx + bytes_per_pack * j + 4] = (out_el & 0xff00000000) >> 32;
      } else {
        out[out_idx + bytes_per_pack * j] = out_el & 0xff;
        out[out_idx + bytes_per_pack * j + 1] = (out_el & 0xff00) >> 8;
@@ -713,7 +684,7 @@ void fast::AffineQuantize::eval_cpu(
      return std::make_pair(arr, false);
    } else {
      array arr_copy(arr.shape(), arr.dtype(), nullptr, {});
-      copy_cpu(arr, arr_copy, CopyType::General, s);
+      copy(arr, arr_copy, CopyType::General, s);
      return std::make_pair(arr_copy, true);
    }
  };
--- a/mlx/backend/cpu/reduce.cpp
+++ b/mlx/backend/cpu/reduce.cpp
@@ -325,15 +325,7 @@ struct MaxReduce {
  };

  template <int N, typename T>
-  std::enable_if_t<std::is_integral_v<T>, T> operator()(simd::Simd<T, N> x) {
-    return simd::max(x);
-  };
-
-  template <int N, typename T>
-  std::enable_if_t<!std::is_integral_v<T>, T> operator()(simd::Simd<T, N> x) {
-    if (simd::any(x != x)) {
-      return static_cast<T>(NAN);
-    }
+  T operator()(simd::Simd<T, N> x) {
    return simd::max(x);
  };
 };
@@ -350,15 +342,7 @@ struct MinReduce {
  };

  template <int N, typename T>
-  std::enable_if_t<std::is_integral_v<T>, T> operator()(simd::Simd<T, N> x) {
-    return simd::min(x);
-  };
-
-  template <int N, typename T>
-  std::enable_if_t<!std::is_integral_v<T>, T> operator()(simd::Simd<T, N> x) {
-    if (simd::any(x != x)) {
-      return static_cast<T>(NAN);
-    }
+  T operator()(simd::Simd<T, N> x) {
    return simd::min(x);
  };
 };
@@ -543,10 +527,10 @@ void Reduce::eval_cpu(const std::vector<array>& inputs, array& out) {
            reduce_dispatch_min_max<uint64_t>(in, out, reduce_type_, axes_);
            break;
          case int8:
-            reduce_dispatch_min_max<int8_t>(in, out, reduce_type_, axes_);
+            reduce_dispatch_min_max<uint8_t>(in, out, reduce_type_, axes_);
            break;
          case int16:
-            reduce_dispatch_min_max<int16_t>(in, out, reduce_type_, axes_);
+            reduce_dispatch_min_max<uint16_t>(in, out, reduce_type_, axes_);
            break;
          case int32:
            reduce_dispatch_min_max<int32_t>(in, out, reduce_type_, axes_);
--- a/mlx/backend/cpu/scan.cpp
+++ b/mlx/backend/cpu/scan.cpp
@@ -251,7 +251,7 @@ void Scan::eval_cpu(const std::vector<array>& inputs, array& out) {
  auto in = inputs[0];
  if (!in.flags().row_contiguous) {
    array arr_copy(in.shape(), in.dtype(), nullptr, {});
-    copy_cpu(in, arr_copy, CopyType::General, stream());
+    copy(in, arr_copy, CopyType::General, stream());
    in = arr_copy;
    encoder.add_temporary(arr_copy);
  }
--- a/mlx/backend/cpu/softmax.cpp
+++ b/mlx/backend/cpu/softmax.cpp
@@ -132,7 +132,7 @@ void Softmax::eval_cpu(const std::vector<array>& inputs, array& out) {
      return x;
    } else {
      array x_copy(x.shape(), x.dtype(), nullptr, {});
-      copy_cpu(x, x_copy, CopyType::General, s);
+      copy(x, x_copy, CopyType::General, s);
      out.copy_shared_buffer(x_copy);
      return x_copy;
    }
--- a/mlx/backend/cpu/sort.cpp
+++ b/mlx/backend/cpu/sort.cpp
@@ -335,7 +335,7 @@ void Sort::eval_cpu(const std::vector<array>& inputs, array& out) {

  // Copy input to output
  CopyType ctype = in.flags().contiguous ? CopyType::Vector : CopyType::General;
-  copy_cpu(in, out, ctype, stream());
+  copy(in, out, ctype, stream());

  auto& encoder = cpu::get_command_encoder(stream());
  encoder.set_output_array(out);
@@ -427,7 +427,7 @@ void Partition::eval_cpu(const std::vector<array>& inputs, array& out) {

  // Copy input to output
  CopyType ctype = in.flags().contiguous ? CopyType::Vector : CopyType::General;
-  copy_cpu(in, out, ctype, stream());
+  copy(in, out, ctype, stream());

  auto& encoder = cpu::get_command_encoder(stream());
  encoder.set_output_array(out);
--- a/mlx/backend/cpu/svd.cpp
+++ b/mlx/backend/cpu/svd.cpp
@@ -31,7 +31,7 @@ void svd_impl(

  // lapack clobbers the input, so we have to make a copy.
  array in(a.shape(), a.dtype(), nullptr, {});
-  copy_cpu(
+  copy(
      a,
      in,
      a.flags().row_contiguous ? CopyType::Vector : CopyType::General,
--- a/mlx/backend/cpu/unary.h
+++ b/mlx/backend/cpu/unary.h
@@ -2,13 +2,32 @@

 #pragma once

-#include "mlx/backend/common/unary.h"
+#include "mlx/allocator.h"
+#include "mlx/array.h"
+#include "mlx/backend/common/utils.h"
 #include "mlx/backend/cpu/encoder.h"
 #include "mlx/backend/cpu/simd/simd.h"
 #include "mlx/utils.h"

 namespace mlx::core {

+inline void set_unary_output_data(const array& in, array& out) {
+  if (in.flags().contiguous) {
+    if (is_donatable(in, out)) {
+      out.copy_shared_buffer(in); // share in's buffer instead of allocating
+    } else {
+      auto size = in.data_size(); // element count of in's underlying data
+      out.set_data(
+          allocator::malloc(size * out.itemsize()),
+          size,
+          in.strides(),
+          in.flags());
+    }
+  } else {
+    out.set_data(allocator::malloc(out.nbytes())); // non-contiguous input
+  }
+}
+
 template <typename T, typename U = T, typename Op>
 void unary_op(const T* a, U* out, size_t shape, size_t stride) {
  for (size_t i = 0; i < shape; i += 1) {
--- a/mlx/backend/cuda/CMakeLists.txt
+++ b/mlx/backend/cuda/CMakeLists.txt
@@ -1,96 +1,31 @@
 # Filename rules in cuda backend:
 #
 # * Use .cu/.cuh if code contains device code, and .cpp/.h if not.
-# * Device-only code should be put in device/ subdir.
-# * Files in device/ subdir should not include files outside.
+# * Device-only kernel code should be put in kernels/ subdir.
+# * Files in kernels/ subdir should not include files outside.
 target_sources(
  mlx
  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/allocator.cpp
-          ${CMAKE_CURRENT_SOURCE_DIR}/arg_reduce.cu
-          ${CMAKE_CURRENT_SOURCE_DIR}/binary.cu
-          ${CMAKE_CURRENT_SOURCE_DIR}/binary_two.cu
-          ${CMAKE_CURRENT_SOURCE_DIR}/compiled.cpp
-          ${CMAKE_CURRENT_SOURCE_DIR}/copy.cu
-          ${CMAKE_CURRENT_SOURCE_DIR}/copy/copy_contiguous.cu
-          ${CMAKE_CURRENT_SOURCE_DIR}/copy/copy_general.cu
-          ${CMAKE_CURRENT_SOURCE_DIR}/copy/copy_general_dynamic.cu
-          ${CMAKE_CURRENT_SOURCE_DIR}/copy/copy_general_input.cu
-          ${CMAKE_CURRENT_SOURCE_DIR}/cuda.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/copy.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/device.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/eval.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/event.cu
-          ${CMAKE_CURRENT_SOURCE_DIR}/fence.cpp
-          ${CMAKE_CURRENT_SOURCE_DIR}/jit_module.cpp
-          ${CMAKE_CURRENT_SOURCE_DIR}/indexing.cpp
-          ${CMAKE_CURRENT_SOURCE_DIR}/kernel_utils.cu
-          ${CMAKE_CURRENT_SOURCE_DIR}/matmul.cpp
-          ${CMAKE_CURRENT_SOURCE_DIR}/layer_norm.cu
-          ${CMAKE_CURRENT_SOURCE_DIR}/logsumexp.cu
+          ${CMAKE_CURRENT_SOURCE_DIR}/fence.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/primitives.cu
-          ${CMAKE_CURRENT_SOURCE_DIR}/random.cu
-          ${CMAKE_CURRENT_SOURCE_DIR}/reduce.cu
-          ${CMAKE_CURRENT_SOURCE_DIR}/reduce/all_reduce.cu
-          ${CMAKE_CURRENT_SOURCE_DIR}/reduce/col_reduce.cu
-          ${CMAKE_CURRENT_SOURCE_DIR}/reduce/init_reduce.cu
-          ${CMAKE_CURRENT_SOURCE_DIR}/reduce/row_reduce.cu
-          ${CMAKE_CURRENT_SOURCE_DIR}/rms_norm.cu
-          ${CMAKE_CURRENT_SOURCE_DIR}/rope.cu
-          ${CMAKE_CURRENT_SOURCE_DIR}/scan.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/slicing.cpp
-          ${CMAKE_CURRENT_SOURCE_DIR}/softmax.cu
-          ${CMAKE_CURRENT_SOURCE_DIR}/sort.cu
-          ${CMAKE_CURRENT_SOURCE_DIR}/ternary.cu
-          ${CMAKE_CURRENT_SOURCE_DIR}/unary.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/utils.cpp
-          ${CMAKE_CURRENT_SOURCE_DIR}/quantized.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/worker.cpp)

-target_compile_definitions(mlx PRIVATE MLX_USE_CUDA)
-
-# Embed kernel sources in binary for JIT compilation.
-file(
-  GLOB MLX_JIT_SOURCES
-  RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
-  "${CMAKE_CURRENT_SOURCE_DIR}/device/*.h"
-  "${CMAKE_CURRENT_SOURCE_DIR}/device/*.cuh")
-string(JOIN ":" MLX_JIT_SOURCES_ARG ${MLX_JIT_SOURCES})
-add_custom_command(
-  OUTPUT gen/cuda_jit_sources.h
-  COMMAND
-    ${CMAKE_COMMAND} -DMLX_SOURCE_ROOT=${CMAKE_CURRENT_SOURCE_DIR}
-    -DMLX_JIT_SOURCES=${MLX_JIT_SOURCES_ARG} -P
-    "${CMAKE_CURRENT_SOURCE_DIR}/bin2h.cmake"
-  DEPENDS bin2h.cmake ${MLX_JIT_SOURCES})
-add_custom_target(cuda_jit_sources DEPENDS gen/cuda_jit_sources.h)
-add_dependencies(mlx cuda_jit_sources)
-target_include_directories(mlx PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/gen")
+target_compile_definitions(mlx PUBLIC MLX_USE_CUDA)

 # Enable defining device lambda functions.
 target_compile_options(mlx
                       PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:--extended-lambda>")

-# Enable calling host constexpr functions from device. This is needed because
-# the constexpr version of isnan is host only.
-target_compile_options(
-  mlx PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr>")
-
-# CUDA 12.8 emits warning #20280-D for copy kernels which is a false positive.
-# Explicitly pass this flag to suppress the warning, it is safe to set it to
-# true but the warning wouldn't be suppressed.
-if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8.0)
-  target_compile_options(
-    mlx
-    PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:--static-global-template-stub=false>")
-endif()
-
-# Suppress warning when building for compute capability 7 used by V100.
-target_compile_options(
-  mlx PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:--Wno-deprecated-gpu-targets>")
-
 # Compute capability 7 is required for synchronization between CPU/GPU with
 # managed memory. TODO: Add more architectures for potential performance gain.
 set(MLX_CUDA_ARCHITECTURES
-    "70;80"
+    "75;80"
    CACHE STRING "CUDA architectures")
 message(STATUS "CUDA architectures: ${MLX_CUDA_ARCHITECTURES}")
 set_target_properties(mlx PROPERTIES CUDA_ARCHITECTURES
@@ -101,7 +36,7 @@ FetchContent_Declare(
  cccl
  URL "https://github.com/NVIDIA/cccl/releases/download/v2.8.1/cccl-v2.8.1.zip")
 FetchContent_MakeAvailable(cccl)
-target_include_directories(mlx BEFORE PRIVATE "${cccl_SOURCE_DIR}/include")
+target_include_directories(mlx BEFORE PRIVATE "${cccl_SOURCE_DIR}/include")

 # Use fixed version of NVTX.
 FetchContent_Declare(
@@ -117,16 +52,6 @@ target_link_libraries(mlx PUBLIC $<BUILD_INTERFACE:nvtx3-cpp>)
 find_package(CUDAToolkit REQUIRED)
 target_include_directories(mlx PRIVATE ${CUDAToolkit_INCLUDE_DIRS})

-# Use cublasLt.
-target_link_libraries(mlx PRIVATE CUDA::cublasLt)
-
-# Use NVRTC and driver APIs.
-target_link_libraries(mlx PRIVATE CUDA::nvrtc CUDA::cuda_driver)
-
 # Suppress nvcc warnings on MLX headers.
 target_compile_options(mlx PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-Xcudafe
                                   --diag_suppress=997>)
-
-# Install CCCL headers for JIT.
-install(DIRECTORY ${cccl_SOURCE_DIR}/include/cuda
-        DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/cccl)
--- a/mlx/backend/cuda/allocator.cpp
+++ b/mlx/backend/cuda/allocator.cpp
@@ -3,11 +3,9 @@
 #include "mlx/backend/cuda/allocator.h"
 #include "mlx/backend/cuda/utils.h"
 #include "mlx/backend/cuda/worker.h"
-#include "mlx/utils.h"

 #include <cuda_runtime.h>
 #include <fmt/format.h>
-#include <unistd.h>

 #include <cassert>

@@ -15,59 +13,24 @@ namespace mlx::core {

 namespace cu {

-constexpr int page_size = 16384;
-
-CudaAllocator::CudaAllocator()
-    : buffer_cache_(
-          page_size,
-          [](CudaBuffer* buf) { return buf->size; },
-          [this](CudaBuffer* buf) {
-            cuda_free(buf->data);
-            delete buf;
-          }) {
+CudaAllocator::CudaAllocator() {
  // TODO: Set memory limit for multi-device.
  size_t free, total;
  CHECK_CUDA_ERROR(cudaMemGetInfo(&free, &total));
  memory_limit_ = total * 0.8;
-  max_pool_size_ = memory_limit_;
 }

 Buffer CudaAllocator::malloc(size_t size) {
-  // Find available buffer from cache.
-  auto orig_size = size;
-  std::unique_lock lock(mutex_);
-  if (size < page_size) {
-    size = next_power_of_2(size);
-  } else {
-    size = page_size * ((size + page_size - 1) / page_size);
-  }
-
-  CudaBuffer* buf = buffer_cache_.reuse_from_cache(size);
-  if (!buf) {
-    // If we have a lot of memory pressure or are over the maximum cache size,
-    // try to reclaim memory from the cache.
-    size_t mem_required = get_active_memory() + get_cache_memory() + size;
-    if (mem_required >= memory_limit_) {
-      buffer_cache_.release_cached_buffers(mem_required - memory_limit_);
-    }
-
-    lock.unlock();
-    buf = new CudaBuffer{nullptr, size};
-    cudaError_t err = cudaMallocManaged(&buf->data, size);
-    if (err != cudaSuccess && err != cudaErrorMemoryAllocation) {
-      throw std::runtime_error(fmt::format(
-          "cudaMallocManaged failed: {}.", cudaGetErrorString(err)));
-    }
-    lock.lock();
+  // TODO: Check memory limit.
+  auto* buf = new CudaBuffer{nullptr, size};
+  cudaError_t err = cudaMallocManaged(&buf->data, size);
+  if (err != cudaSuccess && err != cudaErrorMemoryAllocation) {
+    throw std::runtime_error(
+        fmt::format("cudaMallocManaged failed: {}.", cudaGetErrorString(err)));
  }
+  std::lock_guard lock(mutex_);
  active_memory_ += size;
  peak_memory_ = std::max(active_memory_, peak_memory_);
-
-  // Maintain the cache below the requested limit.
-  if (get_cache_memory() > max_pool_size_) {
-    buffer_cache_.release_cached_buffers(get_cache_memory() - max_pool_size_);
-  }
-
  return Buffer{buf};
 }

@@ -77,15 +40,26 @@ void CudaAllocator::free(Buffer buffer) {
    return;
  }

-  std::unique_lock lock(mutex_);
-  active_memory_ -= buf->size;
-  if (get_cache_memory() < max_pool_size_) {
-    buffer_cache_.recycle_to_cache(buf);
-  } else {
-    lock.unlock();
-    cuda_free(buf->data);
-    delete buf;
+  // If free() is called from an unregistered thread, reschedule the call to
+  // the worker.
+  {
+    std::lock_guard lock(worker_mutex_);
+    if (allowed_threads_.count(std::this_thread::get_id()) == 0) {
+      if (!worker_) {
+        worker_.reset(new Worker);
+      }
+      worker_->add_task([buffer]() { allocator().free(buffer); });
+      worker_->end_batch();
+      worker_->commit();
+      return;
+    }
  }
+
+  size_t size = buf->size;
+  cudaFree(buf->data);
+  delete buf;
+  std::lock_guard lock(mutex_);
+  active_memory_ -= size;
 }

 size_t CudaAllocator::size(Buffer buffer) const {
@@ -101,24 +75,6 @@ void CudaAllocator::register_this_thread() {
  allowed_threads_.insert(std::this_thread::get_id());
 }

-void CudaAllocator::cuda_free(void* buf) {
-  // If cuda_free() is called from a unregistered thread, reschedule the call to
-  // worker.
-  {
-    std::lock_guard lock(worker_mutex_);
-    if (allowed_threads_.count(std::this_thread::get_id()) == 0) {
-      if (!worker_) {
-        worker_.reset(new Worker);
-      }
-      worker_->add_task([this, buf]() { this->cuda_free(buf); });
-      worker_->end_batch();
-      worker_->commit();
-      return;
-    }
-  }
-  cudaFree(buf);
-}
-
 size_t CudaAllocator::get_active_memory() const {
  return active_memory_;
 }
@@ -142,21 +98,6 @@ size_t CudaAllocator::set_memory_limit(size_t limit) {
  return limit;
 }

-size_t CudaAllocator::get_cache_memory() const {
-  return buffer_cache_.cache_size();
-}
-
-size_t CudaAllocator::set_cache_limit(size_t limit) {
-  std::lock_guard lk(mutex_);
-  std::swap(limit, max_pool_size_);
-  return limit;
-}
-
-void CudaAllocator::clear_cache() {
-  std::lock_guard lk(mutex_);
-  buffer_cache_.clear();
-}
-
 CudaAllocator& allocator() {
  // By creating the |allocator_| on heap, the destructor of CudaAllocator
  // will not be called on exit and buffers in the cache will be leaked. This
@@ -197,19 +138,17 @@ size_t set_memory_limit(size_t limit) {
 size_t get_memory_limit() {
  return cu::allocator().get_memory_limit();
 }
-size_t get_cache_memory() {
-  return cu::allocator().get_cache_memory();
-}
-size_t set_cache_limit(size_t limit) {
-  return cu::allocator().set_cache_limit(limit);
-}
-void clear_cache() {
-  cu::allocator().clear_cache();
-}

-// Not supported in CUDA.
+// TODO: Implement buffer cache.
+size_t get_cache_memory() {
+  return 0;
+}
+size_t set_cache_limit(size_t) {
+  return 0;
+}
 size_t set_wired_limit(size_t) {
  return 0;
 }
+void clear_cache() {}

 } // namespace mlx::core
--- a/mlx/backend/cuda/allocator.h
+++ b/mlx/backend/cuda/allocator.h
@@ -3,7 +3,6 @@
 #pragma once

 #include "mlx/allocator.h"
-#include "mlx/backend/common/buffer_cache.h"

 #include <mutex>
 #include <set>
@@ -34,17 +33,11 @@ class CudaAllocator : public allocator::Allocator {
  // buffers there would result in dead lock.
  void register_this_thread();

-  // Call cudaFree in the safe thread.
-  void cuda_free(void* buf);
-
  size_t get_active_memory() const;
  size_t get_peak_memory() const;
  void reset_peak_memory();
  size_t get_memory_limit();
  size_t set_memory_limit(size_t limit);
-  size_t get_cache_memory() const;
-  size_t set_cache_limit(size_t limit);
-  void clear_cache();

 private:
  CudaAllocator();
@@ -56,8 +49,6 @@ class CudaAllocator : public allocator::Allocator {

  std::mutex mutex_;
  size_t memory_limit_;
-  size_t max_pool_size_;
-  BufferCache<CudaBuffer> buffer_cache_;
  size_t active_memory_{0};
  size_t peak_memory_{0};
 };
--- a/mlx/backend/cuda/arg_reduce.cu
+++ b/mlx/backend/cuda/arg_reduce.cu
@@ -1,182 +0,0 @@
-// Copyright © 2025 Apple Inc.
-#include "mlx/backend/common/utils.h"
-#include "mlx/backend/cuda/device.h"
-#include "mlx/backend/cuda/device/fp16_math.cuh"
-#include "mlx/backend/cuda/iterators/strided_iterator.cuh"
-#include "mlx/backend/cuda/kernel_utils.cuh"
-#include "mlx/dtype_utils.h"
-#include "mlx/primitives.h"
-
-#include <cooperative_groups.h>
-#include <nvtx3/nvtx3.hpp>
-#include <cub/block/block_load.cuh>
-#include <cub/block/block_reduce.cuh>
-
-#include <cassert>
-
-namespace mlx::core {
-
-namespace cu {
-
-namespace cg = cooperative_groups;
-
-template <typename T>
-struct IndexValPair {
-  uint32_t index;
-  T val;
-};
-
-template <typename T>
-struct ArgMin {
-  constexpr __device__ T init() {
-    return Limits<T>::max();
-  }
-
-  __device__ IndexValPair<T> operator()(
-      const IndexValPair<T>& best,
-      const IndexValPair<T>& current) {
-    if (best.val > current.val ||
-        (best.val == current.val && best.index > current.index)) {
-      return current;
-    } else {
-      return best;
-    }
-  }
-
-  template <int N>
-  __device__ IndexValPair<T>
-  reduce_many(IndexValPair<T> best, T (&vals)[N], uint32_t offset) {
-    for (int i = 0; i < N; i++) {
-      if (vals[i] < best.val) {
-        best.val = vals[i];
-        best.index = offset + i;
-      }
-    }
-    return best;
-  }
-};
-
-template <typename T>
-struct ArgMax {
-  constexpr __device__ T init() {
-    return Limits<T>::min();
-  }
-
-  __device__ IndexValPair<T> operator()(
-      const IndexValPair<T>& best,
-      const IndexValPair<T>& current) {
-    if (best.val < current.val ||
-        (best.val == current.val && best.index > current.index)) {
-      return current;
-    } else {
-      return best;
-    }
-  }
-
-  template <int N>
-  __device__ IndexValPair<T>
-  reduce_many(IndexValPair<T> best, T (&vals)[N], uint32_t offset) {
-    for (int i = 0; i < N; i++) {
-      if (vals[i] > best.val) {
-        best.val = vals[i];
-        best.index = offset + i;
-      }
-    }
-    return best;
-  }
-};
-
-template <typename T, typename Op, int BLOCK_DIM, int N_READS = 4>
-__global__ void arg_reduce_general(
-    const T* in,
-    uint32_t* out,
-    size_t size,
-    const __grid_constant__ Shape shape,
-    const __grid_constant__ Strides in_strides,
-    const __grid_constant__ Strides out_strides,
-    int32_t ndim,
-    int64_t axis_stride,
-    int32_t axis_size) {
-  auto block = cg::this_thread_block();
-
-  int64_t index = cg::this_grid().block_rank();
-  if (index >= size) {
-    return;
-  }
-
-  int64_t in_idx = elem_to_loc(index, shape.data(), in_strides.data(), ndim);
-  int64_t out_idx = elem_to_loc(index, shape.data(), out_strides.data(), ndim);
-
-  Op op;
-  T init = op.init();
-  IndexValPair<T> best{0, init};
-
-  for (int r = 0; r < cuda::ceil_div(axis_size, BLOCK_DIM * N_READS); ++r) {
-    T vals[N_READS];
-    auto tid = r * BLOCK_DIM + block.thread_index().x;
-    cub::LoadDirectBlocked(
-        tid, strided_iterator(in + in_idx, axis_stride), vals, axis_size, init);
-    best = op.reduce_many(best, vals, tid * N_READS);
-  }
-
-  typedef cub::BlockReduce<IndexValPair<T>, BLOCK_DIM> BlockReduceT;
-  __shared__ typename BlockReduceT::TempStorage temp;
-
-  best = BlockReduceT(temp).Reduce(best, op);
-
-  if (block.thread_rank() == 0) {
-    out[out_idx] = best.index;
-  }
-}
-
-} // namespace cu
-
-void ArgReduce::eval_gpu(const std::vector<array>& inputs, array& out) {
-  nvtx3::scoped_range r("ArgReduce::eval_gpu");
-  assert(inputs.size() == 1);
-  auto& in = inputs[0];
-  out.set_data(allocator::malloc(out.nbytes()));
-  auto& s = stream();
-
-  // Prepare the shapes, strides and axis arguments.
-  Shape shape = remove_index(in.shape(), axis_);
-  Strides in_strides = remove_index(in.strides(), axis_);
-  Strides out_strides = out.ndim() == in.ndim()
-      ? remove_index(out.strides(), axis_)
-      : out.strides();
-  int64_t axis_stride = in.strides()[axis_];
-  int32_t axis_size = in.shape()[axis_];
-  int32_t ndim = shape.size();
-
-  // ArgReduce.
-  auto& encoder = cu::get_command_encoder(s);
-  encoder.set_input_array(in);
-  encoder.set_output_array(out);
-  dispatch_real_types(in.dtype(), "ArgReduce", [&](auto type_tag) {
-    using T = cuda_type_t<MLX_GET_TYPE(type_tag)>;
-    constexpr uint32_t N_READS = 4;
-    dispatch_block_dim(cuda::ceil_div(axis_size, N_READS), [&](auto block_dim) {
-      dim3 num_blocks = get_2d_grid_dims(out.shape(), out.strides());
-      auto kernel =
-          cu::arg_reduce_general<T, cu::ArgMax<T>, block_dim(), N_READS>;
-      if (reduce_type_ == ArgReduce::ArgMin) {
-        kernel = cu::arg_reduce_general<T, cu::ArgMin<T>, block_dim(), N_READS>;
-      }
-      encoder.add_kernel_node(
-          kernel,
-          num_blocks,
-          block_dim(),
-          in.data<T>(),
-          out.data<uint32_t>(),
-          out.size(),
-          const_param(shape),
-          const_param(in_strides),
-          const_param(out_strides),
-          ndim,
-          axis_stride,
-          axis_size);
-    });
-  });
-}
-
-} // namespace mlx::core
--- a/mlx/backend/cuda/bin2h.cmake
+++ b/mlx/backend/cuda/bin2h.cmake
@@ -1,150 +0,0 @@
-# Based on: https://github.com/sivachandran/cmake-bin2h
-#
-# Copyright 2020 Sivachandran Paramasivam
-#
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included in all
-# copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-
-include(CMakeParseArguments)
-
-# Function to wrap a given string into multiple lines at the given column
-# position.
-#
-# Every emitted line is "\n " followed by at most AT_COLUMN characters, so the
-# result starts with a newline and each line is indented by one space.
-#
-# Parameters:
-#
-# * VARIABLE - The name of the CMake variable holding the string. The wrapped
-#   result is written back to this variable in the caller's scope.
-# * AT_COLUMN - The column position at which string will be wrapped.
-function(WRAP_STRING)
-  set(oneValueArgs VARIABLE AT_COLUMN)
-  # This function takes no option flags; pass an explicit empty list rather
-  # than whatever `options` happens to be set to in the calling scope.
-  cmake_parse_arguments(WRAP_STRING "" "${oneValueArgs}" "" ${ARGN})
-
-  # Quote the expansion so an empty string (or one containing ';') is still
-  # passed as exactly one argument.
-  string(LENGTH "${${WRAP_STRING_VARIABLE}}" stringLength)
-  math(EXPR offset "0")
-
-  # A function scope starts with a copy of the caller's variables, so reset
-  # the accumulator to keep a caller's `lines` out of the result.
-  set(lines "")
-
-  while(stringLength GREATER 0)
-    if(stringLength GREATER ${WRAP_STRING_AT_COLUMN})
-      math(EXPR length "${WRAP_STRING_AT_COLUMN}")
-    else()
-      math(EXPR length "${stringLength}")
-    endif()
-
-    string(SUBSTRING "${${WRAP_STRING_VARIABLE}}" ${offset} ${length} line)
-    set(lines "${lines}\n ${line}")
-
-    math(EXPR stringLength "${stringLength} - ${length}")
-    math(EXPR offset "${offset} + ${length}")
-  endwhile()
-
-  set(${WRAP_STRING_VARIABLE}
-      "${lines}"
-      PARENT_SCOPE)
-endfunction()
-
-# Function to embed the contents of files as byte arrays in a C/C++ header
-# file(.h). One `constexpr char` array is generated per source file.
-#
-# Parameters:
-#
-# * SOURCE_FILES - The paths of source files whose contents will be embedded in
-#   the header file.
-# * VARIABLE_NAME - The prefix of the generated array names. Each array is
-#   named "<VARIABLE_NAME>_<file name without extension, as a C identifier>".
-# * HEADER_FILE - The path of header file.
-# * APPEND - If specified appends to the header file instead of overwriting it
-# * HEADER_NAMESPACE - The namespace, where the arrays should be located in. If
-#   omitted the arrays are emitted at global scope.
-# * NULL_TERMINATE - If specified a null byte(zero) will be append to each byte
-#   array.
-#
-# Usage:
-#
-# bin2h(SOURCE_FILES "Logo.png" HEADER_FILE "Logo.h" VARIABLE_NAME "LOGO_PNG")
-function(BIN2H)
-  set(options APPEND NULL_TERMINATE)
-  set(oneValueArgs VARIABLE_NAME HEADER_FILE HEADER_NAMESPACE)
-  set(multiValueArgs SOURCE_FILES)
-  cmake_parse_arguments(BIN2H "${options}" "${oneValueArgs}"
-                        "${multiValueArgs}" ${ARGN})
-
-  set(arrayDefinition "")
-  foreach(SOURCE_FILE IN LISTS BIN2H_SOURCE_FILES)
-    # get filename without extension
-    get_filename_component(FILE_NAME_WE ${SOURCE_FILE} NAME_WE)
-    # convert the filename to a valid C identifier
-    string(MAKE_C_IDENTIFIER "${FILE_NAME_WE}" VALID_FILE_NAME)
-
-    # reads source file contents as hex string
-    file(READ ${SOURCE_FILE} hexString HEX)
-
-    # strip the © in source code (UTF-8 bytes c2 a9 -> two spaces). This must
-    # happen before wrapping: once the string is split into lines, a pair that
-    # straddles a line break would no longer match.
-    string(REPLACE "c2a9" "2020" hexString "${hexString}")
-
-    # append null
-    if(BIN2H_NULL_TERMINATE)
-      string(APPEND hexString "00")
-    endif()
-
-    # wraps the hex string into multiple lines
-    wrap_string(VARIABLE hexString AT_COLUMN 24)
-
-    # turn every hex byte pair into a " 0xNN," array element
-    string(REGEX REPLACE "([0-9a-f][0-9a-f])" " 0x\\1," arrayValues
-                         "${hexString}")
-
-    # make a full variable name for the array
-    set(FULL_VARIABLE_NAME "${BIN2H_VARIABLE_NAME}_${VALID_FILE_NAME}")
-
-    # declares the byte array
-    string(APPEND arrayDefinition
-           "constexpr char ${FULL_VARIABLE_NAME}[] = {${arrayValues}\n};\n\n")
-  endforeach()
-
-  # add namespace wrapper if defined, otherwise emit the arrays as they are
-  if(DEFINED BIN2H_HEADER_NAMESPACE)
-    set(namespaceStart "namespace ${BIN2H_HEADER_NAMESPACE} {")
-    set(namespaceEnd "} // namespace ${BIN2H_HEADER_NAMESPACE}")
-    set(declarations "${namespaceStart}\n\n${arrayDefinition}${namespaceEnd}\n")
-  else()
-    set(declarations "${arrayDefinition}")
-  endif()
-
-  set(arrayIncludes "#pragma once")
-  string(PREPEND declarations "${arrayIncludes}\n\n")
-
-  if(BIN2H_APPEND)
-    file(APPEND ${BIN2H_HEADER_FILE} "${declarations}")
-  else()
-    file(WRITE ${BIN2H_HEADER_FILE} "${declarations}")
-  endif()
-endfunction()
-
-# ----------------------------- CLI args -----------------------------
-#
-# Script entry point. Expects two variables to be defined by the invoker
-# (presumably via -D on the cmake command line -- confirm against the caller):
-#
-# * MLX_JIT_SOURCES - ':'-separated list of source paths relative to
-#   MLX_SOURCE_ROOT.
-# * MLX_SOURCE_ROOT - directory prepended to every entry of MLX_JIT_SOURCES.
-
-# Turn the ':'-separated string into a CMake list and make each path absolute.
-string(REPLACE ":" ";" MLX_JIT_SOURCES_LIST ${MLX_JIT_SOURCES})
-foreach(source ${MLX_JIT_SOURCES_LIST})
-  list(APPEND MLX_JIT_SOURCES_ABS "${MLX_SOURCE_ROOT}/${source}")
-endforeach()
-
-# Embed every source as a null-terminated `jit_source_<name>` array inside
-# namespace mlx::core, written to gen/cuda_jit_sources.h.
-bin2h(
-  SOURCE_FILES
-  ${MLX_JIT_SOURCES_ABS}
-  NULL_TERMINATE
-  VARIABLE_NAME
-  "jit_source"
-  HEADER_NAMESPACE
-  "mlx::core"
-  HEADER_FILE
-  "${CMAKE_CURRENT_BINARY_DIR}/gen/cuda_jit_sources.h")
--- a/mlx/backend/cuda/binary.cu
+++ b/mlx/backend/cuda/binary.cu
@@ -1,359 +0,0 @@
-// Copyright © 2025 Apple Inc.
-
-#include "mlx/backend/common/binary.h"
-#include "mlx/backend/cuda/device.h"
-#include "mlx/backend/cuda/device/binary_ops.cuh"
-#include "mlx/backend/cuda/kernel_utils.cuh"
-#include "mlx/dtype_utils.h"
-#include "mlx/primitives.h"
-
-#include <cooperative_groups.h>
-#include <nvtx3/nvtx3.hpp>
-
-namespace mlx::core {
-
-namespace cu {
-
-namespace cg = cooperative_groups;
-
-// Scalar-scalar binary kernel: every output element is Op(a[0], b[0]).
-//
-// Each thread handles N_READS consecutive outputs. A thread whose chunk fits
-// entirely below `size` writes it through store_vector; a thread whose chunk
-// would cross `size` falls back to an element-wise loop over what remains.
-template <typename Op, typename In, typename Out, typename IdxT, int N_READS>
-__global__ void binary_ss(const In* a, const In* b, Out* out, IdxT size) {
-  IdxT index = cg::this_grid().thread_rank();
-
-  if ((index + 1) * N_READS > size) {
-    // The loop index must be IdxT: with a plain `int` the start offset is
-    // truncated (or becomes negative) once index * N_READS exceeds INT_MAX.
-    for (IdxT i = index * N_READS; i < size; ++i) {
-      out[i] = Op{}(a[0], b[0]);
-    }
-  } else {
-    AlignedVector<Out, N_READS> out_vec;
-#pragma unroll
-    for (int i = 0; i < N_READS; ++i) {
-      out_vec.val[i] = Op{}(a[0], b[0]);
-    }
-
-    store_vector<N_READS>(out, index, out_vec);
-  }
-}
-
-// Scalar-vector binary kernel: out[i] = Op(a[0], b[i]).
-// Each thread covers N_READS consecutive elements.
-template <typename Op, typename In, typename Out, typename IdxT, int N_READS>
-__global__ void binary_sv(const In* a, const In* b, Out* out, IdxT size) {
-  const IdxT tid = cg::this_grid().thread_rank();
-
-  if ((tid + 1) * N_READS <= size) {
-    // Whole chunk is in range: process it as one vector.
-    const auto rhs = load_vector<N_READS>(b, tid);
-    AlignedVector<Out, N_READS> result;
-#pragma unroll
-    for (int k = 0; k < N_READS; ++k) {
-      result.val[k] = Op{}(a[0], rhs.val[k]);
-    }
-    store_vector<N_READS>(out, tid, result);
-    return;
-  }
-
-  // Chunk crosses `size`: handle the remaining elements one by one.
-  for (IdxT k = tid * N_READS; k < size; ++k) {
-    out[k] = Op{}(a[0], b[k]);
-  }
-}
-
-// Vector-scalar binary kernel: out[i] = Op(a[i], b[0]).
-// Each thread covers N_READS consecutive elements.
-template <typename Op, typename In, typename Out, typename IdxT, int N_READS>
-__global__ void binary_vs(const In* a, const In* b, Out* out, IdxT size) {
-  const IdxT tid = cg::this_grid().thread_rank();
-
-  if ((tid + 1) * N_READS <= size) {
-    // Whole chunk is in range: process it as one vector.
-    const auto lhs = load_vector<N_READS>(a, tid);
-    AlignedVector<Out, N_READS> result;
-#pragma unroll
-    for (int k = 0; k < N_READS; ++k) {
-      result.val[k] = Op{}(lhs.val[k], b[0]);
-    }
-    store_vector<N_READS>(out, tid, result);
-    return;
-  }
-
-  // Chunk crosses `size`: handle the remaining elements one by one.
-  for (IdxT k = tid * N_READS; k < size; ++k) {
-    out[k] = Op{}(a[k], b[0]);
-  }
-}
-
-// Vector-vector binary kernel: out[i] = Op(a[i], b[i]).
-// Each thread covers N_READS consecutive elements.
-template <typename Op, typename In, typename Out, typename IdxT, int N_READS>
-__global__ void binary_vv(const In* a, const In* b, Out* out, IdxT size) {
-  const IdxT tid = cg::this_grid().thread_rank();
-
-  if ((tid + 1) * N_READS <= size) {
-    // Whole chunk is in range: process it as one vector.
-    const auto lhs = load_vector<N_READS>(a, tid);
-    const auto rhs = load_vector<N_READS>(b, tid);
-    AlignedVector<Out, N_READS> result;
-#pragma unroll
-    for (int k = 0; k < N_READS; ++k) {
-      result.val[k] = Op{}(lhs.val[k], rhs.val[k]);
-    }
-    store_vector<N_READS>(out, tid, result);
-    return;
-  }
-
-  // Chunk crosses `size`: handle the remaining elements one by one.
-  for (IdxT k = tid * N_READS; k < size; ++k) {
-    out[k] = Op{}(a[k], b[k]);
-  }
-}
-
-// General (strided) binary kernel for a compile-time number of dimensions.
-// One thread per output element; input offsets come from elem_to_loc_nd.
-template <typename Op, typename In, typename Out, typename IdxT, int NDIM>
-__global__ void binary_g_nd(
-    const In* a,
-    const In* b,
-    Out* out,
-    IdxT size,
-    const __grid_constant__ cuda::std::array<int32_t, NDIM> shape,
-    const __grid_constant__ cuda::std::array<int64_t, NDIM> a_strides,
-    const __grid_constant__ cuda::std::array<int64_t, NDIM> b_strides) {
-  const IdxT tid = cg::this_grid().thread_rank();
-  if (tid >= size) {
-    return;
-  }
-  auto [a_loc, b_loc] = elem_to_loc_nd<NDIM>(
-      tid, shape.data(), a_strides.data(), b_strides.data());
-  out[tid] = Op{}(a[a_loc], b[b_loc]);
-}
-
-// General (strided) binary kernel for a runtime number of dimensions.
-// One thread per output element; input offsets come from elem_to_loc_4d.
-template <typename Op, typename In, typename Out, typename IdxT>
-__global__ void binary_g(
-    const In* a,
-    const In* b,
-    Out* out,
-    IdxT size,
-    const __grid_constant__ Shape shape,
-    const __grid_constant__ Strides a_strides,
-    const __grid_constant__ Strides b_strides,
-    int ndim) {
-  const IdxT tid = cg::this_grid().thread_rank();
-  if (tid >= size) {
-    return;
-  }
-  auto [a_loc, b_loc] = elem_to_loc_4d(
-      tid, shape.data(), a_strides.data(), b_strides.data(), ndim);
-  out[tid] = Op{}(a[a_loc], b[b_loc]);
-}
-
-// Compile-time predicate: is `Op` defined for inputs of type `In` producing
-// outputs of type `Out`? Ops not listed below are reported as unsupported.
-template <typename Op, typename In, typename Out>
-constexpr bool supports_binary_op() {
-  constexpr bool same_io = std::is_same_v<In, Out>;
-  constexpr bool bool_out = std::is_same_v<Out, bool>;
-
-  // Arithmetic ops: output type equals input type.
-  constexpr bool is_arith = std::is_same_v<Op, Add> ||
-      std::is_same_v<Op, Divide> || std::is_same_v<Op, Maximum> ||
-      std::is_same_v<Op, Minimum> || std::is_same_v<Op, Multiply> ||
-      std::is_same_v<Op, Subtract> || std::is_same_v<Op, Power> ||
-      std::is_same_v<Op, Remainder>;
-  if (is_arith) {
-    return same_io;
-  }
-
-  // Comparisons: any input type, boolean output.
-  constexpr bool is_compare = std::is_same_v<Op, Equal> ||
-      std::is_same_v<Op, Greater> || std::is_same_v<Op, GreaterEqual> ||
-      std::is_same_v<Op, Less> || std::is_same_v<Op, LessEqual> ||
-      std::is_same_v<Op, NotEqual>;
-  if (is_compare) {
-    return bool_out;
-  }
-
-  // Logical ops: boolean in, boolean out.
-  constexpr bool is_logical =
-      std::is_same_v<Op, LogicalAnd> || std::is_same_v<Op, LogicalOr>;
-  if (is_logical) {
-    return bool_out && std::is_same_v<In, bool>;
-  }
-
-  if (std::is_same_v<Op, NaNEqual>) {
-    return bool_out && is_inexact_v<In>;
-  }
-  if (std::is_same_v<Op, LogAddExp>) {
-    return same_io && is_inexact_v<In>;
-  }
-  if (std::is_same_v<Op, ArcTan2>) {
-    return same_io && is_floating_v<In>;
-  }
-
-  // Bitwise ops: integral types, same type in and out.
-  constexpr bool is_bitwise = std::is_same_v<Op, BitwiseAnd> ||
-      std::is_same_v<Op, BitwiseOr> || std::is_same_v<Op, BitwiseXor>;
-  if (is_bitwise) {
-    return same_io && std::is_integral_v<In>;
-  }
-
-  // Shifts: like bitwise ops, but bool is excluded.
-  constexpr bool is_shift =
-      std::is_same_v<Op, LeftShift> || std::is_same_v<Op, RightShift>;
-  if (is_shift) {
-    return same_io && std::is_integral_v<In> && !std::is_same_v<In, bool>;
-  }
-
-  return false;
-}
-
-} // namespace cu
-
-// Encodes the kernel for binary op `Op` on inputs[0] and inputs[1], writing
-// into `out`. This function does not set up `out`'s data itself (see
-// binary_op_gpu for the allocating wrapper).
-//
-// * inputs - at least two arrays; only the first two are read.
-// * out    - destination array; nothing is launched when it has zero elements.
-// * op     - op name, used only in the error message.
-// * s      - stream whose command encoder receives the kernel.
-//
-// Throws std::runtime_error when `Op` is not supported for the input/output
-// dtype combination (see cu::supports_binary_op).
-template <typename Op>
-void binary_op_gpu_inplace(
-    const std::vector<array>& inputs,
-    array& out,
-    const char* op,
-    const Stream& s) {
-  assert(inputs.size() > 1);
-  const auto& a = inputs[0];
-  const auto& b = inputs[1];
-  if (out.size() == 0) {
-    return;
-  }
-
-  auto& encoder = cu::get_command_encoder(s);
-  encoder.set_input_array(a);
-  encoder.set_input_array(b);
-  encoder.set_output_array(out);
-  // Map the runtime input/output dtypes to compile-time types.
-  dispatch_all_types(a.dtype(), [&](auto in_type_tag) {
-    dispatch_all_types(out.dtype(), [&](auto out_type_tag) {
-      using CTYPE_IN = MLX_GET_TYPE(in_type_tag);
-      using CTYPE_OUT = MLX_GET_TYPE(out_type_tag);
-      if constexpr (cu::supports_binary_op<Op, CTYPE_IN, CTYPE_OUT>()) {
-        using InType = cuda_type_t<CTYPE_IN>;
-        using OutType = cuda_type_t<CTYPE_OUT>;
-        auto bopt = get_binary_op_type(a, b);
-        if (bopt == BinaryOpType::General) {
-          // Strided path: use 64-bit indexing only if any buffer exceeds
-          // INT32_MAX elements.
-          dispatch_bool(
-              a.data_size() > INT32_MAX || b.data_size() > INT32_MAX ||
-                  out.data_size() > INT32_MAX,
-              [&](auto large) {
-                using IdxT = std::conditional_t<large(), int64_t, int32_t>;
-                Shape shape;
-                std::vector<Strides> strides;
-                std::tie(shape, strides) = collapse_contiguous_dims(a, b, out);
-                auto& a_strides = strides[0];
-                auto& b_strides = strides[1];
-                int ndim = shape.size();
-                if (ndim <= 3) {
-                  // Up to 3 dims: kernel specialised on the dimension count.
-                  dispatch_1_2_3(ndim, [&](auto dims_constant) {
-                    auto kernel = cu::
-                        binary_g_nd<Op, InType, OutType, IdxT, dims_constant()>;
-                    auto [num_blocks, block_dims] =
-                        get_launch_args(kernel, out, large());
-                    encoder.add_kernel_node(
-                        kernel,
-                        num_blocks,
-                        block_dims,
-                        a.data<InType>(),
-                        b.data<InType>(),
-                        out.data<OutType>(),
-                        out.size(),
-                        const_param<dims_constant()>(shape),
-                        const_param<dims_constant()>(a_strides),
-                        const_param<dims_constant()>(b_strides));
-                  });
-                } else {
-                  // More than 3 dims: kernel taking ndim at runtime.
-                  auto kernel = cu::binary_g<Op, InType, OutType, IdxT>;
-                  auto [num_blocks, block_dims] =
-                      get_launch_args(kernel, out, large());
-                  encoder.add_kernel_node(
-                      kernel,
-                      num_blocks,
-                      block_dims,
-                      a.data<InType>(),
-                      b.data<InType>(),
-                      out.data<OutType>(),
-                      out.size(),
-                      const_param(shape),
-                      const_param(a_strides),
-                      const_param(b_strides),
-                      ndim);
-                }
-              });
-        } else {
-          // Non-general path: pick the scalar/vector kernel variant matching
-          // bopt; binary_ss is the default when no other variant matches.
-          dispatch_bool(out.data_size() > UINT32_MAX, [&](auto large) {
-            using IdxT = std::conditional_t<large(), int64_t, uint32_t>;
-            // TODO: Choose optimized value based on type size.
-            constexpr int N_READS = 4;
-            auto kernel = cu::binary_ss<Op, InType, OutType, IdxT, N_READS>;
-            if (bopt == BinaryOpType::ScalarVector) {
-              kernel = cu::binary_sv<Op, InType, OutType, IdxT, N_READS>;
-            } else if (bopt == BinaryOpType::VectorScalar) {
-              kernel = cu::binary_vs<Op, InType, OutType, IdxT, N_READS>;
-            } else if (bopt == BinaryOpType::VectorVector) {
-              kernel = cu::binary_vv<Op, InType, OutType, IdxT, N_READS>;
-            }
-            auto [num_blocks, block_dims] = get_launch_args(
-                kernel,
-                out.data_size(),
-                out.shape(),
-                out.strides(),
-                large(),
-                N_READS);
-            encoder.add_kernel_node(
-                kernel,
-                num_blocks,
-                block_dims,
-                a.data<InType>(),
-                b.data<InType>(),
-                out.data<OutType>(),
-                out.data_size());
-          });
-        }
-      } else {
-        throw std::runtime_error(fmt::format(
-            "Can not do binary op {} on inputs of {} with result of {}.",
-            op,
-            dtype_to_string(a.dtype()),
-            dtype_to_string(out.dtype())));
-      }
-    });
-  });
-}
-
-// Sets up `out` through set_binary_op_output_data for the op type of the two
-// inputs, then encodes the kernel via binary_op_gpu_inplace.
-template <typename Op>
-void binary_op_gpu(
-    const std::vector<array>& inputs,
-    array& out,
-    const char* op,
-    const Stream& s) {
-  const auto& lhs = inputs[0];
-  const auto& rhs = inputs[1];
-  set_binary_op_output_data(lhs, rhs, out, get_binary_op_type(lhs, rhs));
-  binary_op_gpu_inplace<Op>(inputs, out, op, s);
-}
-
-// Defines `func::eval_gpu` for a binary primitive: opens an NVTX range named
-// "<func>::eval_gpu", takes the stream from the output's primitive, and
-// forwards to binary_op_gpu with the device functor cu::<func>.
-#define BINARY_GPU(func)                                              \
-  void func::eval_gpu(const std::vector<array>& inputs, array& out) { \
-    nvtx3::scoped_range r(#func "::eval_gpu");                        \
-    auto& s = out.primitive().stream();                               \
-    binary_op_gpu<cu::func>(inputs, out, name(), s);                  \
-  }
-
-// Primitives whose eval_gpu maps one-to-one onto a cu:: functor of the same
-// name.
-BINARY_GPU(Add)
-BINARY_GPU(ArcTan2)
-BINARY_GPU(Divide)
-BINARY_GPU(Remainder)
-BINARY_GPU(Greater)
-BINARY_GPU(GreaterEqual)
-BINARY_GPU(Less)
-BINARY_GPU(LessEqual)
-BINARY_GPU(LogicalAnd)
-BINARY_GPU(LogicalOr)
-BINARY_GPU(LogAddExp)
-BINARY_GPU(Maximum)
-BINARY_GPU(Minimum)
-BINARY_GPU(Multiply)
-BINARY_GPU(NotEqual)
-BINARY_GPU(Power)
-BINARY_GPU(Subtract)
-
-// Equality comparison; uses cu::NaNEqual instead of cu::Equal when
-// equal_nan_ is set.
-void Equal::eval_gpu(const std::vector<array>& inputs, array& out) {
-  nvtx3::scoped_range range("Equal::eval_gpu");
-  auto& strm = out.primitive().stream();
-  if (!equal_nan_) {
-    binary_op_gpu<cu::Equal>(inputs, out, name(), strm);
-    return;
-  }
-  binary_op_gpu<cu::NaNEqual>(inputs, out, name(), strm);
-}
-
-// Dispatches on op_ to the matching bitwise/shift device functor.
-void BitwiseBinary::eval_gpu(const std::vector<array>& inputs, array& out) {
-  nvtx3::scoped_range range("BitwiseBinary::eval_gpu");
-  auto& strm = out.primitive().stream();
-  switch (op_) {
-    case BitwiseBinary::And:
-      binary_op_gpu<cu::BitwiseAnd>(inputs, out, name(), strm);
-      return;
-    case BitwiseBinary::Or:
-      binary_op_gpu<cu::BitwiseOr>(inputs, out, name(), strm);
-      return;
-    case BitwiseBinary::Xor:
-      binary_op_gpu<cu::BitwiseXor>(inputs, out, name(), strm);
-      return;
-    case BitwiseBinary::LeftShift:
-      binary_op_gpu<cu::LeftShift>(inputs, out, name(), strm);
-      return;
-    case BitwiseBinary::RightShift:
-      binary_op_gpu<cu::RightShift>(inputs, out, name(), strm);
-      return;
-  }
-}
-
-} // namespace mlx::core
--- a/mlx/backend/cuda/binary_two.cu
+++ b/mlx/backend/cuda/binary_two.cu
@@ -1,334 +0,0 @@
-// Copyright © 2025 Apple Inc.
-
-#include "mlx/backend/common/binary.h"
-#include "mlx/backend/cuda/device.h"
-#include "mlx/backend/cuda/device/binary_ops.cuh"
-#include "mlx/backend/cuda/kernel_utils.cuh"
-#include "mlx/dtype_utils.h"
-#include "mlx/primitives.h"
-
-#include <cooperative_groups.h>
-#include <nvtx3/nvtx3.hpp>
-
-namespace mlx::core {
-
-namespace cu {
-
-namespace cg = cooperative_groups;
-
-// Scalar-scalar two-output kernel: Op(a[0], b[0]) yields a pair whose first
-// component goes to out_a and second to out_b, for every output element.
-// Each thread covers N_READS consecutive elements.
-template <typename Op, typename In, typename Out, typename IdxT, int N_READS>
-__global__ void
-binary_two_ss(const In* a, const In* b, Out* out_a, Out* out_b, IdxT size) {
-  const IdxT tid = cg::this_grid().thread_rank();
-
-  if ((tid + 1) * N_READS <= size) {
-    // Whole chunk is in range: build both result vectors and store them.
-    AlignedVector<Out, N_READS> first_vec;
-    AlignedVector<Out, N_READS> second_vec;
-#pragma unroll
-    for (int k = 0; k < N_READS; ++k) {
-      auto res = Op{}(a[0], b[0]);
-      first_vec.val[k] = res[0];
-      second_vec.val[k] = res[1];
-    }
-    store_vector<N_READS>(out_a, tid, first_vec);
-    store_vector<N_READS>(out_b, tid, second_vec);
-    return;
-  }
-
-  // Chunk crosses `size`: handle the remaining elements one by one.
-  for (IdxT k = tid * N_READS; k < size; ++k) {
-    auto res = Op{}(a[0], b[0]);
-    out_a[k] = res[0];
-    out_b[k] = res[1];
-  }
-}
-
-// Scalar-vector two-output kernel: Op(a[0], b[i]) yields a pair written to
-// out_a[i] and out_b[i]. Each thread covers N_READS consecutive elements.
-template <typename Op, typename In, typename Out, typename IdxT, int N_READS>
-__global__ void
-binary_two_sv(const In* a, const In* b, Out* out_a, Out* out_b, IdxT size) {
-  const IdxT tid = cg::this_grid().thread_rank();
-
-  if ((tid + 1) * N_READS <= size) {
-    // Whole chunk is in range: process it as one vector.
-    const auto rhs = load_vector<N_READS>(b, tid);
-    AlignedVector<Out, N_READS> first_vec;
-    AlignedVector<Out, N_READS> second_vec;
-#pragma unroll
-    for (int k = 0; k < N_READS; ++k) {
-      auto res = Op{}(a[0], rhs.val[k]);
-      first_vec.val[k] = res[0];
-      second_vec.val[k] = res[1];
-    }
-    store_vector<N_READS>(out_a, tid, first_vec);
-    store_vector<N_READS>(out_b, tid, second_vec);
-    return;
-  }
-
-  // Chunk crosses `size`: handle the remaining elements one by one.
-  for (IdxT k = tid * N_READS; k < size; ++k) {
-    auto res = Op{}(a[0], b[k]);
-    out_a[k] = res[0];
-    out_b[k] = res[1];
-  }
-}
-
-// Vector-scalar two-output kernel: Op(a[i], b[0]) yields a pair written to
-// out_a[i] and out_b[i]. Each thread covers N_READS consecutive elements.
-template <typename Op, typename In, typename Out, typename IdxT, int N_READS>
-__global__ void
-binary_two_vs(const In* a, const In* b, Out* out_a, Out* out_b, IdxT size) {
-  const IdxT tid = cg::this_grid().thread_rank();
-
-  if ((tid + 1) * N_READS <= size) {
-    // Whole chunk is in range: process it as one vector.
-    const auto lhs = load_vector<N_READS>(a, tid);
-    AlignedVector<Out, N_READS> first_vec;
-    AlignedVector<Out, N_READS> second_vec;
-#pragma unroll
-    for (int k = 0; k < N_READS; ++k) {
-      auto res = Op{}(lhs.val[k], b[0]);
-      first_vec.val[k] = res[0];
-      second_vec.val[k] = res[1];
-    }
-    store_vector<N_READS>(out_a, tid, first_vec);
-    store_vector<N_READS>(out_b, tid, second_vec);
-    return;
-  }
-
-  // Chunk crosses `size`: handle the remaining elements one by one.
-  for (IdxT k = tid * N_READS; k < size; ++k) {
-    auto res = Op{}(a[k], b[0]);
-    out_a[k] = res[0];
-    out_b[k] = res[1];
-  }
-}
-
-// Vector-vector two-output kernel: Op(a[i], b[i]) yields a pair written to
-// out_a[i] and out_b[i]. Each thread covers N_READS consecutive elements.
-template <typename Op, typename In, typename Out, typename IdxT, int N_READS>
-__global__ void
-binary_two_vv(const In* a, const In* b, Out* out_a, Out* out_b, IdxT size) {
-  const IdxT tid = cg::this_grid().thread_rank();
-
-  if ((tid + 1) * N_READS <= size) {
-    // Whole chunk is in range: process it as one vector.
-    const auto lhs = load_vector<N_READS>(a, tid);
-    const auto rhs = load_vector<N_READS>(b, tid);
-    AlignedVector<Out, N_READS> first_vec;
-    AlignedVector<Out, N_READS> second_vec;
-#pragma unroll
-    for (int k = 0; k < N_READS; ++k) {
-      auto res = Op{}(lhs.val[k], rhs.val[k]);
-      first_vec.val[k] = res[0];
-      second_vec.val[k] = res[1];
-    }
-    store_vector<N_READS>(out_a, tid, first_vec);
-    store_vector<N_READS>(out_b, tid, second_vec);
-    return;
-  }
-
-  // Chunk crosses `size`: handle the remaining elements one by one.
-  for (IdxT k = tid * N_READS; k < size; ++k) {
-    auto res = Op{}(a[k], b[k]);
-    out_a[k] = res[0];
-    out_b[k] = res[1];
-  }
-}
-
-// General (strided) two-output kernel for a compile-time number of
-// dimensions. One thread per output element; input offsets come from
-// elem_to_loc_nd.
-template <typename Op, typename In, typename Out, typename IdxT, int NDIM>
-__global__ void binary_two_g_nd(
-    const In* a,
-    const In* b,
-    Out* out_a,
-    Out* out_b,
-    IdxT size,
-    const __grid_constant__ cuda::std::array<int32_t, NDIM> shape,
-    const __grid_constant__ cuda::std::array<int64_t, NDIM> a_strides,
-    const __grid_constant__ cuda::std::array<int64_t, NDIM> b_strides) {
-  const IdxT tid = cg::this_grid().thread_rank();
-  if (tid >= size) {
-    return;
-  }
-  auto [a_loc, b_loc] = elem_to_loc_nd<NDIM>(
-      tid, shape.data(), a_strides.data(), b_strides.data());
-  auto res = Op{}(a[a_loc], b[b_loc]);
-  out_a[tid] = res[0];
-  out_b[tid] = res[1];
-}
-
-// General (strided) two-output kernel for a runtime number of dimensions.
-// One thread per output element; input offsets come from elem_to_loc_4d.
-template <typename Op, typename In, typename Out, typename IdxT>
-__global__ void binary_two_g(
-    const In* a,
-    const In* b,
-    Out* out_a,
-    Out* out_b,
-    IdxT size,
-    const __grid_constant__ Shape shape,
-    const __grid_constant__ Strides a_strides,
-    const __grid_constant__ Strides b_strides,
-    int ndim) {
-  const IdxT tid = cg::this_grid().thread_rank();
-  if (tid >= size) {
-    return;
-  }
-  auto [a_loc, b_loc] = elem_to_loc_4d(
-      tid, shape.data(), a_strides.data(), b_strides.data(), ndim);
-  auto res = Op{}(a[a_loc], b[b_loc]);
-  out_a[tid] = res[0];
-  out_b[tid] = res[1];
-}
-
-// Compile-time predicate for two-output ops. Only DivMod is supported, and
-// only with identical input/output types that are integral or floating.
-template <typename Op, typename In, typename Out>
-constexpr bool supports_binary_two_op() {
-  if (!std::is_same_v<Op, DivMod>) {
-    return false;
-  }
-  constexpr bool numeric_out = std::is_integral_v<Out> || is_floating_v<Out>;
-  return std::is_same_v<In, Out> && numeric_out;
-}
-
-} // namespace cu
-
-// Encodes the kernel for two-output binary op `Op` on inputs[0] and
-// inputs[1], writing into outputs[0] and outputs[1].
-//
-// NOTE: despite the "_inplace" name, this function itself calls
-// set_binary_op_output_data on both outputs before encoding.
-//
-// * inputs  - at least two arrays; only the first two are read.
-// * outputs - the first two entries receive the two results; nothing is
-//             launched when outputs[0] has zero elements.
-// * op      - op name, used only in the error message.
-// * s       - stream whose command encoder receives the kernel.
-//
-// Throws std::runtime_error when `Op` is not supported for the input/output
-// dtype combination (see cu::supports_binary_two_op).
-template <typename Op>
-void binary_two_op_gpu_inplace(
-    const std::vector<array>& inputs,
-    std::vector<array>& outputs,
-    const char* op,
-    const Stream& s) {
-  assert(inputs.size() > 1);
-  const auto& a = inputs[0];
-  const auto& b = inputs[1];
-  auto& out_a = outputs[0];
-  auto& out_b = outputs[1];
-  auto bopt = get_binary_op_type(a, b);
-  set_binary_op_output_data(a, b, out_a, bopt);
-  set_binary_op_output_data(a, b, out_b, bopt);
-
-  if (out_a.size() == 0) {
-    return;
-  }
-
-  auto& encoder = cu::get_command_encoder(s);
-  encoder.set_input_array(a);
-  encoder.set_input_array(b);
-  encoder.set_output_array(out_a);
-  encoder.set_output_array(out_b);
-  // Map the runtime input/output dtypes to compile-time types.
-  dispatch_all_types(a.dtype(), [&](auto in_type_tag) {
-    dispatch_all_types(out_a.dtype(), [&](auto out_type_tag) {
-      using CTYPE_IN = MLX_GET_TYPE(in_type_tag);
-      using CTYPE_OUT = MLX_GET_TYPE(out_type_tag);
-      if constexpr (cu::supports_binary_two_op<Op, CTYPE_IN, CTYPE_OUT>()) {
-        using InType = cuda_type_t<CTYPE_IN>;
-        using OutType = cuda_type_t<CTYPE_OUT>;
-
-        // Shadows the outer `bopt`; computed from the same inputs.
-        auto bopt = get_binary_op_type(a, b);
-        if (bopt == BinaryOpType::General) {
-          // Strided path: use 64-bit indexing only if any buffer exceeds
-          // INT32_MAX elements.
-          dispatch_bool(
-              a.data_size() > INT32_MAX || b.data_size() > INT32_MAX ||
-                  out_a.data_size() > INT32_MAX,
-              [&](auto large) {
-                using IdxT = std::conditional_t<large(), int64_t, int32_t>;
-                Shape shape;
-                std::vector<Strides> strides;
-                std::tie(shape, strides) =
-                    collapse_contiguous_dims(a, b, out_a);
-                auto& a_strides = strides[0];
-                auto& b_strides = strides[1];
-                int ndim = shape.size();
-                if (ndim <= 3) {
-                  // Up to 3 dims: kernel specialised on the dimension count.
-                  dispatch_1_2_3(ndim, [&](auto dims_constant) {
-                    auto kernel = cu::binary_two_g_nd<
-                        Op,
-                        InType,
-                        OutType,
-                        IdxT,
-                        dims_constant()>;
-                    auto [num_blocks, block_dims] =
-                        get_launch_args(kernel, out_a, large());
-                    encoder.add_kernel_node(
-                        kernel,
-                        num_blocks,
-                        block_dims,
-                        a.data<InType>(),
-                        b.data<InType>(),
-                        out_a.data<OutType>(),
-                        out_b.data<OutType>(),
-                        out_a.size(),
-                        const_param<dims_constant()>(shape),
-                        const_param<dims_constant()>(a_strides),
-                        const_param<dims_constant()>(b_strides));
-                  });
-                } else {
-                  // More than 3 dims: kernel taking ndim at runtime.
-                  auto kernel = cu::binary_two_g<Op, InType, OutType, IdxT>;
-                  auto [num_blocks, block_dims] =
-                      get_launch_args(kernel, out_a, large());
-                  encoder.add_kernel_node(
-                      kernel,
-                      num_blocks,
-                      block_dims,
-                      a.data<InType>(),
-                      b.data<InType>(),
-                      out_a.data<OutType>(),
-                      out_b.data<OutType>(),
-                      out_a.size(),
-                      const_param(shape),
-                      const_param(a_strides),
-                      const_param(b_strides),
-                      ndim);
-                }
-              });
-        } else {
-          // Non-general path: pick the scalar/vector kernel variant matching
-          // bopt; binary_two_ss is the default when no other variant matches.
-          dispatch_bool(out_a.data_size() > UINT32_MAX, [&](auto large) {
-            using IdxT = std::conditional_t<large(), int64_t, uint32_t>;
-            // TODO: Choose optimized value based on type size.
-            constexpr int N_READS = 4;
-            auto kernel = cu::binary_two_ss<Op, InType, OutType, IdxT, N_READS>;
-            if (bopt == BinaryOpType::ScalarVector) {
-              kernel = cu::binary_two_sv<Op, InType, OutType, IdxT, N_READS>;
-            } else if (bopt == BinaryOpType::VectorScalar) {
-              kernel = cu::binary_two_vs<Op, InType, OutType, IdxT, N_READS>;
-            } else if (bopt == BinaryOpType::VectorVector) {
-              kernel = cu::binary_two_vv<Op, InType, OutType, IdxT, N_READS>;
-            }
-            auto [num_blocks, block_dims] = get_launch_args(
-                kernel,
-                out_a.data_size(),
-                out_a.shape(),
-                out_a.strides(),
-                large(),
-                N_READS);
-            encoder.add_kernel_node(
-                kernel,
-                num_blocks,
-                block_dims,
-                a.data<InType>(),
-                b.data<InType>(),
-                out_a.data<OutType>(),
-                out_b.data<OutType>(),
-                out_a.data_size());
-          });
-        }
-      } else {
-        throw std::runtime_error(fmt::format(
-            "Can not do binary op {} on inputs of {} with result of {}.",
-            op,
-            dtype_to_string(a.dtype()),
-            dtype_to_string(out_a.dtype())));
-      }
-    });
-  });
-}
-
-// Evaluates two-output binary op `Op` on inputs[0] and inputs[1] into
-// outputs[0] and outputs[1] on stream `s`. `op` is the op name used in error
-// messages.
-template <typename Op>
-void binary_two_op_gpu(
-    const std::vector<array>& inputs,
-    std::vector<array>& outputs,
-    const char* op,
-    const Stream& s) {
-  // binary_two_op_gpu_inplace already determines the binary op type and calls
-  // set_binary_op_output_data on both outputs, so doing it here as well would
-  // only set up every output a second time.
-  binary_two_op_gpu_inplace<Op>(inputs, outputs, op, s);
-}
-
-// GPU evaluation of DivMod: forwards to the two-output binary path on the
-// stream of the first output's primitive.
-void DivMod::eval_gpu(
-    const std::vector<array>& inputs,
-    std::vector<array>& outputs) {
-  nvtx3::scoped_range range("DivMod::eval_gpu");
-  auto& strm = outputs[0].primitive().stream();
-  binary_two_op_gpu<cu::DivMod>(inputs, outputs, name(), strm);
-}
-
-} // namespace mlx::core
--- a/mlx/backend/cuda/compiled.cpp
+++ b/mlx/backend/cuda/compiled.cpp
@@ -1,231 +0,0 @@
-// Copyright © 2025 Apple Inc.
-
-#include "mlx/backend/common/compiled.h"
-#include "mlx/backend/cuda/device.h"
-#include "mlx/backend/cuda/jit_module.h"
-#include "mlx/backend/cuda/kernel_utils.cuh"
-#include "mlx/graph_utils.h"
-#include "mlx/primitives.h"
-
-#include <fmt/format.h>
-#include <nvtx3/nvtx3.hpp>
-
-namespace mlx::core {
-
-namespace cu {
-
-struct FusedKernelBuilder {
-  std::string os;
-  const std::string& kernel_name;
-  const std::vector<array>& inputs;
-  const std::vector<array>& outputs;
-  const std::vector<array>& tape;
-  const std::function<bool(size_t)>& is_constant;
-
-  void build(const char* name, bool contiguous) {
-    NodeNamer namer;
-
-    // Function parameters.
-    std::vector<std::string> params;
-    for (size_t i = 0; i < inputs.size(); ++i) {
-      if (is_constant(i)) {
-        continue;
-      }
-      const auto& x = inputs[i];
-      const std::string& xname = namer.get_name(x);
-      params.push_back(
-          fmt::format("const {}* {}", dtype_to_cuda_type(x.dtype()), xname));
-      if (!is_scalar(x) && !contiguous) {
-        params.push_back(fmt::format(
-            "const __grid_constant__ cuda::std::array<int64_t, NDIM> {}_strides",
-            xname));
-      }
-    }
-    for (const auto& x : outputs) {
-      params.push_back(fmt::format(
-          "{}* {}", dtype_to_cuda_type(x.dtype()), namer.get_name(x)));
-    }
-    if (!contiguous) {
-      params.push_back(
-          "const __grid_constant__ cuda::std::array<int32_t, NDIM> shape");
-    }
-    params.push_back("IdxT size");
-
-    // Build function signature.
-    if (contiguous) {
-      os += "template <typename IdxT = uint32_t>\n";
-    } else {
-      os += "template <int NDIM, typename IdxT = uint32_t>\n";
-    }
-    os += fmt::format("__global__ void {}(\n", kernel_name + name);
-    for (size_t i = 0; i < params.size(); ++i) {
-      os += "    ";
-      os += params[i];
-      if (i != params.size() - 1) {
-        os += ",\n";
-      }
-    }
-    os += ") {\n";
-
-    // Index.
-    os +=
-        "  IdxT index = cg::this_grid().thread_rank();\n"
-        "  if (index >= size) {\n"
-        "    return;\n"
-        "  }\n";
-
-    // Read inputs.
-    for (size_t i = 0; i < inputs.size(); ++i) {
-      const auto& x = inputs[i];
-      const std::string& xname = namer.get_name(x);
-      std::string type = dtype_to_cuda_type(x.dtype());
-      std::string value;
-      if (is_constant(i)) {
-        std::ostringstream ss;
-        print_constant(ss, x);
-        value = fmt::format("static_cast<{}>({})", type, ss.str());
-      } else if (is_scalar(x)) {
-        value = fmt::format("{}[0]", xname);
-      } else if (contiguous) {
-        value = fmt::format("{}[index]", xname);
-      } else {
-        std::string index = fmt::format(
-            "elem_to_loc_nd<NDIM>(index, shape.data(), {}_strides.data())",
-            xname);
-        value = fmt::format("{}[{}]", xname, index);
-      }
-      os += fmt::format("  {} tmp_{} = {};\n", type, xname, value);
-    }
-
-    // Write tape.
-    for (const auto& x : tape) {
-      const std::string& xname = namer.get_name(x);
-      std::string type = dtype_to_cuda_type(x.dtype());
-      std::string value;
-      if (is_static_cast(x.primitive())) {
-        value = fmt::format(
-            "static_cast<{}>(tmp_{})", type, namer.get_name(x.inputs()[0]));
-      } else {
-        value = x.primitive().name();
-        value += "{}(";
-        for (size_t i = 0; i < x.inputs().size() - 1; ++i) {
-          value += fmt::format("tmp_{}, ", namer.get_name(x.inputs()[i]));
-        }
-        value += fmt::format("tmp_{})", namer.get_name(x.inputs().back()));
-      }
-      os += fmt::format("  {} tmp_{} = {};\n", type, xname, value);
-    }
-
-    // Write output.
-    for (const auto& x : outputs) {
-      os += fmt::format("  {0}[index] = tmp_{0};\n", namer.get_name(x));
-    }
-
-    os += "}\n";
-  }
-};
-
-} // namespace cu
-
-constexpr const char* g_jit_includes = R"(
-#include "mlx/backend/cuda/device/binary_ops.cuh"
-#include "mlx/backend/cuda/device/ternary_ops.cuh"
-#include "mlx/backend/cuda/device/unary_ops.cuh"
-#include "mlx/backend/cuda/device/utils.cuh"
-
-#include <cooperative_groups.h>
-
-#define inf cuda::std::numeric_limits<float>::infinity()
-)";
-
-void Compiled::eval_gpu(
-    const std::vector<array>& inputs,
-    std::vector<array>& outputs) {
-  nvtx3::scoped_range r("Compiled::eval_gpu");
-  auto& s = stream();
-
-  cu::JitModule& mod = cu::get_jit_module(s.device, lib_name(), [&]() {
-    // Build source code.
-    cu::FusedKernelBuilder builder{
-        g_jit_includes, lib_name(), inputs_, outputs_, tape_, is_constant_};
-    builder.os +=
-        "namespace mlx::core::cu {\n\n"
-        "namespace cg = cooperative_groups;\n\n";
-    builder.build("_contiguous", true);
-    builder.os += "\n";
-    builder.build("_strided", false);
-    builder.os += "\n} // namespace mlx::core::cu\n";
-    // Build kernel names.
-    std::vector<std::string> kernel_names = {
-        fmt::format("mlx::core::cu::{}_contiguous<uint32_t>", lib_name()),
-        fmt::format("mlx::core::cu::{}_contiguous<int64_t>", lib_name()),
-    };
-    for (int i = 1; i <= MAX_NDIM; ++i) {
-      kernel_names.push_back(fmt::format(
-          "mlx::core::cu::{}_strided<{}, uint32_t>", lib_name(), i));
-      kernel_names.push_back(
-          fmt::format("mlx::core::cu::{}_strided<{}, int64_t>", lib_name(), i));
-    }
-    return std::make_pair(std::move(builder.os), std::move(kernel_names));
-  });
-
-  // Collapse contiguous dims to route to a faster kernel if possible. Also
-  // handle all broadcasting.
-  auto [contiguous, shape, strides_vec] =
-      compiled_collapse_contiguous_dims(inputs, outputs[0], is_constant_);
-
-  // Whether to use large index.
-  bool large = compiled_use_large_index(inputs, outputs, contiguous);
-
-  cu::KernelArgs args;
-  // Put inputs.
-  int strides_index = 1;
-  for (size_t i = 0; i < inputs.size(); ++i) {
-    if (is_constant_(i)) {
-      continue;
-    }
-    const auto& x = inputs[i];
-    args.append(x);
-    if (!contiguous && !is_scalar(x)) {
-      args.append_ptr(strides_vec[strides_index++].data());
-    }
-  }
-
-  // Put outputs.
-  compiled_allocate_outputs(inputs, outputs, is_constant_, contiguous);
-  for (auto& x : outputs) {
-    args.append(x);
-  }
-
-  // Put shape and size.
-  if (!contiguous) {
-    args.append_ptr(shape.data());
-  }
-  if (large) {
-    args.append<int64_t>(outputs[0].data_size());
-  } else {
-    args.append<uint32_t>(outputs[0].data_size());
-  }
-
-  // Launch kernel.
-  const char* index_type = large ? "int64_t" : "uint32_t";
-  std::string kernel_name = fmt::format("mlx::core::cu::{}", lib_name());
-  if (contiguous) {
-    kernel_name += fmt::format("_contiguous<{}>", index_type);
-  } else {
-    kernel_name += fmt::format("_strided<{}, {}>", shape.size(), index_type);
-  }
-  auto& encoder = cu::get_command_encoder(s);
-  for (const auto& in : inputs) {
-    encoder.set_input_array(in);
-  }
-  for (const auto& out : outputs) {
-    encoder.set_output_array(out);
-  }
-
-  auto kernel = mod.get_kernel(kernel_name);
-  auto [num_blocks, block_dims] = get_launch_args(kernel, outputs[0], large);
-  encoder.add_kernel_node(kernel, num_blocks, block_dims, args.args());
-}
-
-} // namespace mlx::core
--- a/mlx/backend/cuda/copy.cpp
+++ b/mlx/backend/cuda/copy.cpp
@@ -0,0 +1,26 @@
+// Copyright © 2025 Apple Inc.
+
+#include "mlx/backend/gpu/copy.h"
+
+namespace mlx::core {
+
+void copy_gpu_inplace(
+    const array& in,
+    array& out,
+    const Shape& data_shape,
+    const Strides& strides_in_pre,
+    const Strides& strides_out_pre,
+    int64_t inp_offset,
+    int64_t out_offset,
+    CopyType ctype,
+    const Stream& s,
+    const std::optional<array>& dynamic_i_offset /* = std::nullopt */,
+    const std::optional<array>& dynamic_o_offset /* = std::nullopt */) {
+  throw std::runtime_error("copy_gpu_inplace not implemented in CUDA backend.");
+}
+
+void fill_gpu(const array& val, array& out, const Stream& s) {
+  throw std::runtime_error("fill_gpu not implemented in CUDA backend.");
+}
+
+} // namespace mlx::core
--- a/mlx/backend/cuda/copy.cu
+++ b/mlx/backend/cuda/copy.cu
@@ -1,87 +0,0 @@
-// Copyright © 2025 Apple Inc.
-
-#include "mlx/backend/common/utils.h"
-#include "mlx/backend/cuda/copy/copy.cuh"
-
-namespace mlx::core {
-
-void copy_gpu_inplace(
-    const array& in,
-    array& out,
-    const Shape& shape,
-    const Strides& strides_in,
-    const Strides& strides_out,
-    int64_t offset_in,
-    int64_t offset_out,
-    CopyType ctype,
-    const Stream& s,
-    const std::optional<array>& dynamic_offset_in,
-    const std::optional<array>& dynamic_offset_out) {
-  if (out.size() == 0) {
-    return;
-  }
-
-  auto& encoder = cu::get_command_encoder(s);
-  encoder.set_input_array(in);
-  encoder.set_output_array(out);
-  if (ctype == CopyType::Scalar || ctype == CopyType::Vector) {
-    copy_contiguous(encoder, ctype, in, out, offset_in, offset_out);
-    return;
-  }
-
-  if (ctype == CopyType::General || ctype == CopyType::GeneralGeneral) {
-    auto [shape_collapsed, strides_vec] = collapse_contiguous_dims(
-        shape, std::vector{strides_in, strides_out}, INT32_MAX);
-    if (ctype == CopyType::General) {
-      copy_general_input(
-          encoder,
-          ctype,
-          in,
-          out,
-          offset_in,
-          offset_out,
-          shape_collapsed,
-          strides_vec[0]);
-    } else {
-      if (dynamic_offset_in || dynamic_offset_out) {
-        copy_general_dynamic(
-            encoder,
-            ctype,
-            in,
-            out,
-            offset_in,
-            offset_out,
-            shape_collapsed,
-            strides_vec[0],
-            strides_vec[1],
-            dynamic_offset_in ? *dynamic_offset_in : array(0, int64),
-            dynamic_offset_out ? *dynamic_offset_out : array(0, int64));
-      } else {
-        copy_general(
-            encoder,
-            ctype,
-            in,
-            out,
-            offset_in,
-            offset_out,
-            shape_collapsed,
-            strides_vec[0],
-            strides_vec[1]);
-      }
-    }
-    return;
-  }
-}
-
-void fill_gpu(const array& in, array& out, const Stream& s) {
-  if (out.size() == 0) {
-    return;
-  }
-  out.set_data(allocator::malloc(out.nbytes()));
-  auto& encoder = cu::get_command_encoder(s);
-  encoder.set_input_array(in);
-  encoder.set_output_array(out);
-  copy_contiguous(encoder, CopyType::Scalar, in, out, 0, 0);
-}
-
-} // namespace mlx::core
--- a/mlx/backend/cuda/copy/copy.cuh
+++ b/mlx/backend/cuda/copy/copy.cuh
@@ -1,55 +0,0 @@
-// Copyright © 2025 Apple Inc.
-
-#pragma once
-
-#include "mlx/backend/cuda/device.h"
-#include "mlx/backend/cuda/device/cast_op.cuh"
-#include "mlx/backend/cuda/kernel_utils.cuh"
-#include "mlx/backend/gpu/copy.h"
-#include "mlx/dtype_utils.h"
-
-namespace mlx::core {
-
-void copy_contiguous(
-    cu::CommandEncoder& encoder,
-    CopyType ctype,
-    const array& in,
-    array& out,
-    int64_t offset_in,
-    int64_t offset_out);
-
-void copy_general(
-    cu::CommandEncoder& encoder,
-    CopyType ctype,
-    const array& in,
-    array& out,
-    int64_t offset_in,
-    int64_t offset_out,
-    const Shape& shape,
-    const Strides& strides_in,
-    const Strides& strides_out);
-
-void copy_general_dynamic(
-    cu::CommandEncoder& encoder,
-    CopyType ctype,
-    const array& in,
-    array& out,
-    int64_t offset_in,
-    int64_t offset_out,
-    const Shape& shape,
-    const Strides& strides_in,
-    const Strides& strides_out,
-    const array& dynamic_offset_in,
-    const array& dynamic_offset_out);
-
-void copy_general_input(
-    cu::CommandEncoder& encoder,
-    CopyType ctype,
-    const array& in,
-    array& out,
-    int64_t offset_in,
-    int64_t offset_out,
-    const Shape& shape,
-    const Strides& strides_in);
-
-} // namespace mlx::core
--- a/mlx/backend/cuda/copy/copy_contiguous.cu
+++ b/mlx/backend/cuda/copy/copy_contiguous.cu
@@ -1,93 +0,0 @@
-// Copyright © 2025 Apple Inc.
-
-#include "mlx/backend/cuda/copy/copy.cuh"
-
-#include <cooperative_groups.h>
-
-namespace mlx::core {
-
-namespace cu {
-
-namespace cg = cooperative_groups;
-
-template <typename In, typename Out, typename IdxT, int N_READS>
-__global__ void copy_s(const In* in, Out* out, IdxT size) {
-  IdxT index = cg::this_grid().thread_rank();
-
-  if ((index + 1) * N_READS > size) {
-    for (IdxT i = index * N_READS; i < size; ++i) {
-      out[i] = cast_to<Out>(in[0]);
-    }
-  } else {
-    AlignedVector<Out, N_READS> out_vec;
-#pragma unroll
-    for (int i = 0; i < N_READS; ++i) {
-      out_vec.val[i] = cast_to<Out>(in[0]);
-    }
-
-    store_vector<N_READS>(out, index, out_vec);
-  }
-}
-
-template <typename In, typename Out, typename IdxT, int N_READS>
-__global__ void copy_v(const In* in, Out* out, IdxT size) {
-  IdxT index = cg::this_grid().thread_rank();
-
-  if ((index + 1) * N_READS > size) {
-    for (IdxT i = index * N_READS; i < size; ++i) {
-      out[i] = cast_to<Out>(in[i]);
-    }
-  } else {
-    auto in_vec = load_vector<N_READS>(in, index);
-
-    AlignedVector<Out, N_READS> out_vec;
-#pragma unroll
-    for (int i = 0; i < N_READS; ++i) {
-      out_vec.val[i] = cast_to<Out>(in_vec.val[i]);
-    }
-
-    store_vector<N_READS>(out, index, out_vec);
-  }
-}
-
-} // namespace cu
-
-void copy_contiguous(
-    cu::CommandEncoder& encoder,
-    CopyType ctype,
-    const array& in,
-    array& out,
-    int64_t in_offset,
-    int64_t out_offset) {
-  dispatch_all_types(in.dtype(), [&](auto in_type_tag) {
-    dispatch_all_types(out.dtype(), [&](auto out_type_tag) {
-      dispatch_bool(out.data_size() > UINT32_MAX, [&](auto large) {
-        using InType = cuda_type_t<MLX_GET_TYPE(in_type_tag)>;
-        using OutType = cuda_type_t<MLX_GET_TYPE(out_type_tag)>;
-        using IdxT = std::conditional_t<large(), int64_t, uint32_t>;
-        // TODO: Choose optimized value based on type size.
-        constexpr int N_READS = 4;
-        auto kernel = cu::copy_s<InType, OutType, IdxT, N_READS>;
-        if (ctype == CopyType::Vector) {
-          kernel = cu::copy_v<InType, OutType, IdxT, N_READS>;
-        }
-        auto [num_blocks, block_dims] = get_launch_args(
-            kernel,
-            out.data_size(),
-            out.shape(),
-            out.strides(),
-            large(),
-            N_READS);
-        encoder.add_kernel_node(
-            kernel,
-            num_blocks,
-            block_dims,
-            in.data<InType>() + in_offset,
-            out.data<OutType>() + out_offset,
-            out.data_size());
-      });
-    });
-  });
-}
-
-} // namespace mlx::core
--- a/mlx/backend/cuda/copy/copy_general.cu
+++ b/mlx/backend/cuda/copy/copy_general.cu
@@ -1,110 +0,0 @@
-// Copyright © 2025 Apple Inc.
-
-#include "mlx/backend/cuda/copy/copy.cuh"
-
-#include <cooperative_groups.h>
-
-namespace mlx::core {
-
-namespace cu {
-
-namespace cg = cooperative_groups;
-
-template <typename In, typename Out, typename IdxT, int NDIM>
-__global__ void copy_gg_nd(
-    const In* in,
-    Out* out,
-    IdxT size,
-    const __grid_constant__ cuda::std::array<int32_t, NDIM> shape,
-    const __grid_constant__ cuda::std::array<int64_t, NDIM> strides_in,
-    const __grid_constant__ cuda::std::array<int64_t, NDIM> strides_out) {
-  IdxT index = cg::this_grid().thread_rank();
-  if (index < size) {
-    auto [idx_in, idx_out] = elem_to_loc_nd<NDIM>(
-        index, shape.data(), strides_in.data(), strides_out.data());
-    out[idx_out] = CastOp<In, Out>{}(in[idx_in]);
-  }
-}
-
-template <typename In, typename Out, typename IdxT>
-__global__ void copy_gg(
-    const In* in,
-    Out* out,
-    IdxT size,
-    const __grid_constant__ Shape shape,
-    const __grid_constant__ Strides strides_in,
-    const __grid_constant__ Strides strides_out,
-    int ndim) {
-  IdxT index = cg::this_grid().thread_rank();
-  if (index < size) {
-    auto [idx_in, idx_out] = elem_to_loc_4d(
-        index, shape.data(), strides_in.data(), strides_out.data(), ndim);
-    out[idx_out] = CastOp<In, Out>{}(in[idx_in]);
-  }
-}
-
-} // namespace cu
-
-void copy_general(
-    cu::CommandEncoder& encoder,
-    CopyType ctype,
-    const array& in,
-    array& out,
-    int64_t offset_in,
-    int64_t offset_out,
-    const Shape& shape,
-    const Strides& strides_in,
-    const Strides& strides_out) {
-  dispatch_all_types(in.dtype(), [&](auto in_type_tag) {
-    dispatch_all_types(out.dtype(), [&](auto out_type_tag) {
-      dispatch_bool(
-          in.data_size() > INT32_MAX || out.data_size() > INT32_MAX,
-          [&](auto large) {
-            using InType = cuda_type_t<MLX_GET_TYPE(in_type_tag)>;
-            using OutType = cuda_type_t<MLX_GET_TYPE(out_type_tag)>;
-            using IdxT = std::conditional_t<large(), int64_t, int32_t>;
-            const InType* in_ptr = in.data<InType>() + offset_in;
-            OutType* out_ptr = out.data<OutType>() + offset_out;
-            int ndim = shape.size();
-            size_t data_size = 1;
-            for (auto& s : shape)
-              data_size *= s;
-            if (ndim <= 3) {
-              dispatch_1_2_3(ndim, [&](auto ndim_constant) {
-                auto kernel =
-                    cu::copy_gg_nd<InType, OutType, IdxT, ndim_constant()>;
-                auto [num_blocks, block_dims] = get_launch_args(
-                    kernel, data_size, shape, out.strides(), large());
-                encoder.add_kernel_node(
-                    kernel,
-                    num_blocks,
-                    block_dims,
-                    in_ptr,
-                    out_ptr,
-                    data_size,
-                    const_param<ndim_constant()>(shape),
-                    const_param<ndim_constant()>(strides_in),
-                    const_param<ndim_constant()>(strides_out));
-              });
-            } else { // ndim >= 4
-              auto kernel = cu::copy_gg<InType, OutType, IdxT>;
-              auto [num_blocks, block_dims] = get_launch_args(
-                  kernel, data_size, shape, out.strides(), large());
-              encoder.add_kernel_node(
-                  kernel,
-                  num_blocks,
-                  block_dims,
-                  in_ptr,
-                  out_ptr,
-                  data_size,
-                  const_param(shape),
-                  const_param(strides_in),
-                  const_param(strides_out),
-                  ndim);
-            }
-          });
-    });
-  });
-}
-
-} // namespace mlx::core
--- a/mlx/backend/cuda/copy/copy_general_dynamic.cu
+++ b/mlx/backend/cuda/copy/copy_general_dynamic.cu
@@ -1,117 +0,0 @@
-// Copyright © 2025 Apple Inc.
-
-#include "mlx/backend/cuda/copy/copy.cuh"
-
-#include <cooperative_groups.h>
-
-namespace mlx::core {
-
-namespace cu {
-
-namespace cg = cooperative_groups;
-
-template <typename In, typename Out, typename IdxT, int NDIM>
-__global__ void copy_gg_dynamic_nd(
-    const In* in,
-    Out* out,
-    IdxT size,
-    const __grid_constant__ cuda::std::array<int32_t, NDIM> shape,
-    const __grid_constant__ cuda::std::array<int64_t, NDIM> strides_in,
-    const __grid_constant__ cuda::std::array<int64_t, NDIM> strides_out,
-    const int64_t* offset_in,
-    const int64_t* offset_out) {
-  IdxT index = cg::this_grid().thread_rank();
-  if (index < size) {
-    auto [idx_in, idx_out] = elem_to_loc_nd<NDIM>(
-        index, shape.data(), strides_in.data(), strides_out.data());
-    out[idx_out + *offset_out] = CastOp<In, Out>{}(in[idx_in + *offset_in]);
-  }
-}
-
-template <typename In, typename Out, typename IdxT>
-__global__ void copy_gg_dynamic(
-    const In* in,
-    Out* out,
-    IdxT size,
-    const __grid_constant__ Shape shape,
-    const __grid_constant__ Strides strides_in,
-    const __grid_constant__ Strides strides_out,
-    int ndim,
-    const int64_t* offset_in,
-    const int64_t* offset_out) {
-  IdxT index = cg::this_grid().thread_rank();
-  if (index < size) {
-    auto [idx_in, idx_out] = elem_to_loc_4d(
-        index, shape.data(), strides_in.data(), strides_out.data(), ndim);
-    out[idx_out + *offset_out] = CastOp<In, Out>{}(in[idx_in + *offset_in]);
-  }
-}
-
-} // namespace cu
-
-void copy_general_dynamic(
-    cu::CommandEncoder& encoder,
-    CopyType ctype,
-    const array& in,
-    array& out,
-    int64_t offset_in,
-    int64_t offset_out,
-    const Shape& shape,
-    const Strides& strides_in,
-    const Strides& strides_out,
-    const array& dynamic_offset_in,
-    const array& dynamic_offset_out) {
-  dispatch_all_types(in.dtype(), [&](auto in_type_tag) {
-    dispatch_all_types(out.dtype(), [&](auto out_type_tag) {
-      dispatch_bool(
-          in.data_size() > INT32_MAX || out.data_size() > INT32_MAX,
-          [&](auto large) {
-            using InType = cuda_type_t<MLX_GET_TYPE(in_type_tag)>;
-            using OutType = cuda_type_t<MLX_GET_TYPE(out_type_tag)>;
-            using IdxT = std::conditional_t<large(), int64_t, int32_t>;
-            const InType* in_ptr = in.data<InType>() + offset_in;
-            OutType* out_ptr = out.data<OutType>() + offset_out;
-            int ndim = shape.size();
-            if (ndim <= 3) {
-              dispatch_1_2_3(ndim, [&](auto dims_constant) {
-                auto kernel = cu::
-                    copy_gg_dynamic_nd<InType, OutType, IdxT, dims_constant()>;
-                auto [num_blocks, block_dims] =
-                    get_launch_args(kernel, out, large());
-                encoder.add_kernel_node(
-                    kernel,
-                    num_blocks,
-                    block_dims,
-                    in_ptr,
-                    out_ptr,
-                    out.size(),
-                    const_param<dims_constant()>(shape),
-                    const_param<dims_constant()>(strides_in),
-                    const_param<dims_constant()>(strides_out),
-                    dynamic_offset_in.data<int64_t>(),
-                    dynamic_offset_out.data<int64_t>());
-              });
-            } else { // ndim >= 4
-              auto kernel = cu::copy_gg_dynamic<InType, OutType, IdxT>;
-              auto [num_blocks, block_dims] =
-                  get_launch_args(kernel, out, large());
-              encoder.add_kernel_node(
-                  kernel,
-                  num_blocks,
-                  block_dims,
-                  in_ptr,
-                  out_ptr,
-                  out.size(),
-                  const_param(shape),
-                  const_param(strides_in),
-                  const_param(strides_out),
-                  ndim,
-                  dynamic_offset_in.data<int64_t>(),
-                  dynamic_offset_out.data<int64_t>());
-            }
-          });
-    });
-  });
-}
-
-} // namespace mlx::core
--- a/mlx/backend/cuda/copy/copy_general_input.cu
+++ b/mlx/backend/cuda/copy/copy_general_input.cu
@@ -1,100 +0,0 @@
-// Copyright © 2025 Apple Inc.
-
-#include "mlx/backend/cuda/copy/copy.cuh"
-
-#include <cooperative_groups.h>
-
-namespace mlx::core {
-
-namespace cu {
-
-namespace cg = cooperative_groups;
-
-template <typename In, typename Out, typename IdxT, int NDIM>
-__global__ void copy_g_nd(
-    const In* in,
-    Out* out,
-    IdxT size,
-    const __grid_constant__ cuda::std::array<int32_t, NDIM> shape,
-    const __grid_constant__ cuda::std::array<int64_t, NDIM> strides_in) {
-  IdxT index = cg::this_grid().thread_rank();
-  if (index < size) {
-    IdxT idx_in = elem_to_loc_nd<NDIM>(index, shape.data(), strides_in.data());
-    out[index] = CastOp<In, Out>{}(in[idx_in]);
-  }
-}
-
-template <typename In, typename Out, typename IdxT>
-__global__ void copy_g(
-    const In* in,
-    Out* out,
-    IdxT size,
-    const __grid_constant__ Shape shape,
-    const __grid_constant__ Strides strides_in,
-    int ndim) {
-  IdxT index = cg::this_grid().thread_rank();
-  if (index < size) {
-    IdxT idx_in = elem_to_loc_4d(index, shape.data(), strides_in.data(), ndim);
-    out[index] = CastOp<In, Out>{}(in[idx_in]);
-  }
-}
-
-} // namespace cu
-
-void copy_general_input(
-    cu::CommandEncoder& encoder,
-    CopyType ctype,
-    const array& in,
-    array& out,
-    int64_t offset_in,
-    int64_t offset_out,
-    const Shape& shape,
-    const Strides& strides_in) {
-  dispatch_all_types(in.dtype(), [&](auto in_type_tag) {
-    dispatch_all_types(out.dtype(), [&](auto out_type_tag) {
-      dispatch_bool(
-          in.data_size() > INT32_MAX || out.data_size() > INT32_MAX,
-          [&](auto large) {
-            using InType = cuda_type_t<MLX_GET_TYPE(in_type_tag)>;
-            using OutType = cuda_type_t<MLX_GET_TYPE(out_type_tag)>;
-            using IdxT = std::conditional_t<large(), int64_t, int32_t>;
-            const InType* in_ptr = in.data<InType>() + offset_in;
-            OutType* out_ptr = out.data<OutType>() + offset_out;
-            int ndim = shape.size();
-            if (ndim <= 3) {
-              dispatch_1_2_3(ndim, [&](auto dims_constant) {
-                auto kernel =
-                    cu::copy_g_nd<InType, OutType, IdxT, dims_constant()>;
-                auto [num_blocks, block_dims] =
-                    get_launch_args(kernel, out, large());
-                encoder.add_kernel_node(
-                    kernel,
-                    num_blocks,
-                    block_dims,
-                    in_ptr,
-                    out_ptr,
-                    out.size(),
-                    const_param<dims_constant()>(shape),
-                    const_param<dims_constant()>(strides_in));
-              });
-            } else { // ndim >= 4
-              auto kernel = cu::copy_g<InType, OutType, IdxT>;
-              auto [num_blocks, block_dims] =
-                  get_launch_args(kernel, out, large());
-              encoder.add_kernel_node(
-                  kernel,
-                  num_blocks,
-                  block_dims,
-                  in_ptr,
-                  out_ptr,
-                  out.size(),
-                  const_param(shape),
-                  const_param(strides_in),
-                  ndim);
-            }
-          });
-    });
-  });
-}
-
-} // namespace mlx::core
--- a/mlx/backend/cuda/cuda.cpp
+++ b/mlx/backend/cuda/cuda.cpp
@@ -1,11 +0,0 @@
-// Copyright © 2025 Apple Inc.
-
-#include "mlx/backend/cuda/cuda.h"
-
-namespace mlx::core::cu {
-
-bool is_available() {
-  return true;
-}
-
-} // namespace mlx::core::cu
--- a/mlx/backend/cuda/cuda.h
+++ b/mlx/backend/cuda/cuda.h
@@ -1,10 +0,0 @@
-// Copyright © 2025 Apple Inc.
-
-#pragma once
-
-namespace mlx::core::cu {
-
-/* Check if the CUDA backend is available. */
-bool is_available();
-
-} // namespace mlx::core::cu
--- a/mlx/backend/cuda/device.cpp
+++ b/mlx/backend/cuda/device.cpp
@@ -2,49 +2,46 @@

 #include "mlx/backend/cuda/device.h"
 #include "mlx/backend/cuda/worker.h"
-#include "mlx/utils.h"
+#include "mlx/backend/metal/metal.h"

 #include <fmt/format.h>
 #include <nvtx3/nvtx3.hpp>
-#include <future>
-#include <unordered_set>

 namespace mlx::core {

-// Can be tuned with MLX_MAX_OPS_PER_BUFFER
-// This should be less than 255
-constexpr int default_max_nodes_per_graph = 20;
-
-int cuda_graph_cache_size() {
-  static int cache_size = []() {
-    return env::get_var("MLX_CUDA_GRAPH_CACHE_SIZE", 100);
-  }();
-  return cache_size;
-}
-
 namespace cu {

+DeviceStream::DeviceStream(Device& device) : device_(device), stream_(device) {}
+
+void DeviceStream::synchronize() {
+  cudaStreamSynchronize(stream_);
+}
+
+cudaStream_t DeviceStream::schedule_cuda_stream() {
+  // TODO: Return a stream that maximizes parallelism.
+  return stream_;
+}
+
+cudaStream_t DeviceStream::last_cuda_stream() {
+  return stream_;
+}
+
+CommandEncoder& DeviceStream::get_encoder() {
+  if (!encoder_) {
+    encoder_ = std::make_unique<CommandEncoder>(*this);
+  }
+  return *encoder_;
+}
+
 Device::Device(int device) : device_(device) {
-  CHECK_CUDA_ERROR(cudaDeviceGetAttribute(
-      &compute_capability_major_, cudaDevAttrComputeCapabilityMajor, device_));
-  CHECK_CUDA_ERROR(cudaDeviceGetAttribute(
-      &compute_capability_minor_, cudaDevAttrComputeCapabilityMinor, device_));
  // Validate the requirements of device.
  int attr = 0;
-  CHECK_CUDA_ERROR(cudaDeviceGetAttribute(
-      &attr, cudaDevAttrConcurrentManagedAccess, device_));
+  cudaDeviceGetAttribute(&attr, cudaDevAttrConcurrentManagedAccess, device_);
  if (attr != 1) {
    throw std::runtime_error(fmt::format(
        "Device {} does not support synchronization in managed memory.",
        device_));
  }
-  // The cublasLt handle is used by matmul.
-  make_current();
-  cublasLtCreate(&lt_);
-}
-
-Device::~Device() {
-  cublasLtDestroy(lt_);
 }

 void Device::make_current() {
@@ -57,268 +54,45 @@ void Device::make_current() {
  }
 }

-CommandEncoder& Device::get_command_encoder(Stream s) {
-  auto it = encoders_.find(s.index);
-  if (it == encoders_.end()) {
-    it = encoders_.try_emplace(s.index, *this).first;
+DeviceStream& Device::get_stream(Stream s) {
+  auto it = streams_.find(s.index);
+  if (it == streams_.end()) {
+    it = streams_.try_emplace(s.index, *this).first;
  }
  return it->second;
 }

-CommandEncoder::CaptureContext::CaptureContext(CommandEncoder& enc) : enc(enc) {
-  CHECK_CUDA_ERROR(cudaGraphCreate(&graph, 0));
-  CHECK_CUDA_ERROR(
-      cudaStreamBeginCapture(enc.stream(), cudaStreamCaptureModeGlobal));
-}
-
-CommandEncoder::CaptureContext::~CaptureContext() {
-  CHECK_CUDA_ERROR(cudaStreamEndCapture(enc.stream(), &graph));
-  size_t num_nodes;
-  CHECK_CUDA_ERROR(cudaGraphGetNodes(graph, NULL, &num_nodes));
-  if (num_nodes == 1) {
-    cudaGraphNode_t captured_node;
-    CHECK_CUDA_ERROR(cudaGraphGetNodes(graph, &captured_node, &num_nodes));
-    CUDA_KERNEL_NODE_PARAMS params;
-    CHECK_CUDA_ERROR(cuGraphKernelNodeGetParams(captured_node, &params));
-    cudaGraphNode_t node;
-    CHECK_CUDA_ERROR(cuGraphAddKernelNode(&node, enc.graph_, NULL, 0, &params));
-    enc.insert_graph_dependencies(GraphNode{node, 'K'});
-  } else {
-    cudaGraphNode_t node;
-    CHECK_CUDA_ERROR(
-        cudaGraphAddChildGraphNode(&node, enc.graph_, NULL, 0, graph));
-    enc.insert_graph_dependencies(GraphNode{node, 'G'});
-  }
-  CHECK_CUDA_ERROR(cudaGraphDestroy(graph));
-}
-
-CommandEncoder::ConcurrentContext::ConcurrentContext(CommandEncoder& enc)
-    : enc(enc) {
-  enc.in_concurrent_ = true;
-}
-
-CommandEncoder::ConcurrentContext::~ConcurrentContext() {
-  enc.in_concurrent_ = false;
-
-  // Use an empty graph node for synchronization
-  CommandEncoder::GraphNode empty{NULL, 'E', std::to_string(enc.node_count_++)};
-  enc.empty_node_count_++;
-  CHECK_CUDA_ERROR(cudaGraphAddEmptyNode(&empty.node, enc.graph_, NULL, 0));
-
-  // Insert the concurrent -> empty node dependencies
-  for (auto& from : enc.concurrent_nodes_) {
-    enc.from_nodes_.push_back(from.node);
-    enc.to_nodes_.push_back(empty.node);
-    enc.graph_key_ += from.id;
-    enc.graph_key_ += from.node_type;
-    enc.graph_key_ += empty.id;
-    enc.graph_key_ += empty.node_type;
-  }
-
-  // Insert the input -> concurrent node dependencies without updating output
-  // nodes
-  auto outputs = std::move(enc.active_outputs_);
-  enc.insert_graph_dependencies(std::move(enc.concurrent_nodes_));
-
-  // Update output node to be the empty node
-  for (auto o : outputs) {
-    enc.node_map_.emplace(o, empty).first->second = empty;
-  }
-}
-
-void CommandEncoder::insert_graph_dependencies(GraphNode node) {
-  if (node.node_type == 'G') {
-    graph_node_count_++;
-  }
-  node.id = std::to_string(node_count_++);
-  if (in_concurrent_) {
-    concurrent_nodes_.push_back(std::move(node));
-  } else {
-    std::vector<GraphNode> nodes;
-    nodes.push_back(std::move(node));
-    insert_graph_dependencies(std::move(nodes));
-  }
-}
-
-void CommandEncoder::insert_graph_dependencies(std::vector<GraphNode> nodes) {
-  std::vector<GraphNode> deps;
-  {
-    // Dependencies must be added in the same order to produce a consistent
-    // topology
-    std::unordered_set<cudaGraphNode_t> set_deps;
-    for (auto d : active_deps_) {
-      if (auto it = node_map_.find(d); it != node_map_.end()) {
-        auto [_, inserted] = set_deps.insert(it->second.node);
-        if (inserted) {
-          deps.push_back(it->second);
-        }
-      }
-    }
-  }
-  active_deps_.clear();
-
-  for (auto o : active_outputs_) {
-    for (auto& node : nodes) {
-      node_map_.emplace(o, node).first->second = node;
-    }
-  }
-  active_outputs_.clear();
-
-  for (auto& from : deps) {
-    for (auto& to : nodes) {
-      from_nodes_.push_back(from.node);
-      to_nodes_.push_back(to.node);
-      graph_key_ += from.id;
-      graph_key_ += from.node_type;
-      graph_key_ += to.id;
-      graph_key_ += to.node_type;
-    }
-  }
-}
-
-CommandEncoder::CommandEncoder(Device& d) : device_(d), stream_(d) {
-  CHECK_CUDA_ERROR(cudaGraphCreate(&graph_, 0));
-}
-
-void clear_graphs(std::unordered_map<std::string, cudaGraphExec_t>& graphs) {
-  for (auto& [_, graph_exec] : graphs) {
-    CHECK_CUDA_ERROR(cudaGraphExecDestroy(graph_exec));
-  }
-  graphs.clear();
-}
-
-CommandEncoder::~CommandEncoder() {
-  clear_graphs(graph_cache_);
-}
+CommandEncoder::CommandEncoder(DeviceStream& s)
+    : device_(s.device()), stream_(s) {}

 void CommandEncoder::add_completed_handler(std::function<void()> task) {
  worker_.add_task(std::move(task));
 }

-void CommandEncoder::set_input_array(const array& arr) {
-  auto id = reinterpret_cast<std::uintptr_t>(arr.buffer().ptr());
-  active_deps_.push_back(id);
-}
+void CommandEncoder::end_encoding() {
+  if (!temporaries_.empty()) {
+    add_completed_handler([temporaries = std::move(temporaries_)]() {});
+  }

-void CommandEncoder::set_output_array(const array& arr) {
-  auto id = reinterpret_cast<std::uintptr_t>(arr.buffer().ptr());
-  active_deps_.push_back(id);
-  active_outputs_.push_back(id);
-}
+  // There is no kernel running, run completion handlers immediately.
+  if (!has_gpu_work_) {
+    worker_.consume_in_this_thread();
+    return;
+  }
+  has_gpu_work_ = false;

-void CommandEncoder::maybe_commit() {
-  if (node_count_ >= env::max_ops_per_buffer(default_max_nodes_per_graph)) {
+  // Put completion handlers in a batch.
+  worker_.end_batch();
+
+  // Signaling kernel completion is expensive, delay until enough batches.
+  // TODO: This number is arbitrarily picked, profile for a better strategy.
+  if (worker_.uncommited_batches() > 8) {
    commit();
  }
 }

-void CommandEncoder::add_kernel_node(
-    void* func,
-    dim3 grid_dim,
-    dim3 block_dim,
-    void** params) {
-  cudaKernelNodeParams kernel_params = {0};
-  kernel_params.func = func;
-  kernel_params.gridDim = grid_dim;
-  kernel_params.blockDim = block_dim;
-  kernel_params.kernelParams = params;
-  cudaGraphNode_t node;
-  CHECK_CUDA_ERROR(
-      cudaGraphAddKernelNode(&node, graph_, NULL, 0, &kernel_params));
-  insert_graph_dependencies(GraphNode{node, 'K'});
-}
-
-void CommandEncoder::add_kernel_node(
-    CUfunction func,
-    dim3 grid_dim,
-    dim3 block_dim,
-    void** params) {
-  CUDA_KERNEL_NODE_PARAMS kernel_params = {0};
-  kernel_params.func = func;
-  kernel_params.gridDimX = grid_dim.x;
-  kernel_params.gridDimY = grid_dim.y;
-  kernel_params.gridDimZ = grid_dim.z;
-  kernel_params.blockDimX = block_dim.x;
-  kernel_params.blockDimY = block_dim.y;
-  kernel_params.blockDimZ = block_dim.z;
-  kernel_params.kernelParams = params;
-  CUgraphNode node;
-  CHECK_CUDA_ERROR(
-      cuGraphAddKernelNode(&node, graph_, NULL, 0, &kernel_params));
-  insert_graph_dependencies(GraphNode{node, 'K'});
-}
-
 void CommandEncoder::commit() {
-  if (!temporaries_.empty()) {
-    add_completed_handler([temporaries = std::move(temporaries_)]() {});
-  }
-  if (node_count_ > 0) {
-    if (!from_nodes_.empty()) {
-      CHECK_CUDA_ERROR(cudaGraphAddDependencies(
-          graph_, from_nodes_.data(), to_nodes_.data(), from_nodes_.size()));
-    }
-
-    graph_key_ += ".";
-    graph_key_ += std::to_string(node_count_);
-    graph_key_ += ".";
-    graph_key_ += std::to_string(graph_node_count_);
-    graph_key_ += ".";
-    graph_key_ += std::to_string(empty_node_count_);
-
-    cudaGraphExec_t& graph_exec = graph_cache_[graph_key_];
-
-    if (graph_exec != nullptr) {
-      cudaGraphExecUpdateResult update_result;
-#if CUDART_VERSION >= 12000
-      cudaGraphExecUpdateResultInfo info;
-      cudaGraphExecUpdate(graph_exec, graph_, &info);
-      update_result = info.result;
-#else
-      cudaGraphNode_t error_node;
-      cudaGraphExecUpdate(graph_exec, graph_, &error_node, &update_result);
-#endif // CUDART_VERSION >= 12000
-      if (update_result != cudaGraphExecUpdateSuccess) {
-        cudaGetLastError(); // reset error
-        CHECK_CUDA_ERROR(cudaGraphExecDestroy(graph_exec));
-        graph_exec = nullptr;
-      }
-    }
-    if (graph_exec == nullptr) {
-      CHECK_CUDA_ERROR(
-          cudaGraphInstantiate(&graph_exec, graph_, NULL, NULL, 0));
-    }
-    device_.make_current();
-    CHECK_CUDA_ERROR(cudaGraphLaunch(graph_exec, stream_));
-
-    // TODO smarter cache policy
-    if (graph_cache_.size() > cuda_graph_cache_size()) {
-      clear_graphs(graph_cache_);
-    }
-
-    // Reset state
-    node_count_ = 0;
-    graph_node_count_ = 0;
-    from_nodes_.clear();
-    to_nodes_.clear();
-    graph_key_.clear();
-    node_map_.clear();
-    CHECK_CUDA_ERROR(cudaGraphDestroy(graph_));
-    CHECK_CUDA_ERROR(cudaGraphCreate(&graph_, 0));
-  }
-
-  // Put completion handlers in a batch.
-  worker_.end_batch();
-  worker_.commit(stream_);
-}
-
-void CommandEncoder::synchronize() {
-  cudaStreamSynchronize(stream_);
-  auto p = std::make_shared<std::promise<void>>();
-  std::future<void> f = p->get_future();
-  add_completed_handler([p = std::move(p)]() { p->set_value(); });
-  worker_.end_batch();
-  commit();
-  f.wait();
+  worker_.commit(stream_.last_cuda_stream());
 }

 Device& device(mlx::core::Device device) {
@@ -330,8 +104,12 @@ Device& device(mlx::core::Device device) {
  return it->second;
 }

+DeviceStream& get_stream(Stream s) {
+  return device(s.device).get_stream(s);
+}
+
 CommandEncoder& get_command_encoder(Stream s) {
-  return device(s.device).get_command_encoder(s);
+  return get_stream(s).get_encoder();
 }

 } // namespace cu
--- a/mlx/backend/cuda/device.h
+++ b/mlx/backend/cuda/device.h
@@ -6,116 +6,46 @@
 #include "mlx/backend/cuda/worker.h"
 #include "mlx/stream.h"

-#include <cublasLt.h>
-#include <cuda.h>
 #include <thrust/execution_policy.h>

 #include <unordered_map>

 namespace mlx::core::cu {

-class CommandEncoder {
+class Device;
+class CommandEncoder;
+
+class DeviceStream {
 public:
-  struct CaptureContext {
-    CaptureContext(CommandEncoder& enc);
-    ~CaptureContext();
-    cudaGraph_t graph;
-    CommandEncoder& enc;
-  };
-  struct ConcurrentContext {
-    ConcurrentContext(CommandEncoder& enc);
-    ~ConcurrentContext();
-    CommandEncoder& enc;
-  };
+  explicit DeviceStream(Device& device);

-  explicit CommandEncoder(Device& d);
-  ~CommandEncoder();
+  DeviceStream(const DeviceStream&) = delete;
+  DeviceStream& operator=(const DeviceStream&) = delete;

-  CommandEncoder(const CommandEncoder&) = delete;
-  CommandEncoder& operator=(const CommandEncoder&) = delete;
-
-  CaptureContext capture_context() {
-    return CaptureContext{*this};
-  }
-  ConcurrentContext concurrent_context() {
-    return ConcurrentContext{*this};
-  }
-
-  void set_input_array(const array& arr);
-  void set_output_array(const array& arr);
-
-  template <typename F, typename... Params>
-  void
-  add_kernel_node(F* func, dim3 grid_dim, dim3 block_dim, Params&&... params) {
-    constexpr size_t num = sizeof...(Params);
-    void* ptrs[num];
-    size_t i = 0;
-    ([&](auto&& p) { ptrs[i++] = static_cast<void*>(&p); }(
-         std::forward<Params>(params)),
-     ...);
-    add_kernel_node((void*)func, grid_dim, block_dim, ptrs);
-  }
-
-  void add_kernel_node(
-      CUfunction func,
-      dim3 grid_dim,
-      dim3 block_dim,
-      void** params);
-
-  void
-  add_kernel_node(void* func, dim3 grid_dim, dim3 block_dim, void** params);
-
-  void add_temporary(const array& arr) {
-    temporaries_.push_back(arr.data_shared_ptr());
-  }
-
-  void add_completed_handler(std::function<void()> task);
-  void maybe_commit();
-  void commit();
-
-  CudaStream& stream() {
-    return stream_;
-  }
-
-  // Wait until kernels and completion handlers are finished
+  // Wait until kernels in the stream complete.
  void synchronize();

+  // Return a cuda stream for launching kernels.
+  cudaStream_t schedule_cuda_stream();
+
+  // Return the last cuda stream used.
+  cudaStream_t last_cuda_stream();
+
+  CommandEncoder& get_encoder();
+
+  Device& device() {
+    return device_;
+  }
+
 private:
-  struct GraphNode {
-    cudaGraphNode_t node;
-    // K = kernel
-    // E = empty
-    // G = subgraph
-    char node_type;
-    std::string id;
-  };
-
-  void insert_graph_dependencies(GraphNode node);
-  void insert_graph_dependencies(std::vector<GraphNode> nodes);
-
  Device& device_;
  CudaStream stream_;
-  cudaGraph_t graph_;
-  Worker worker_;
-  char node_count_{0};
-  char graph_node_count_{0};
-  char empty_node_count_{0};
-  bool in_concurrent_{false};
-  std::vector<cudaGraphNode_t> from_nodes_;
-  std::vector<cudaGraphNode_t> to_nodes_;
-  std::string graph_key_;
-  std::vector<GraphNode> concurrent_nodes_;
-  std::vector<std::shared_ptr<array::Data>> temporaries_;
-  std::unordered_map<std::string, cudaGraphExec_t> graph_cache_;
-  std::vector<std::uintptr_t> active_deps_;
-  std::vector<std::uintptr_t> active_outputs_;
-  std::unordered_map<std::uintptr_t, GraphNode> node_map_;
+  std::unique_ptr<CommandEncoder> encoder_;
 };

 class Device {
 public:
  explicit Device(int device);
-  ~Device();

  Device(const Device&) = delete;
  Device& operator=(const Device&) = delete;
@@ -123,30 +53,72 @@ class Device {
  // Make this device the current cuda device, required by some cuda calls.
  void make_current();

-  CommandEncoder& get_command_encoder(Stream s);
+  DeviceStream& get_stream(Stream s);

  int cuda_device() const {
    return device_;
  }
-  int compute_capability_major() const {
-    return compute_capability_major_;
-  }
-  int compute_capability_minor() const {
-    return compute_capability_minor_;
-  }
-  cublasLtHandle_t lt_handle() const {
-    return lt_;
-  }

 private:
  int device_;
-  int compute_capability_major_;
-  int compute_capability_minor_;
-  cublasLtHandle_t lt_;
-  std::unordered_map<int, CommandEncoder> encoders_;
+  std::unordered_map<int, DeviceStream> streams_;
+};
+
+class CommandEncoder {
+ public:
+  explicit CommandEncoder(DeviceStream& stream);
+
+  CommandEncoder(const CommandEncoder&) = delete;
+  CommandEncoder& operator=(const CommandEncoder&) = delete;
+
+  void set_input_array(const array& arr) {}
+  void set_output_array(const array& arr) {}
+
+  void add_temporary(const array& arr) {
+    temporaries_.push_back(arr.data_shared_ptr());
+  }
+
+  void add_completed_handler(std::function<void()> task);
+  void end_encoding();
+  void commit();
+
+  // Schedule a cuda stream for |fun| to launch kernels, and check error
+  // afterwards.
+  template <typename F>
+  void launch_kernel(F&& fun) {
+    launch_kernel(stream_.schedule_cuda_stream(), std::forward<F>(fun));
+  }
+
+  template <typename F>
+  void launch_kernel(cudaStream_t stream, F&& fun) {
+    device_.make_current();
+    fun(stream);
+    check_cuda_error("kernel launch", cudaGetLastError());
+    has_gpu_work_ = true;
+  }
+
+  Device& device() {
+    return device_;
+  }
+
+  DeviceStream& stream() {
+    return stream_;
+  }
+
+  bool has_gpu_work() const {
+    return has_gpu_work_;
+  }
+
+ private:
+  Device& device_;
+  DeviceStream& stream_;
+  Worker worker_;
+  bool has_gpu_work_{false};
+  std::vector<std::shared_ptr<array::Data>> temporaries_;
 };

 Device& device(mlx::core::Device device);
+DeviceStream& get_stream(Stream s);
 CommandEncoder& get_command_encoder(Stream s);

 // Return an execution policy that does not sync for result.
--- a/mlx/backend/cuda/device/atomic_ops.cuh
+++ b/mlx/backend/cuda/device/atomic_ops.cuh
@@ -1,67 +0,0 @@
-// Copyright © 2025 Apple Inc.
-
-#pragma once
-
-#include "mlx/backend/cuda/device/complex.cuh"
-#include "mlx/backend/cuda/device/fp16_math.cuh"
-
-#include <cuda/atomic>
-
-namespace mlx::core::cu {
-
-template <typename T>
-inline __device__ void atomic_add(T* out, T val) {
-  cuda::atomic_ref<T, cuda::thread_scope_device> ref(*out);
-  ref += val;
-}
-
-template <typename T>
-inline __device__ void atomic_prod(T* out, T val) {
-  cuda::atomic_ref<T, cuda::thread_scope_device> ref(*out);
-  T old = ref.load();
-  while (!ref.compare_exchange_strong(old, old * val)) {
-  }
-}
-
-template <typename T>
-inline __device__ void atomic_max(T* out, T val) {
-  cuda::atomic_ref<T, cuda::thread_scope_device> ref(*out);
-  ref.fetch_max(val);
-}
-
-template <typename T>
-inline __device__ void atomic_min(T* out, T val) {
-  cuda::atomic_ref<T, cuda::thread_scope_device> ref(*out);
-  ref.fetch_min(val);
-}
-
-// Somehow cuda::atomic_ref does not provide atomic add for following types.
-template <typename T>
-inline __device__ void atomic_add_general(T* out, T val) {
-  cuda::atomic_ref<T, cuda::thread_scope_device> ref(*out);
-  T old = ref.load();
-  while (!ref.compare_exchange_strong(old, old + val)) {
-  }
-}
-
-inline __device__ void atomic_add(__half* out, __half val) {
-  atomicAdd(out, val);
-}
-
-inline __device__ void atomic_add(complex64_t* out, complex64_t val) {
-#if __CUDA_ARCH__ < 900
-  atomic_add_general(out, val);
-#else
-  atomicAdd(out, val);
-#endif
-}
-
-inline __device__ void atomic_add(__nv_bfloat16* out, __nv_bfloat16 val) {
-#if __CUDA_ARCH__ < 800
-  atomic_add_general(out, val);
-#else
-  atomicAdd(out, val);
-#endif
-}
-
-} // namespace mlx::core::cu
--- a/mlx/backend/cuda/device/binary_ops.cuh
+++ b/mlx/backend/cuda/device/binary_ops.cuh
@@ -1,293 +0,0 @@
-// Copyright © 2025 Apple Inc.
-
-#include "mlx/backend/cuda/device/unary_ops.cuh"
-
-#include <cuda/std/array>
-
-namespace mlx::core::cu {
-
-struct Add {
-  template <typename T>
-  __device__ T operator()(T x, T y) {
-    return x + y;
-  }
-};
-
-struct FloorDivide {
-  template <typename T>
-  __device__ T operator()(T x, T y) {
-    if constexpr (cuda::std::is_integral_v<T>) {
-      return x / y;
-    } else {
-      return truncf(x / y);
-    }
-  }
-};
-
-struct Divide {
-  template <typename T>
-  __device__ T operator()(T x, T y) {
-    return x / y;
-  }
-};
-
-struct Remainder {
-  template <typename T>
-  __device__ T operator()(T x, T y) {
-    if constexpr (cuda::std::is_integral_v<T>) {
-      if constexpr (cuda::std::is_signed_v<T>) {
-        auto r = x % y;
-        if (r != 0 && (r < 0 != y < 0)) {
-          r += y;
-        }
-        return r;
-      } else {
-        return x % y;
-      }
-    } else if constexpr (is_complex_v<T>) {
-      return x % y;
-    } else {
-      T r = fmod(x, y);
-      if (r != 0 && (r < 0 != y < 0)) {
-        r = r + y;
-      }
-      return r;
-    }
-  }
-};
-
-struct Equal {
-  template <typename T>
-  __device__ bool operator()(T x, T y) {
-    return x == y;
-  }
-};
-
-struct NaNEqual {
-  template <typename T>
-  __device__ bool operator()(T x, T y) {
-    if constexpr (is_complex_v<T>) {
-      return x == y ||
-          (isnan(x.real()) && isnan(y.real()) && isnan(x.imag()) &&
-           isnan(y.imag())) ||
-          (x.real() == y.real() && isnan(x.imag()) && isnan(y.imag())) ||
-          (isnan(x.real()) && isnan(y.real()) && x.imag() == y.imag());
-    } else {
-      return x == y || (isnan(x) && isnan(y));
-    }
-  }
-};
-
-struct Greater {
-  template <typename T>
-  __device__ bool operator()(T x, T y) {
-    return x > y;
-  }
-};
-
-struct GreaterEqual {
-  template <typename T>
-  __device__ bool operator()(T x, T y) {
-    return x >= y;
-  }
-};
-
-struct Less {
-  template <typename T>
-  __device__ bool operator()(T x, T y) {
-    return x < y;
-  }
-};
-
-struct LessEqual {
-  template <typename T>
-  __device__ bool operator()(T x, T y) {
-    return x <= y;
-  }
-};
-
-struct LogAddExp {
-  template <typename T>
-  __device__ T operator()(T x, T y) {
-    if constexpr (is_complex_v<T>) {
-      if (isnan(x.real()) || isnan(x.imag()) || isnan(y.real()) ||
-          isnan(y.imag())) {
-        return {
-            cuda::std::numeric_limits<float>::quiet_NaN(),
-            cuda::std::numeric_limits<float>::quiet_NaN()};
-      }
-      auto max = x.real() > y.real() ? x : y;
-      auto min = x.real() < y.real() ? x : y;
-      auto min_real = min.real();
-      auto max_real = max.real();
-      if (!isfinite(min_real) && (min_real == max_real)) {
-        if (min_real < 0) {
-          return min;
-        } else {
-          return Log{}(Exp{}(min) + Exp{}(max));
-        }
-      } else {
-        return Log1p{}(Exp{}(min - max)) + max;
-      }
-    } else {
-      if (isnan(x) || isnan(y)) {
-        return cuda::std::numeric_limits<T>::quiet_NaN();
-      }
-      T maxval = max(x, y);
-      T minval = min(x, y);
-      return (minval == -cuda::std::numeric_limits<T>::infinity() ||
-              maxval == cuda::std::numeric_limits<T>::infinity())
-          ? maxval
-          : T(float(maxval) + log1p(expf(minval - maxval)));
-    }
-  };
-};
-
-struct Maximum {
-  template <typename T>
-  __device__ T operator()(T x, T y) {
-    if constexpr (cuda::std::is_integral_v<T>) {
-      return max(x, y);
-    } else if constexpr (is_complex_v<T>) {
-      if (isnan(x.real()) || isnan(x.imag())) {
-        return x;
-      }
-      return x > y ? x : y;
-    } else {
-      if (isnan(x)) {
-        return x;
-      }
-      return x > y ? x : y;
-    }
-  }
-};
-
-struct Minimum {
-  template <typename T>
-  __device__ T operator()(T x, T y) {
-    if constexpr (cuda::std::is_integral_v<T>) {
-      return min(x, y);
-    } else if constexpr (is_complex_v<T>) {
-      if (isnan(x.real()) || isnan(x.imag())) {
-        return x;
-      }
-      return x < y ? x : y;
-    } else {
-      if (isnan(x)) {
-        return x;
-      }
-      return x < y ? x : y;
-    }
-  }
-};
-
-struct Multiply {
-  template <typename T>
-  __device__ T operator()(T x, T y) {
-    return x * y;
-  }
-};
-
-struct NotEqual {
-  template <typename T>
-  __device__ bool operator()(T x, T y) {
-    if constexpr (is_complex_v<T>) {
-      return x.real() != y.real() || x.imag() != y.imag();
-    } else {
-      return x != y;
-    }
-  }
-};
-
-struct Power {
-  template <typename T>
-  __device__ T operator()(T base, T exp) {
-    if constexpr (cuda::std::is_integral_v<T>) {
-      T res = 1;
-      while (exp) {
-        if (exp & 1) {
-          res *= base;
-        }
-        exp >>= 1;
-        base *= base;
-      }
-      return res;
-    } else if constexpr (is_complex_v<T>) {
-      return pow(base, exp);
-    } else {
-      return powf(base, exp);
-    }
-  }
-};
-
-struct Subtract {
-  template <typename T>
-  __device__ T operator()(T x, T y) {
-    return x - y;
-  }
-};
-
-struct LogicalAnd {
-  template <typename T>
-  __device__ T operator()(T x, T y) {
-    return x && y;
-  };
-};
-
-struct LogicalOr {
-  template <typename T>
-  __device__ T operator()(T x, T y) {
-    return x || y;
-  };
-};
-
-struct BitwiseAnd {
-  template <typename T>
-  __device__ T operator()(T x, T y) {
-    return x & y;
-  };
-};
-
-struct BitwiseOr {
-  template <typename T>
-  __device__ T operator()(T x, T y) {
-    return x | y;
-  };
-};
-
-struct BitwiseXor {
-  template <typename T>
-  __device__ T operator()(T x, T y) {
-    return x ^ y;
-  };
-};
-
-struct LeftShift {
-  template <typename T>
-  __device__ T operator()(T x, T y) {
-    return x << y;
-  };
-};
-
-struct RightShift {
-  template <typename T>
-  __device__ T operator()(T x, T y) {
-    return x >> y;
-  };
-};
-
-struct ArcTan2 {
-  template <typename T>
-  __device__ T operator()(T y, T x) {
-    return atan2f(y, x);
-  }
-};
-
-struct DivMod {
-  template <typename T>
-  __device__ cuda::std::array<T, 2> operator()(T x, T y) {
-    return {FloorDivide{}(x, y), Remainder{}(x, y)};
-  };
-};
-
-} // namespace mlx::core::cu
--- a/mlx/backend/cuda/device/cast_op.cuh
+++ b/mlx/backend/cuda/device/cast_op.cuh
@@ -1,130 +0,0 @@
-// Copyright © 2025 Apple Inc.
-
-#pragma once
-
-#include "mlx/backend/cuda/device/complex.cuh"
-
-#include <cuda_bf16.h>
-#include <cuda_fp16.h>
-#include <thrust/iterator/transform_iterator.h>
-
-namespace mlx::core::cu {
-
-// An op that does static_cast, with custom conversions for some types.
-template <typename SrcT, typename DstT, typename = void>
-struct CastOp {
-  static constexpr bool is_castable = cuda::std::is_convertible_v<SrcT, DstT>;
-
-  __device__ DstT operator()(SrcT x) {
-    return static_cast<DstT>(x);
-  }
-};
-
-// Castings between complex and boolean.
-template <typename T>
-struct CastOp<complex_t<T>, bool> {
-  static constexpr bool is_castable = true;
-
-  __device__ bool operator()(complex_t<T> x) {
-    return x.real() != 0 && x.imag() != 0;
-  }
-};
-
-template <typename T>
-struct CastOp<bool, complex_t<T>> {
-  static constexpr bool is_castable = true;
-
-  __device__ complex_t<T> operator()(bool x) {
-    return x ? complex_t<T>{1, 1} : complex_t<T>{0, 0};
-  }
-};
-
-// Converting a complex number to real number discards the imaginary part.
-template <typename T, typename DstT>
-struct CastOp<complex_t<T>, DstT, cuda::std::enable_if_t<!is_complex_v<DstT>>> {
-  static constexpr bool is_castable = cuda::std::is_convertible_v<T, DstT>;
-
-  __device__ DstT operator()(complex_t<T> x) {
-    static_assert(!is_complex_v<DstT>);
-    return static_cast<DstT>(x.real());
-  }
-};
-
-// Allow converting a real number to complex number.
-template <typename SrcT, typename T>
-struct CastOp<SrcT, complex_t<T>, cuda::std::enable_if_t<!is_complex_v<SrcT>>> {
-  static constexpr bool is_castable = cuda::std::is_convertible_v<SrcT, T>;
-
-  __device__ complex_t<T> operator()(SrcT x) {
-    static_assert(!is_complex_v<SrcT>);
-    return complex_t<T>{static_cast<T>(x), 0};
-  }
-};
-
-// Do nothing when no casting is needed.
-template <typename SrcT, typename DstT>
-struct CastOp<
-    SrcT,
-    DstT,
-    cuda::std::enable_if_t<cuda::std::is_same_v<SrcT, DstT>>> {
-  static constexpr bool is_castable = true;
-
-  __device__ SrcT operator()(SrcT x) {
-    return x;
-  }
-};
-
-// In CUDA 11 the half types do not define conversions between some types,
-// provide fallbacks here.
-#if CUDART_VERSION < 12000
-template <typename SrcT, typename DstT>
-struct CastOp<
-    SrcT,
-    DstT,
-    cuda::std::enable_if_t<
-        !cuda::std::is_convertible_v<SrcT, DstT> && !is_complex_v<SrcT> &&
-        (cuda::std::is_same_v<DstT, __half> ||
-         cuda::std::is_same_v<DstT, __nv_bfloat16>)>> {
-  static constexpr bool is_castable = true;
-
-  __device__ DstT operator()(SrcT x) {
-    return DstT(static_cast<float>(x));
-  }
-};
-
-template <typename SrcT, typename DstT>
-struct CastOp<
-    SrcT,
-    DstT,
-    cuda::std::enable_if_t<
-        !cuda::std::is_convertible_v<SrcT, DstT> && !is_complex_v<SrcT> &&
-        !cuda::std::is_same_v<DstT, __half> &&
-        !cuda::std::is_same_v<DstT, __nv_bfloat16> &&
-        (cuda::std::is_same_v<SrcT, __half> ||
-         cuda::std::is_same_v<SrcT, __nv_bfloat16>)>> {
-  static constexpr bool is_castable = true;
-
-  __device__ DstT operator()(SrcT x) {
-    return DstT(static_cast<float>(x));
-  }
-};
-#endif // CUDART_VERSION < 12000
-
-// Helper to deduce the SrcT.
-template <typename DstT, typename SrcT>
-inline __host__ __device__ auto cast_to(SrcT x) {
-  return CastOp<SrcT, DstT>{}(x);
-}
-
-// Return an iterator that cast the value to DstT using CastOp.
-template <typename DstT, typename Iterator>
-inline __host__ __device__ auto make_cast_iterator(Iterator it) {
-  using SrcT = typename cuda::std::iterator_traits<Iterator>::value_type;
-  if constexpr (std::is_same_v<SrcT, DstT>) {
-    return it;
-  } else {
-    return thrust::make_transform_iterator(it, CastOp<SrcT, DstT>{});
-  }
-}
-
-} // namespace mlx::core::cu
--- a/mlx/backend/cuda/device/complex.cuh
+++ b/mlx/backend/cuda/device/complex.cuh
@@ -1,60 +0,0 @@
-// Copyright © 2025 Apple Inc.
-
-#pragma once
-
-// Make multiplication and division faster.
-#define LIBCUDACXX_ENABLE_SIMPLIFIED_COMPLEX_OPERATIONS
-
-#include <cuda/std/complex>
-#include <cuda/std/type_traits>
-
-namespace mlx::core::cu {
-
-// TODO: Consider using a faster implementation as cuda::std::complex has to
-// conform to C++ standard.
-template <typename T>
-using complex_t = cuda::std::complex<T>;
-
-using complex64_t = complex_t<float>;
-using complex128_t = complex_t<double>;
-
-template <typename T>
-struct is_complex : cuda::std::false_type {};
-
-template <typename T>
-struct is_complex<cuda::std::complex<T>> : cuda::std::true_type {};
-
-template <typename T>
-inline constexpr bool is_complex_v = is_complex<T>::value;
-
-// cuda::std::complex is missing some operators.
-template <typename T>
-inline __host__ __device__ complex_t<T> operator%(
-    complex_t<T> a,
-    complex_t<T> b) {
-  T r = a.real() - floor(a.real() / b.real()) * b.real();
-  T i = a.imag() - floor(a.imag() / b.imag()) * b.imag();
-  return complex_t<T>{r, i};
-}
-
-template <typename T>
-inline __host__ __device__ bool operator>(complex_t<T> a, complex_t<T> b) {
-  return (a.real() > b.real()) || (a.real() == b.real() && a.imag() > b.imag());
-}
-
-template <typename T>
-inline __host__ __device__ bool operator<(complex_t<T> a, complex_t<T> b) {
-  return operator>(b, a);
-}
-
-template <typename T>
-inline __host__ __device__ bool operator<=(complex_t<T> a, complex_t<T> b) {
-  return !(a > b);
-}
-
-template <typename T>
-inline __host__ __device__ bool operator>=(complex_t<T> a, complex_t<T> b) {
-  return !(a < b);
-}
-
-} // namespace mlx::core::cu
--- a/mlx/backend/cuda/device/config.h
+++ b/mlx/backend/cuda/device/config.h
@@ -1,12 +0,0 @@
-// Copyright © 2025 Apple Inc.
-
-// This file is used by both CUDA kernel code and host-only C++ code.
-
-#pragma once
-
-// The maximum dimensions of shape/strides passed as kernel parameters.
-#define MAX_NDIM 10
-
-// All existing NVIDIA hardware has a fixed 32 warp size. Though a built-in
-// warpSize variable exists, using it would prevent compile-time optimizations.
-#define WARP_SIZE 32
--- a/mlx/backend/cuda/device/fp16_math.cuh
+++ b/mlx/backend/cuda/device/fp16_math.cuh
@@ -1,194 +0,0 @@
-// Copyright © 2025 Apple Inc.
-
-#pragma once
-
-#include <cuda_bf16.h>
-#include <cuda_fp16.h>
-#include <cuda/std/limits>
-#include <cuda/std/type_traits>
-
-namespace mlx::core::cu {
-
-///////////////////////////////////////////////////////////////////////////////
-// Unary ops for half types.
-///////////////////////////////////////////////////////////////////////////////
-
-#if CUDART_VERSION < 12000 && __CUDA_ARCH__ < 800
-#define MLX_DEFINE_UNARY_OP(NAME, HALF_OP)           \
-  template <typename T>                              \
-  __forceinline__ __device__ auto NAME(T x) {        \
-    if constexpr (cuda::std::is_same_v<T, __half>) { \
-      return HALF_OP(x);                             \
-    } else {                                         \
-      return ::NAME(x);                              \
-    }                                                \
-  }
-#else
-#define MLX_DEFINE_UNARY_OP(NAME, HALF_OP)                         \
-  template <typename T>                                            \
-  __forceinline__ __device__ auto NAME(T x) {                      \
-    if constexpr (cuda::std::is_same_v<T, __half>) {               \
-      return HALF_OP(x);                                           \
-    } else if constexpr (cuda::std::is_same_v<T, __nv_bfloat16>) { \
-      return HALF_OP(x);                                           \
-    } else {                                                       \
-      return ::NAME(x);                                            \
-    }                                                              \
-  }
-#endif
-
-#define MLX_DEFINE_UNARY_OP_FALLBCK(NAME)                          \
-  template <typename T>                                            \
-  __forceinline__ __device__ auto NAME(T x) {                      \
-    if constexpr (cuda::std::is_same_v<T, __half>) {               \
-      return ::NAME(__half2float(x));                              \
-    } else if constexpr (cuda::std::is_same_v<T, __nv_bfloat16>) { \
-      return ::NAME(__bfloat162float(x));                          \
-    } else {                                                       \
-      return ::NAME(x);                                            \
-    }                                                              \
-  }
-
-MLX_DEFINE_UNARY_OP(abs, __habs)
-MLX_DEFINE_UNARY_OP(ceil, hceil)
-MLX_DEFINE_UNARY_OP(cos, hcos)
-MLX_DEFINE_UNARY_OP(exp, hexp)
-MLX_DEFINE_UNARY_OP(floor, hfloor)
-MLX_DEFINE_UNARY_OP(isnan, __hisnan)
-MLX_DEFINE_UNARY_OP(log, hlog)
-MLX_DEFINE_UNARY_OP(log2, hlog2)
-MLX_DEFINE_UNARY_OP(log10, hlog10)
-MLX_DEFINE_UNARY_OP(rint, hrint)
-MLX_DEFINE_UNARY_OP(rsqrt, hrsqrt)
-MLX_DEFINE_UNARY_OP(sin, hsin)
-MLX_DEFINE_UNARY_OP(sqrt, hsqrt)
-MLX_DEFINE_UNARY_OP_FALLBCK(acos)
-MLX_DEFINE_UNARY_OP_FALLBCK(acosh)
-MLX_DEFINE_UNARY_OP_FALLBCK(asin)
-MLX_DEFINE_UNARY_OP_FALLBCK(asinh)
-MLX_DEFINE_UNARY_OP_FALLBCK(atan)
-MLX_DEFINE_UNARY_OP_FALLBCK(atanh)
-MLX_DEFINE_UNARY_OP_FALLBCK(cosh)
-MLX_DEFINE_UNARY_OP_FALLBCK(log1p)
-MLX_DEFINE_UNARY_OP_FALLBCK(sinh)
-MLX_DEFINE_UNARY_OP_FALLBCK(tan)
-#if __CUDA_ARCH__ >= 1280
-MLX_DEFINE_UNARY_OP(tanh, htanh)
-#else
-MLX_DEFINE_UNARY_OP_FALLBCK(tanh)
-#endif
-
-#undef MLX_DEFINE_UNARY_OP
-#undef MLX_DEFINE_UNARY_OP_FALLBCK
-
-///////////////////////////////////////////////////////////////////////////////
-// Binary ops for half types.
-///////////////////////////////////////////////////////////////////////////////
-
-#if CUDART_VERSION < 12000 && __CUDA_ARCH__ < 800
-#define MLX_DEFINE_BINARY_OP(NAME, HALF_OP)          \
-  template <typename T>                              \
-  __forceinline__ __device__ auto NAME(T x, T y) {   \
-    if constexpr (cuda::std::is_same_v<T, __half>) { \
-      return HALF_OP(x, y);                          \
-    } else {                                         \
-      return ::NAME(x, y);                           \
-    }                                                \
-  }
-#else
-#define MLX_DEFINE_BINARY_OP(NAME, HALF_OP)                        \
-  template <typename T>                                            \
-  __forceinline__ __device__ auto NAME(T x, T y) {                 \
-    if constexpr (cuda::std::is_same_v<T, __half>) {               \
-      return HALF_OP(x, y);                                        \
-    } else if constexpr (cuda::std::is_same_v<T, __nv_bfloat16>) { \
-      return HALF_OP(x, y);                                        \
-    } else {                                                       \
-      return ::NAME(x, y);                                         \
-    }                                                              \
-  }
-#endif
-
-MLX_DEFINE_BINARY_OP(max, __hmax)
-MLX_DEFINE_BINARY_OP(min, __hmin)
-
-#undef MLX_DEFINE_BINARY_OP
-
-template <typename T>
-__forceinline__ __device__ T fmod(T x, T y) {
-  if constexpr (cuda::std::is_same_v<T, __half>) {
-    return __float2half(::fmod(__half2float(x), __half2float(y)));
-#if CUDART_VERSION >= 12000 || __CUDA_ARCH__ >= 800
-  } else if constexpr (cuda::std::is_same_v<T, __nv_bfloat16>) {
-    return __float2bfloat16(::fmod(__bfloat162float(x), __bfloat162float(y)));
-#endif
-  } else {
-    return ::fmod(x, y);
-  }
-}
-
-///////////////////////////////////////////////////////////////////////////////
-// Additional C++ operator overrides between half types and native types.
-///////////////////////////////////////////////////////////////////////////////
-
-template <typename T, typename U>
-constexpr bool is_integral_except =
-    cuda::std::is_integral_v<T> && !cuda::std::is_same_v<T, U>;
-
-template <typename T, typename U>
-constexpr bool is_arithmetic_except =
-    cuda::std::is_arithmetic_v<T> && !cuda::std::is_same_v<T, U>;
-
-#define MLX_DEFINE_HALF_OP(HALF, HALF2FLOAT, FLOAT2HALF, OP)          \
-  template <                                                          \
-      typename T,                                                     \
-      typename = cuda::std::enable_if_t<is_integral_except<T, HALF>>> \
-  __forceinline__ __device__ HALF operator OP(HALF x, T y) {          \
-    return FLOAT2HALF(HALF2FLOAT(x) OP static_cast<float>(y));        \
-  }                                                                   \
-  template <                                                          \
-      typename T,                                                     \
-      typename = cuda::std::enable_if_t<is_integral_except<T, HALF>>> \
-  __forceinline__ __device__ HALF operator OP(T x, HALF y) {          \
-    return FLOAT2HALF(static_cast<float>(x) OP HALF2FLOAT(y));        \
-  }
-
-#define MLX_DEFINE_HALF_CMP(HALF, HALF2FLOAT, OP)                       \
-  template <                                                            \
-      typename T,                                                       \
-      typename = cuda::std::enable_if_t<is_arithmetic_except<T, HALF>>> \
-  __forceinline__ __device__ bool operator OP(HALF x, T y) {            \
-    return HALF2FLOAT(x) OP static_cast<float>(y);                      \
-  }                                                                     \
-  template <                                                            \
-      typename T,                                                       \
-      typename = cuda::std::enable_if_t<is_arithmetic_except<T, HALF>>> \
-  __forceinline__ __device__ bool operator OP(T x, HALF y) {            \
-    return static_cast<float>(y) OP HALF2FLOAT(x);                      \
-  }
-
-MLX_DEFINE_HALF_OP(__half, __half2float, __float2half, +)
-MLX_DEFINE_HALF_OP(__half, __half2float, __float2half, -)
-MLX_DEFINE_HALF_OP(__half, __half2float, __float2half, *)
-MLX_DEFINE_HALF_OP(__half, __half2float, __float2half, /)
-MLX_DEFINE_HALF_OP(__nv_bfloat16, __bfloat162float, __float2bfloat16, +)
-MLX_DEFINE_HALF_OP(__nv_bfloat16, __bfloat162float, __float2bfloat16, -)
-MLX_DEFINE_HALF_OP(__nv_bfloat16, __bfloat162float, __float2bfloat16, *)
-MLX_DEFINE_HALF_OP(__nv_bfloat16, __bfloat162float, __float2bfloat16, /)
-MLX_DEFINE_HALF_CMP(__half, __half2float, <)
-MLX_DEFINE_HALF_CMP(__half, __half2float, >)
-MLX_DEFINE_HALF_CMP(__half, __half2float, <=)
-MLX_DEFINE_HALF_CMP(__half, __half2float, >=)
-MLX_DEFINE_HALF_CMP(__half, __half2float, ==)
-MLX_DEFINE_HALF_CMP(__half, __half2float, !=)
-MLX_DEFINE_HALF_CMP(__nv_bfloat16, __bfloat162float, <)
-MLX_DEFINE_HALF_CMP(__nv_bfloat16, __bfloat162float, >)
-MLX_DEFINE_HALF_CMP(__nv_bfloat16, __bfloat162float, <=)
-MLX_DEFINE_HALF_CMP(__nv_bfloat16, __bfloat162float, >=)
-MLX_DEFINE_HALF_CMP(__nv_bfloat16, __bfloat162float, ==)
-MLX_DEFINE_HALF_CMP(__nv_bfloat16, __bfloat162float, !=)
-
-#undef MLX_DEFINE_HALF_OP
-#undef MLX_DEFINE_HALF_CMP
-
-} // namespace mlx::core::cu
--- a/mlx/backend/cuda/device/gather.cuh
+++ b/mlx/backend/cuda/device/gather.cuh
@@ -1,53 +0,0 @@
-// Copyright © 2025 Apple Inc.
-
-#include "mlx/backend/cuda/device/indexing.cuh"
-#include "mlx/backend/cuda/device/utils.cuh"
-
-#include <cooperative_groups.h>
-
-namespace mlx::core::cu {
-
-namespace cg = cooperative_groups;
-
-template <typename T, typename IdxT, int NIDX, int IDX_NDIM, typename LocT>
-__global__ void gather(
-    const T* src,
-    T* out,
-    LocT size,
-    const __grid_constant__ Shape src_shape,
-    const __grid_constant__ Strides src_strides,
-    int32_t src_ndim,
-    const __grid_constant__ Shape slice_sizes,
-    uint32_t slice_size,
-    const __grid_constant__ cuda::std::array<int32_t, NIDX> axes,
-    const __grid_constant__ cuda::std::array<IdxT*, NIDX> indices,
-    const __grid_constant__ cuda::std::array<int32_t, NIDX * IDX_NDIM>
-        indices_shape,
-    const __grid_constant__ cuda::std::array<int64_t, NIDX * IDX_NDIM>
-        indices_strides) {
-  LocT out_idx = cg::this_grid().thread_rank();
-  if (out_idx >= size) {
-    return;
-  }
-
-  LocT src_elem = out_idx % slice_size;
-  LocT idx_elem = out_idx / slice_size;
-
-  LocT src_loc =
-      elem_to_loc(src_elem, slice_sizes.data(), src_strides.data(), src_ndim);
-
-#pragma unroll
-  for (int i = 0; i < NIDX; ++i) {
-    LocT idx_loc = elem_to_loc_nd<IDX_NDIM>(
-        idx_elem,
-        indices_shape.data() + i * IDX_NDIM,
-        indices_strides.data() + i * IDX_NDIM);
-    int32_t axis = axes[i];
-    LocT idx_val = absolute_index(indices[i][idx_loc], src_shape[axis]);
-    src_loc += idx_val * src_strides[axis];
-  }
-
-  out[out_idx] = src[src_loc];
-}
-
-} // namespace mlx::core::cu
--- a/mlx/backend/cuda/device/gather_axis.cuh
+++ b/mlx/backend/cuda/device/gather_axis.cuh
@@ -1,65 +0,0 @@
-// Copyright © 2025 Apple Inc.
-
-#include "mlx/backend/cuda/device/indexing.cuh"
-#include "mlx/backend/cuda/device/utils.cuh"
-
-#include <cooperative_groups.h>
-
-namespace mlx::core::cu {
-
-namespace cg = cooperative_groups;
-
-template <
-    typename T,
-    typename IdxT,
-    int NDIM,
-    bool SrcC,
-    bool IdxC,
-    typename LocT>
-__global__ void gather_axis(
-    const T* src,
-    const IdxT* indices,
-    T* out,
-    LocT idx_size_pre,
-    LocT idx_size_axis,
-    LocT idx_size_post,
-    const __grid_constant__ cuda::std::array<int32_t, NDIM> shape,
-    const __grid_constant__ cuda::std::array<int64_t, NDIM> src_strides,
-    const __grid_constant__ cuda::std::array<int64_t, NDIM> idx_strides,
-    int32_t axis,
-    int32_t axis_size,
-    int64_t src_stride_axis,
-    int64_t idx_stride_axis) {
-  LocT index = cg::this_grid().thread_rank();
-  if (index >= idx_size_pre * idx_size_axis * idx_size_post) {
-    return;
-  }
-
-  auto [x, y, z] = index_to_dims(index, idx_size_axis, idx_size_pre);
-
-  LocT elem_idx = z * idx_size_post;
-
-  LocT idx_loc = y * idx_stride_axis;
-  if constexpr (IdxC) {
-    idx_loc += elem_idx * idx_size_axis + x;
-  } else {
-    idx_loc +=
-        elem_to_loc_nd<NDIM>(elem_idx + x, shape.data(), idx_strides.data());
-  }
-
-  auto idx_val = absolute_index(indices[idx_loc], axis_size);
-
-  LocT src_loc = idx_val * src_stride_axis;
-  if constexpr (SrcC) {
-    src_loc += elem_idx * axis_size + x;
-  } else {
-    src_loc +=
-        elem_to_loc_nd<NDIM>(elem_idx + x, shape.data(), src_strides.data());
-  }
-
-  LocT out_idx = y * idx_size_post + elem_idx * idx_size_axis + x;
-
-  out[out_idx] = src[src_loc];
-}
-
-} // namespace mlx::core::cu
--- a/mlx/backend/cuda/device/indexing.cuh
+++ b/mlx/backend/cuda/device/indexing.cuh
@@ -1,30 +0,0 @@
-// Copyright © 2025 Apple Inc.
-
-#include <cuda/std/tuple>
-#include <cuda/std/type_traits>
-
-namespace mlx::core::cu {
-
-// Convert an absolute index to positions in a 3d grid, assuming the index is
-// calculated with:
-// index = x * dim1 * dim2 + y * dim2 + z
-template <typename T>
-inline __host__ __device__ cuda::std::tuple<T, T, T>
-index_to_dims(T index, T dim1, T dim2) {
-  T x = index / (dim1 * dim2);
-  T y = (index % (dim1 * dim2)) / dim2;
-  T z = index % dim2;
-  return cuda::std::make_tuple(x, y, z);
-}
-
-// Get absolute index from possible negative index.
-template <typename IdxT>
-inline __host__ __device__ auto absolute_index(IdxT idx, int32_t size) {
-  if constexpr (cuda::std::is_unsigned_v<IdxT>) {
-    return idx;
-  } else {
-    return static_cast<int32_t>(idx < 0 ? idx + size : idx);
-  }
-}
-
-} // namespace mlx::core::cu
--- a/mlx/backend/cuda/device/scatter.cuh
+++ b/mlx/backend/cuda/device/scatter.cuh
@@ -1,68 +0,0 @@
-// Copyright © 2025 Apple Inc.
-
-#include "mlx/backend/cuda/device/indexing.cuh"
-#include "mlx/backend/cuda/device/scatter_ops.cuh"
-#include "mlx/backend/cuda/device/utils.cuh"
-
-#include <cooperative_groups.h>
-
-namespace mlx::core::cu {
-
-namespace cg = cooperative_groups;
-
-template <
-    typename T,
-    typename IdxT,
-    typename Op,
-    int NIDX,
-    int IDX_NDIM,
-    typename LocT>
-__global__ void scatter(
-    const T* upd,
-    T* out,
-    LocT size,
-    const __grid_constant__ Shape upd_shape,
-    const __grid_constant__ Strides upd_strides,
-    int32_t upd_ndim,
-    LocT upd_post_idx_size,
-    const __grid_constant__ Shape out_shape,
-    const __grid_constant__ Strides out_strides,
-    int32_t out_ndim,
-    const __grid_constant__ cuda::std::array<int32_t, NIDX> axes,
-    const __grid_constant__ cuda::std::array<IdxT*, NIDX> indices,
-    const __grid_constant__ cuda::std::array<int32_t, NIDX * IDX_NDIM>
-        indices_shape,
-    const __grid_constant__ cuda::std::array<int64_t, NIDX * IDX_NDIM>
-        indices_strides) {
-  LocT upd_idx = cg::this_grid().thread_rank();
-  if (upd_idx >= size) {
-    return;
-  }
-
-  LocT out_elem = upd_idx % upd_post_idx_size;
-  LocT idx_elem = upd_idx / upd_post_idx_size;
-
-  LocT out_idx = elem_to_loc(
-      out_elem, upd_shape.data() + IDX_NDIM, out_strides.data(), out_ndim);
-
-#pragma unroll
-  for (int i = 0; i < NIDX; ++i) {
-    LocT idx_loc = elem_to_loc_nd<IDX_NDIM>(
-        idx_elem,
-        indices_shape.data() + i * IDX_NDIM,
-        indices_strides.data() + i * IDX_NDIM);
-    int32_t axis = axes[i];
-    LocT idx_val = absolute_index(indices[i][idx_loc], out_shape[axis]);
-    out_idx += idx_val * out_strides[axis];
-  }
-
-  LocT upd_loc = elem_to_loc(
-      out_elem + idx_elem * upd_post_idx_size,
-      upd_shape.data(),
-      upd_strides.data(),
-      upd_ndim);
-
-  Op{}(out + out_idx, upd[upd_loc]);
-}
-
-} // namespace mlx::core::cu
--- a/mlx/backend/cuda/device/scatter_axis.cuh
+++ b/mlx/backend/cuda/device/scatter_axis.cuh
@@ -1,67 +0,0 @@
-// Copyright © 2025 Apple Inc.
-
-#include "mlx/backend/cuda/device/indexing.cuh"
-#include "mlx/backend/cuda/device/scatter_ops.cuh"
-#include "mlx/backend/cuda/device/utils.cuh"
-
-#include <cooperative_groups.h>
-
-namespace mlx::core::cu {
-
-namespace cg = cooperative_groups;
-
-template <
-    typename T,
-    typename IdxT,
-    typename Op,
-    int NDIM,
-    bool UpdC,
-    bool IdxC,
-    typename LocT>
-__global__ void scatter_axis(
-    const T* upd,
-    const IdxT* indices,
-    T* out,
-    LocT idx_size_pre,
-    LocT idx_size_axis,
-    LocT idx_size_post,
-    const __grid_constant__ cuda::std::array<int32_t, NDIM> shape,
-    const __grid_constant__ cuda::std::array<int64_t, NDIM> upd_strides,
-    const __grid_constant__ cuda::std::array<int64_t, NDIM> idx_strides,
-    int32_t axis,
-    int32_t axis_size,
-    int64_t upd_stride_axis,
-    int64_t idx_stride_axis) {
-  LocT index = cg::this_grid().thread_rank();
-  if (index >= idx_size_pre * idx_size_axis * idx_size_post) {
-    return;
-  }
-
-  auto [x, y, z] = index_to_dims(index, idx_size_axis, idx_size_pre);
-
-  LocT elem_idx = z * idx_size_post;
-
-  LocT idx_loc = y * idx_stride_axis;
-  if constexpr (IdxC) {
-    idx_loc += elem_idx * idx_size_axis + x;
-  } else {
-    idx_loc +=
-        elem_to_loc_nd<NDIM>(elem_idx + x, shape.data(), idx_strides.data());
-  }
-
-  auto idx_val = absolute_index(indices[idx_loc], axis_size);
-
-  LocT upd_loc = y * upd_stride_axis;
-  if constexpr (UpdC) {
-    upd_loc += elem_idx * idx_size_axis + x;
-  } else {
-    upd_loc +=
-        elem_to_loc_nd<NDIM>(elem_idx + x, shape.data(), upd_strides.data());
-  }
-
-  LocT out_idx = idx_val * idx_size_post + elem_idx * axis_size + x;
-
-  Op{}(out + out_idx, upd[upd_loc]);
-}
-
-} // namespace mlx::core::cu
--- a/mlx/backend/cuda/device/scatter_ops.cuh
+++ b/mlx/backend/cuda/device/scatter_ops.cuh
@@ -1,44 +0,0 @@
-// Copyright © 2025 Apple Inc.
-
-#pragma once
-
-#include "mlx/backend/cuda/device/atomic_ops.cuh"
-
-namespace mlx::core::cu {
-
-struct ScatterAssign {
-  template <typename T>
-  __device__ void operator()(T* out, T val) const {
-    *out = val;
-  }
-};
-
-struct ScatterSum {
-  template <typename T>
-  __device__ void operator()(T* out, T val) const {
-    atomic_add(out, val);
-  }
-};
-
-struct ScatterProd {
-  template <typename T>
-  __device__ void operator()(T* out, T val) const {
-    atomic_prod(out, val);
-  }
-};
-
-struct ScatterMax {
-  template <typename T>
-  __device__ void operator()(T* out, T val) const {
-    atomic_max(out, val);
-  }
-};
-
-struct ScatterMin {
-  template <typename T>
-  __device__ void operator()(T* out, T val) const {
-    atomic_min(out, val);
-  }
-};
-
-} // namespace mlx::core::cu
--- a/mlx/backend/cuda/device/ternary_ops.cuh
+++ b/mlx/backend/cuda/device/ternary_ops.cuh
@@ -1,13 +0,0 @@
-// Copyright © 2025 Apple Inc.
-#pragma once
-
-namespace mlx::core::cu {
-
-struct Select {
-  template <typename T>
-  __device__ T operator()(bool condition, T x, T y) {
-    return condition ? x : y;
-  }
-};
-
-} // namespace mlx::core::cu
--- a/mlx/backend/cuda/device/unary_ops.cuh
+++ b/mlx/backend/cuda/device/unary_ops.cuh
@@ -1,337 +0,0 @@
-// Copyright © 2025 Apple Inc.
-
-#pragma once
-
-#include "mlx/backend/cuda/device/fp16_math.cuh"
-#include "mlx/backend/cuda/device/utils.cuh"
-
-#include <math_constants.h>
-
-namespace mlx::core::cu {
-
-struct Abs {
-  template <typename T>
-  __device__ T operator()(T x) {
-    if constexpr (cuda::std::is_unsigned_v<T>) {
-      return x;
-    } else {
-      return abs(x);
-    }
-  }
-};
-
-struct ArcCos {
-  template <typename T>
-  __device__ T operator()(T x) {
-    return acos(x);
-  }
-};
-
-struct ArcCosh {
-  template <typename T>
-  __device__ T operator()(T x) {
-    return acosh(x);
-  }
-};
-
-struct ArcSin {
-  template <typename T>
-  __device__ T operator()(T x) {
-    return asin(x);
-  }
-};
-
-struct ArcSinh {
-  template <typename T>
-  __device__ T operator()(T x) {
-    return asinh(x);
-  }
-};
-
-struct ArcTan {
-  template <typename T>
-  __device__ T operator()(T x) {
-    return atan(x);
-  }
-};
-
-struct ArcTanh {
-  template <typename T>
-  __device__ T operator()(T x) {
-    return atanh(x);
-  }
-};
-
-struct BitwiseInvert {
-  template <typename T>
-  __device__ T operator()(T x) {
-    return ~x;
-  }
-};
-
-struct Ceil {
-  template <typename T>
-  __device__ T operator()(T x) {
-    if constexpr (cuda::std::is_integral_v<T>) {
-      return x;
-    } else if constexpr (is_complex_v<T>) {
-      return T{ceil(x.real()), ceil(x.imag())};
-    } else {
-      return ceil(x);
-    }
-  }
-};
-
-struct Conjugate {
-  template <typename T>
-  __device__ complex_t<T> operator()(complex_t<T> x) {
-    return conj(x);
-  }
-};
-
-struct Cos {
-  template <typename T>
-  __device__ T operator()(T x) {
-    return cos(x);
-  }
-};
-
-struct Cosh {
-  template <typename T>
-  __device__ T operator()(T x) {
-    return cosh(x);
-  }
-};
-
-struct Erf {
-  template <typename T>
-  __device__ T operator()(T x) {
-    if constexpr (cuda::std::is_same_v<T, __half>) {
-      return erf(__half2float(x));
-    } else if constexpr (cuda::std::is_same_v<T, __nv_bfloat16>) {
-      return erf(__bfloat162float(x));
-    } else {
-      return erf(x);
-    }
-  }
-};
-
-struct ErfInv {
-  template <typename T>
-  __device__ T operator()(T x) {
-    if constexpr (cuda::std::is_same_v<T, __half>) {
-      return erfinv(__half2float(x));
-    } else if constexpr (cuda::std::is_same_v<T, __nv_bfloat16>) {
-      return erfinv(__bfloat162float(x));
-    } else {
-      return erfinv(x);
-    }
-  }
-};
-
-struct Exp {
-  template <typename T>
-  __device__ T operator()(T x) {
-    return exp(x);
-  }
-};
-
-struct Expm1 {
-  template <typename T>
-  __device__ T operator()(T x) {
-    if constexpr (cuda::std::is_same_v<T, __half>) {
-      return expm1(__half2float(x));
-    } else if constexpr (cuda::std::is_same_v<T, __nv_bfloat16>) {
-      return expm1(__bfloat162float(x));
-    } else {
-      return expm1(x);
-    }
-  }
-};
-
-struct Floor {
-  template <typename T>
-  __device__ T operator()(T x) {
-    if constexpr (cuda::std::is_integral_v<T>) {
-      return x;
-    } else if constexpr (is_complex_v<T>) {
-      return T{floor(x.real()), floor(x.imag())};
-    } else {
-      return floor(x);
-    }
-  }
-};
-
-struct Imag {
-  template <typename T>
-  __device__ auto operator()(complex_t<T> x) {
-    return x.imag();
-  }
-};
-
-struct Log {
-  template <typename T>
-  __device__ T operator()(T x) {
-    return log(x);
-  }
-};
-
-struct Log2 {
-  template <typename T>
-  __device__ T operator()(T x) {
-    if constexpr (is_complex_v<T>) {
-      auto y = Log{}(x);
-      return {y.real() / CUDART_LN2_F, y.imag() / CUDART_LN2_F};
-    } else {
-      return log2(x);
-    }
-  }
-};
-
-struct Log10 {
-  template <typename T>
-  __device__ T operator()(T x) {
-    return log10(x);
-  }
-};
-
-struct Log1p {
-  template <typename T>
-  __device__ T operator()(T z) {
-    if constexpr (is_complex_v<T>) {
-      float x = z.real();
-      float y = z.imag();
-      float zabs = Abs{}(z).real();
-      float theta = atan2f(y, x + 1);
-      if (zabs < 0.5f) {
-        float r = x * (2 + x) + y * y;
-        if (r == 0) { // handle underflow
-          return {x, theta};
-        }
-        return {0.5f * log1pf(r), theta};
-      } else {
-        float z0 = hypotf(x + 1, y);
-        return {logf(z0), theta};
-      }
-    } else {
-      return log1p(z);
-    }
-  }
-};
-
-struct LogicalNot {
-  __device__ bool operator()(bool x) {
-    return !x;
-  }
-};
-
-struct Negative {
-  template <typename T>
-  __device__ T operator()(T x) {
-    if constexpr (is_complex_v<T>) {
-      return T{0, 0} - x;
-    } else {
-      return -x;
-    }
-  }
-};
-
-struct Real {
-  template <typename T>
-  __device__ auto operator()(complex_t<T> x) {
-    return x.real();
-  }
-};
-
-struct Round {
-  template <typename T>
-  __device__ T operator()(T x) {
-    if constexpr (is_complex_v<T>) {
-      return {rint(x.real()), rint(x.imag())};
-    } else {
-      return rint(x);
-    }
-  }
-};
-
-struct Sigmoid {
-  template <typename T>
-  __device__ T operator()(T x) {
-    T y = 1 / (1 + exp(-abs(x)));
-    return (x < 0) ? 1 - y : y;
-  }
-};
-
-struct Sign {
-  template <typename T>
-  __device__ T operator()(T x) {
-    if constexpr (cuda::std::is_unsigned_v<T>) {
-      return x != 0;
-    } else if constexpr (is_complex_v<T>) {
-      if (x.real() == 0 && x.imag() == 0) {
-        return x;
-      } else {
-        return x / Abs()(x);
-      }
-    } else if constexpr (cuda::std::is_same_v<T, __nv_bfloat16>) {
-      return static_cast<float>((x > T(0.f)) - (x < T(0.f)));
-    } else {
-      return (x > T(0)) - (x < T(0));
-    }
-  }
-};
-
-struct Sin {
-  template <typename T>
-  __device__ T operator()(T x) {
-    return sin(x);
-  }
-};
-
-struct Sinh {
-  template <typename T>
-  __device__ T operator()(T x) {
-    return sinh(x);
-  }
-};
-
-struct Square {
-  template <typename T>
-  __device__ T operator()(T x) {
-    return x * x;
-  }
-};
-
-struct Sqrt {
-  template <typename T>
-  __device__ T operator()(T x) {
-    return sqrt(x);
-  }
-};
-
-struct Rsqrt {
-  template <typename T>
-  __device__ T operator()(T x) {
-    if constexpr (is_complex_v<T>) {
-      return 1.0f / Sqrt{}(x);
-    } else {
-      return rsqrt(x);
-    }
-  }
-};
-
-struct Tan {
-  template <typename T>
-  __device__ T operator()(T x) {
-    return tan(x);
-  }
-};
-
-struct Tanh {
-  template <typename T>
-  __device__ T operator()(T x) {
-    return tanh(x);
-  }
-};
-
-} // namespace mlx::core::cu
--- a/mlx/backend/cuda/device/utils.cuh
+++ b/mlx/backend/cuda/device/utils.cuh
@@ -1,362 +0,0 @@
-// Copyright © 2025 Apple Inc.
-
-// This file must not include any host-only code, utilies that work under both
-// host and device can be put here.
-//
-// See more about the requirements at:
-// https://docs.nvidia.com/cuda/nvrtc/#language
-
-#pragma once
-
-#include "mlx/backend/cuda/device/complex.cuh"
-#include "mlx/backend/cuda/device/config.h"
-
-#include <cuda_bf16.h>
-#include <cuda_fp16.h>
-#include <cuda/std/array>
-#include <cuda/std/limits>
-#include <cuda/std/tuple>
-
-namespace mlx::core::cu {
-
-///////////////////////////////////////////////////////////////////////////////
-// CUDA kernel utils
-///////////////////////////////////////////////////////////////////////////////
-
-// To pass shape/strides to kernels via constant memory, their size must be
-// known at compile time.
-using Shape = cuda::std::array<int32_t, MAX_NDIM>;
-using Strides = cuda::std::array<int64_t, MAX_NDIM>;
-
-// Vectorized load/store.
-template <typename T, int N>
-struct alignas(sizeof(T) * N) AlignedVector {
-  T val[N];
-};
-
-template <int N, typename T>
-inline __device__ AlignedVector<T, N> load_vector(
-    const T* ptr,
-    uint32_t offset) {
-  auto* from = reinterpret_cast<const AlignedVector<T, N>*>(ptr);
-  return from[offset];
-}
-
-template <int N, typename T>
-inline __device__ void
-store_vector(T* ptr, uint32_t offset, const AlignedVector<T, N>& vec) {
-  auto* to = reinterpret_cast<AlignedVector<T, N>*>(ptr);
-  to[offset] = vec;
-}
-
-///////////////////////////////////////////////////////////////////////////////
-// Type limits utils
-///////////////////////////////////////////////////////////////////////////////
-
-template <typename T, typename = void>
-struct Limits {
-  static constexpr __host__ __device__ T max() {
-    return cuda::std::numeric_limits<T>::max();
-  }
-  static constexpr __host__ __device__ T min() {
-    return cuda::std::numeric_limits<T>::min();
-  }
-  static constexpr __host__ __device__ T finite_max() {
-    return cuda::std::numeric_limits<T>::max();
-  }
-  static constexpr __host__ __device__ T finite_min() {
-    return cuda::std::numeric_limits<T>::min();
-  }
-};
-
-template <typename T>
-struct Limits<
-    T,
-    cuda::std::enable_if_t<
-        cuda::std::is_same_v<T, float> || cuda::std::is_same_v<T, double>>> {
-  static constexpr __host__ __device__ T max() {
-    return cuda::std::numeric_limits<T>::infinity();
-  }
-  static constexpr __host__ __device__ T min() {
-    return -cuda::std::numeric_limits<T>::infinity();
-  }
-  static constexpr __host__ __device__ T finite_max() {
-    return cuda::std::numeric_limits<T>::max();
-  }
-  static constexpr __host__ __device__ T finite_min() {
-    return cuda::std::numeric_limits<T>::lowest();
-  }
-};
-
-// CUDA 11 does not have host side arithmatic operators for half types.
-template <typename T>
-struct Limits<
-    T,
-    cuda::std::enable_if_t<
-        cuda::std::is_same_v<T, __half> ||
-        cuda::std::is_same_v<T, __nv_bfloat16>>> {
-  static constexpr __host__ __device__ T max() {
-    return cuda::std::numeric_limits<T>::infinity();
-  }
-  static constexpr __host__ __device__ T min() {
-#if CUDART_VERSION < 12000 && __CUDA_ARCH__ < 800
-    return -cuda::std::numeric_limits<float>::infinity();
-#else
-    return -cuda::std::numeric_limits<T>::infinity();
-#endif
-  }
-  static constexpr __host__ __device__ T finite_max() {
-    return cuda::std::numeric_limits<T>::max();
-  }
-  static constexpr __host__ __device__ T finite_min() {
-#if CUDART_VERSION < 12000 && __CUDA_ARCH__ < 800
-    return cuda::std::numeric_limits<float>::lowest();
-#else
-    return cuda::std::numeric_limits<T>::lowest();
-#endif
-  }
-};
-
-template <>
-struct Limits<bool> {
-  static constexpr __host__ __device__ bool max() {
-    return true;
-  }
-  static constexpr __host__ __device__ bool min() {
-    return false;
-  }
-};
-
-template <typename T>
-struct Limits<complex_t<T>> {
-  static constexpr __host__ __device__ complex_t<T> max() {
-    return {Limits<T>::max(), Limits<T>::max()};
-  }
-  static constexpr __host__ __device__ complex_t<T> min() {
-    return {Limits<T>::min(), Limits<T>::min()};
-  }
-};
-
-///////////////////////////////////////////////////////////////////////////////
-// Indexing utils
-///////////////////////////////////////////////////////////////////////////////
-
-template <typename IdxT = int64_t>
-inline __host__ __device__ IdxT
-elem_to_loc(IdxT elem, const int* shape, const int64_t* strides, int ndim) {
-  IdxT loc = 0;
-  for (int i = ndim - 1; i >= 0 && elem > 0; --i) {
-    loc += (elem % shape[i]) * IdxT(strides[i]);
-    elem /= shape[i];
-  }
-  return loc;
-}
-
-// Optimize when the ndim is known at compile time.
-template <int NDIM, typename IdxT = int64_t>
-inline __host__ __device__ IdxT
-elem_to_loc_nd(IdxT elem, const int* shape, const int64_t* strides) {
-  IdxT loc = 0;
-#pragma unroll
-  for (int i = NDIM - 1; i >= 0; --i) {
-    loc += (elem % shape[i]) * IdxT(strides[i]);
-    elem /= shape[i];
-  }
-  return loc;
-}
-
-template <int NDIM, typename IdxT = int64_t>
-inline __host__ __device__ cuda::std::tuple<IdxT, IdxT> elem_to_loc_nd(
-    IdxT elem,
-    const int* shape,
-    const int64_t* a_strides,
-    const int64_t* b_strides) {
-  IdxT a_loc = 0;
-  IdxT b_loc = 0;
-#pragma unroll
-  for (int i = NDIM - 1; i >= 0; --i) {
-    int dim_idx = elem % shape[i];
-    a_loc += dim_idx * IdxT(a_strides[i]);
-    b_loc += dim_idx * IdxT(b_strides[i]);
-    elem /= shape[i];
-  }
-  return cuda::std::make_tuple(a_loc, b_loc);
-}
-
-template <int NDIM, typename IdxT = int64_t>
-inline __host__ __device__ cuda::std::tuple<IdxT, IdxT, IdxT> elem_to_loc_nd(
-    IdxT elem,
-    const int* shape,
-    const int64_t* a_strides,
-    const int64_t* b_strides,
-    const int64_t* c_strides) {
-  IdxT a_loc = 0;
-  IdxT b_loc = 0;
-  IdxT c_loc = 0;
-#pragma unroll
-  for (int i = NDIM - 1; i >= 0; --i) {
-    int dim_idx = elem % shape[i];
-    a_loc += dim_idx * IdxT(a_strides[i]);
-    b_loc += dim_idx * IdxT(b_strides[i]);
-    c_loc += dim_idx * IdxT(c_strides[i]);
-    elem /= shape[i];
-  }
-  return cuda::std::make_tuple(a_loc, b_loc, c_loc);
-}
-
-// Optimized version when ndim is larger than 4.
-template <typename IdxT = int64_t>
-inline __host__ __device__ IdxT
-elem_to_loc_4d(IdxT elem, const int* shape, const int64_t* strides, int ndim) {
-  IdxT loc = 0;
-  for (int i = ndim - 1; i >= 0; --i) {
-    loc += (elem % shape[i]) * IdxT(strides[i]);
-    elem /= shape[i];
-  }
-  return loc;
-}
-
-template <typename IdxT = int64_t>
-inline __host__ __device__ cuda::std::tuple<IdxT, IdxT> elem_to_loc_4d(
-    IdxT elem,
-    const int* shape,
-    const int64_t* a_strides,
-    const int64_t* b_strides,
-    int ndim) {
-  IdxT a_loc = 0;
-  IdxT b_loc = 0;
-  for (int i = ndim - 1; i >= 0; --i) {
-    int dim_idx = elem % shape[i];
-    a_loc += dim_idx * IdxT(a_strides[i]);
-    b_loc += dim_idx * IdxT(b_strides[i]);
-    elem /= shape[i];
-  }
-  return cuda::std::make_tuple(a_loc, b_loc);
-}
-
-template <typename IdxT = int64_t>
-inline __host__ __device__ cuda::std::tuple<IdxT, IdxT, IdxT> elem_to_loc_4d(
-    IdxT elem,
-    const int* shape,
-    const int64_t* a_strides,
-    const int64_t* b_strides,
-    const int64_t* c_strides,
-    int ndim) {
-  IdxT a_loc = 0;
-  IdxT b_loc = 0;
-  IdxT c_loc = 0;
-  for (int i = ndim - 1; i >= 0; --i) {
-    int dim_idx = elem % shape[i];
-    a_loc += dim_idx * IdxT(a_strides[i]);
-    b_loc += dim_idx * IdxT(b_strides[i]);
-    c_loc += dim_idx * IdxT(c_strides[i]);
-    elem /= shape[i];
-  }
-  return cuda::std::make_tuple(a_loc, b_loc, c_loc);
-}
-
-///////////////////////////////////////////////////////////////////////////////
-// Elem to loc in a loop utils
-///////////////////////////////////////////////////////////////////////////////
-
-template <int DIM, bool General = true, typename OffsetT = size_t>
-struct LoopedElemToLoc {
-  int dim;
-  LoopedElemToLoc<DIM - 1, General, OffsetT> inner_looper;
-  OffsetT offset{0};
-  int index{0};
-
-  __device__ LoopedElemToLoc(int dim) : dim(dim), inner_looper(dim - 1) {}
-
-  __device__ void next(const int* shape, const int64_t* strides) {
-    if (dim == 0) {
-      return;
-    }
-    index++;
-    offset += OffsetT(strides[dim - 1]);
-    if (index >= shape[dim - 1]) {
-      index = 0;
-      inner_looper.next(shape, strides);
-      offset = inner_looper.offset;
-    }
-  }
-
-  __device__ void next(int n, const int* shape, const int64_t* strides) {
-    if (dim == 0) {
-      return;
-    }
-    index += n;
-    offset += n * OffsetT(strides[dim - 1]);
-
-    if (index >= shape[dim - 1]) {
-      int extra = index - shape[dim - 1];
-      if (extra >= shape[dim - 1]) {
-        inner_looper.next(1 + extra / shape[dim - 1], shape, strides);
-        extra = extra % shape[dim - 1];
-      } else {
-        inner_looper.next(shape, strides);
-      }
-      index = 0;
-      offset = inner_looper.offset;
-      if (extra > 0) {
-        next(extra, shape, strides);
-      }
-    }
-  }
-
-  __device__ OffsetT location() {
-    return offset;
-  }
-};
-
-template <typename OffsetT>
-struct LoopedElemToLoc<1, true, OffsetT> {
-  int dim;
-  OffsetT offset{0};
-  int index{0};
-
-  __device__ LoopedElemToLoc(int dim) : dim(dim) {}
-
-  __device__ void next(const int* shape, const int64_t* strides) {
-    index++;
-    if (dim > 1) {
-      offset = elem_to_loc<OffsetT>(index, shape, strides, dim);
-    } else {
-      offset += OffsetT(strides[0]);
-    }
-  }
-
-  __device__ void next(int n, const int* shape, const int64_t* strides) {
-    index += n;
-    if (dim > 1) {
-      offset = elem_to_loc<OffsetT>(index, shape, strides, dim);
-    } else {
-      offset = index * OffsetT(strides[0]);
-    }
-  }
-
-  __device__ OffsetT location() {
-    return offset;
-  }
-};
-
-template <typename OffsetT>
-struct LoopedElemToLoc<1, false, OffsetT> {
-  OffsetT offset{0};
-
-  __device__ LoopedElemToLoc(int) {}
-
-  __device__ void next(const int*, const int64_t* strides) {
-    offset += OffsetT(strides[0]);
-  }
-
-  __device__ void next(int n, const int*, const int64_t* strides) {
-    offset += n * OffsetT(strides[0]);
-  }
-
-  __device__ OffsetT location() {
-    return offset;
-  }
-};
-
-} // namespace mlx::core::cu
--- a/mlx/backend/cuda/dtype_utils.cuh
+++ b/mlx/backend/cuda/dtype_utils.cuh
@@ -0,0 +1,35 @@
+// Copyright © 2025 Apple Inc.
+
+#pragma once
+
+#include <cuComplex.h>
+#include <cuda_bf16.h>
+#include <cuda_fp16.h>
+
+namespace mlx::core {
+
+// Maps CPU types to the types used to represent them in CUDA code.
+template <typename T>
+struct CTypeToCudaType {
+  using type = T; // Default: types with no specialization map to themselves.
+};
+
+template <> // NOTE(review): float16_t isn't declared by the includes above.
+struct CTypeToCudaType<float16_t> {
+  using type = __half; // CUDA half-precision type from <cuda_fp16.h>.
+};
+
+template <>
+struct CTypeToCudaType<bfloat16_t> {
+  using type = __nv_bfloat16; // CUDA bfloat16 type from <cuda_bf16.h>.
+};
+
+template <>
+struct CTypeToCudaType<complex64_t> {
+  using type = cuComplex; // Single-precision complex from <cuComplex.h>.
+};
+
+template <typename T> // Shorthand for CTypeToCudaType<T>::type.
+using cuda_type_t = typename CTypeToCudaType<T>::type;
+
+} // namespace mlx::core
--- a/mlx/backend/cuda/eval.cpp
+++ b/mlx/backend/cuda/eval.cpp
@@ -37,20 +37,22 @@ void eval(array& arr) {
  }

  auto& encoder = cu::get_command_encoder(arr.primitive().stream());
-  // Keep used buffers alive until kernel finishes running.
-  std::unordered_set<std::shared_ptr<array::Data>> buffers;
-  for (auto& in : arr.inputs()) {
-    buffers.insert(in.data_shared_ptr());
+  if (encoder.has_gpu_work()) {
+    // Keep used buffers alive until kernel finishes running.
+    std::unordered_set<std::shared_ptr<array::Data>> buffers;
+    for (auto& in : arr.inputs()) {
+      buffers.insert(in.data_shared_ptr());
+    }
+    for (auto& s : arr.siblings()) {
+      buffers.insert(s.data_shared_ptr());
+    }
+    // Remove the output if it was donated to by an input.
+    if (auto it = buffers.find(arr.data_shared_ptr()); it != buffers.end()) {
+      buffers.erase(it);
+    }
+    encoder.add_completed_handler([buffers = std::move(buffers)]() {});
  }
-  for (auto& s : arr.siblings()) {
-    buffers.insert(s.data_shared_ptr());
-  }
-  // Remove the output if it was donated to by an input.
-  if (auto it = buffers.find(arr.data_shared_ptr()); it != buffers.end()) {
-    buffers.erase(it);
-  }
-  encoder.add_completed_handler([buffers = std::move(buffers)]() {});
-  encoder.maybe_commit();
+  encoder.end_encoding();
 }

 void finalize(Stream s) {
@@ -60,7 +62,7 @@ void finalize(Stream s) {

 void synchronize(Stream s) {
  nvtx3::scoped_range r("gpu::synchronize");
-  cu::get_command_encoder(s).synchronize();
+  cu::get_stream(s).synchronize();
 }

 } // namespace mlx::core::gpu
--- a/mlx/backend/cuda/event.cu
+++ b/mlx/backend/cuda/event.cu
@@ -1,6 +1,5 @@
 // Copyright © 2024 Apple Inc.

-#include "mlx/backend/cuda/allocator.h"
 #include "mlx/backend/cuda/device.h"
 #include "mlx/backend/cuda/event.h"
 #include "mlx/backend/cuda/utils.h"
@@ -61,9 +60,7 @@ void CudaEvent::wait(Stream s) {
  if (s.device == mlx::core::Device::cpu) {
    scheduler::enqueue(s, [*this]() mutable { wait(); });
  } else {
-    auto& enc = cu::get_command_encoder(s);
-    enc.commit();
-    wait(enc.stream());
+    wait(cu::get_stream(s).last_cuda_stream());
  }
 }

@@ -76,9 +73,7 @@ void CudaEvent::record(Stream s) {
  if (s.device == mlx::core::Device::cpu) {
    throw std::runtime_error("CudaEvent can not wait on cpu stream.");
  } else {
-    auto& enc = cu::get_command_encoder(s);
-    enc.commit();
-    record(enc.stream());
+    record(cu::get_stream(s).last_cuda_stream());
  }
 }

@@ -90,6 +85,8 @@ bool CudaEvent::completed() const {
 // SharedEvent implementations
 ///////////////////////////////////////////////////////////////////////////////

+namespace {
+
 __host__ __device__ void event_wait(SharedEvent::Atomic* ac, uint64_t value) {
  uint64_t current;
  while ((current = ac->load()) < value) {
@@ -110,14 +107,16 @@ __global__ void event_signal_kernel(SharedEvent::Atomic* ac, uint64_t value) {
  event_signal(ac, value);
 }

+} // namespace
+
 SharedEvent::SharedEvent() {
   // Allocate cuda::atomic on managed memory.
-  Atomic* ac;
-  CHECK_CUDA_ERROR(cudaMallocManaged(&ac, sizeof(Atomic)));
+  allocator::Buffer buffer = allocator::malloc(sizeof(Atomic));
+  Atomic* ac = static_cast<Atomic*>(buffer.raw_ptr()); // NOTE(review): still managed memory? Confirm.
   new (ac) Atomic(0);
-  ac_ = std::shared_ptr<Atomic>(ac, [](Atomic* ptr) {
+  ac_ = std::shared_ptr<Atomic>(ac, [buffer](Atomic* ptr) { // Deleter holds a copy of `buffer`.
     ptr->~Atomic();
-    allocator().cuda_free(ptr);
+    allocator::free(buffer); // Destructor runs first, then the storage is released.
   });
 }

@@ -136,9 +135,11 @@ void SharedEvent::wait(Stream s, uint64_t value) {
    scheduler::enqueue(s, [*this, value]() mutable { wait(value); });
  } else {
    auto& encoder = get_command_encoder(s);
-    encoder.commit();
-    wait(encoder.stream(), value);
+    encoder.launch_kernel(
+        encoder.stream().last_cuda_stream(),
+        [this, value](cudaStream_t stream) { wait(stream, value); });
    encoder.add_completed_handler([ac = ac_]() {});
+    encoder.end_encoding();
  }
 }

@@ -154,15 +155,14 @@ void SharedEvent::signal(cudaStream_t stream, uint64_t value) {
 void SharedEvent::signal(Stream s, uint64_t value) {
   nvtx3::scoped_range r("cu::SharedEvent::signal(s)");
   if (s.device == mlx::core::Device::cpu) {
-    // Signal through a GPU stream so the atomic is updated in GPU - updating
-    // the atomic in CPU sometimes does not get GPU notified.
-    static CudaStream stream(device(mlx::core::Device::gpu));
-    scheduler::enqueue(s, [*this, value]() mutable { signal(stream, value); });
+    scheduler::enqueue(s, [*this, value]() mutable { signal(value); }); // NOTE(review): confirm GPU sees CPU-side signal.
   } else {
     auto& encoder = get_command_encoder(s);
-    encoder.commit();
-    signal(encoder.stream(), value);
+    encoder.launch_kernel( // NOTE(review): lambda captures `this`; confirm it cannot outlive *this.
+        encoder.stream().last_cuda_stream(),
+        [this, value](cudaStream_t stream) { signal(stream, value); });
     encoder.add_completed_handler([ac = ac_]() {});
+    encoder.end_encoding();
   }
 }

--- a/mlx/backend/cuda/fence.cpp
+++ b/mlx/backend/cuda/fence.cpp
@@ -1,29 +0,0 @@
-// Copyright © 2025 Apple Inc.
-
-#include "mlx/fence.h"
-#include "mlx/backend/cuda/event.h"
-
-namespace mlx::core {
-
-struct FenceImpl {
-  uint32_t count;
-  cu::SharedEvent event;
-};
-
-Fence::Fence(Stream s) {
-  fence_ = std::shared_ptr<void>(
-      new FenceImpl{0}, [](void* ptr) { delete static_cast<FenceImpl*>(ptr); });
-}
-
-void Fence::wait(Stream s, const array&) {
-  auto* fence = static_cast<FenceImpl*>(fence_.get());
-  fence->event.wait(fence->count);
-}
-
-void Fence::update(Stream s, const array&) {
-  auto* fence = static_cast<FenceImpl*>(fence_.get());
-  fence->count++;
-  fence->event.signal(s, fence->count);
-}
-
-} // namespace mlx::core
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Angelos Katharopoulos	83762691ba	Fix four step fft	2025-05-08 14:14:59 -07:00
Angelos Katharopoulos	2a41caa00e	Add single kernel bluestein	2025-05-08 13:23:11 -07:00
Angelos Katharopoulos	6593281d25	Refactored four-step	2025-05-08 13:23:11 -07:00
Angelos Katharopoulos	da98e8bce8	Refactored stockham	2025-05-08 13:23:11 -07:00
Angelos Katharopoulos	be57a16a80	More tmp fft changes	2025-05-08 13:23:11 -07:00
Angelos Katharopoulos	1704809f29	Tmp FFT commit	2025-05-08 13:23:11 -07:00