Improve the ring backend initialization

[CUDA] Implement Scan kernel (#2347)
* Contiguous scan * Strided scan * Enable tests * Fix failing logaddexp test * Use cexpf in Metal
2025-09-05 16:13:52 +08:00 · 2025-07-11 15:31:28 -07:00 · 2025-07-10 16:54:12 -07:00 · 2025-07-10 16:28:50 -07:00 · 2025-07-10 11:14:42 -07:00 · 2025-07-10 07:24:21 -07:00
331 changed files with 24538 additions and 5310 deletions
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -16,6 +16,9 @@ parameters:
  linux_release:
    type: boolean
    default: false
+  cuda_release:
+    type: boolean
+    default: false

 jobs:
  build_documentation:
@@ -38,7 +41,7 @@ jobs:
            pip install --upgrade pip
            pip install --upgrade cmake
            pip install -r docs/requirements.txt
-            CMAKE_BUILD_PARALLEL_LEVEL=`sysctl -n hw.ncpu` pip install . -v
+            pip install . -v
      - when:
          condition:
            not: << parameters.upload-docs >>
@@ -94,17 +97,15 @@ jobs:
          name: Install Python package
          command: |
            CMAKE_ARGS="-DMLX_BUILD_METAL=OFF" \
-              CMAKE_BUILD_PARALLEL_LEVEL=`nproc` \
              python3 setup.py build_ext --inplace
            CMAKE_ARGS="-DMLX_BUILD_METAL=OFF" \
-              CMAKE_BUILD_PARALLEL_LEVEL=`nproc` \
              python3 setup.py develop
      - run:
          name: Generate package stubs
          command: |
            echo "stubs"
            pip install typing_extensions
-            python setup.py generate_stubs 
+            python setup.py generate_stubs
      - run:
          name: Run Python tests
          command: |
@@ -154,15 +155,14 @@ jobs:
          name: Install Python package
          command: |
            source env/bin/activate
-            DEBUG=1 CMAKE_BUILD_PARALLEL_LEVEL=`sysctl -n hw.ncpu` \
-            CMAKE_ARGS="-DCMAKE_COMPILE_WARNING_AS_ERROR=ON" \
+            DEBUG=1 CMAKE_ARGS="-DCMAKE_COMPILE_WARNING_AS_ERROR=ON" \
              pip install -e . -v
      - run:
          name: Generate package stubs
          command: |
            source env/bin/activate
            pip install typing_extensions
-            python setup.py generate_stubs 
+            python setup.py generate_stubs
      - run:
          name: Run Python tests
          command: |
@@ -205,13 +205,34 @@ jobs:
          name: Run Python tests with JIT
          command: |
            source env/bin/activate
-            CMAKE_BUILD_PARALLEL_LEVEL=`sysctl -n hw.ncpu` \
-              CMAKE_ARGS="-DMLX_METAL_JIT=ON" \
+            CMAKE_ARGS="-DMLX_METAL_JIT=ON" \
              pip install -e . -v
            LOW_MEMORY=1 DEVICE=gpu METAL_DEVICE_WRAPPER_TYPE=1 \
              METAL_DEBUG_ERROR_MODE=0 \
              python -m xmlrunner discover -v python/tests -o test-results/gpu_jit

+  cuda_build_and_test:
+    machine:
+      image: linux-cuda-12:default
+      resource_class: gpu.nvidia.small.gen2
+    steps:
+      - checkout
+      - run:
+          name: Install Python package
+          command: |
+            sudo apt-get update
+            sudo apt-get install libblas-dev liblapack-dev liblapacke-dev
+            python -m venv env
+            source env/bin/activate
+            CMAKE_ARGS="-DMLX_BUILD_CUDA=ON -DCMAKE_CUDA_COMPILER=`which nvcc`" \
+              pip install -e ".[dev]"
+      - run:
+          name: Run Python tests
+          command: |
+            source env/bin/activate
+            LOW_MEMORY=1 DEVICE=cpu python -m unittest discover python/tests -v
+            LOW_MEMORY=1 DEVICE=gpu python -m tests discover python/tests -v
+
  build_release:
    parameters:
      python_version:
@@ -252,21 +273,18 @@ jobs:
          command: |
            source env/bin/activate
            env -u MACOSX_DEPLOYMENT_TARGET DEV_RELEASE=1 \
-              CMAKE_BUILD_PARALLEL_LEVEL=`sysctl -n hw.ncpu` \
              pip install . -v
      - run:
          name: Generate package stubs
          command: |
            source env/bin/activate
            pip install typing_extensions
-            python setup.py generate_stubs 
+            python setup.py generate_stubs
      - run:
          name: Build Python package
          command: |
            source env/bin/activate
-            << parameters.build_env >> \
-              CMAKE_BUILD_PARALLEL_LEVEL=`sysctl -n hw.ncpu` \
-              python -m build -w
+            << parameters.build_env >> python -m build -w
      - when:
          condition: << parameters.build_env >>
          steps:
@@ -314,14 +332,10 @@ jobs:
            pip install patchelf
            pip install build
            pip install twine
-            << parameters.extra_env >> \
-              CMAKE_BUILD_PARALLEL_LEVEL=`nproc` \
-              pip install . -v
+            << parameters.extra_env >> pip install . -v
            pip install typing_extensions
-            python setup.py generate_stubs 
-            << parameters.extra_env >> \
-              CMAKE_BUILD_PARALLEL_LEVEL=`nproc` \
-              python -m build --wheel
+            python setup.py generate_stubs
+            << parameters.extra_env >> python -m build --wheel
            auditwheel show dist/*
            auditwheel repair dist/* --plat manylinux_2_31_x86_64
      - run:
@@ -332,6 +346,46 @@ jobs:
      - store_artifacts:
          path: wheelhouse/

+  build_cuda_release:
+    parameters:
+      python_version:
+        type: string
+        default: "3.9"
+      extra_env:
+        type: string
+        default: "DEV_RELEASE=1"
+    machine:
+      image: linux-cuda-12:default
+      resource_class: gpu.nvidia.small.gen2
+    steps:
+      - checkout
+      - run:
+          name: Build wheel
+          command: |
+            sudo apt-get update
+            sudo apt-get install libblas-dev liblapack-dev liblapacke-dev
+            python -m venv env
+            source env/bin/activate
+            pip install auditwheel
+            pip install patchelf
+            pip install build
+            pip install twine
+            << parameters.extra_env >> \
+              CMAKE_ARGS="-DMLX_BUILD_CUDA=ON -DCMAKE_CUDA_COMPILER=`which nvcc`" \
+              pip install ".[dev]" -v
+            python setup.py generate_stubs
+            << parameters.extra_env >> \
+              CMAKE_ARGS="-DMLX_BUILD_CUDA=ON -DCMAKE_CUDA_COMPILER=`which nvcc`" \
+              python -m build --wheel
+            bash python/scripts/repair_cuda.sh
+      - run:
+          name: Upload package
+          command: |
+            source env/bin/activate
+            twine upload wheelhouse/*.whl
+      - store_artifacts:
+          path: wheelhouse/
+
 workflows:
  build_and_test:
    when:
@@ -348,6 +402,7 @@ workflows:
            parameters:
              macosx_deployment_target: ["13.5", "14.0"]
      - linux_build_and_test
+      - cuda_build_and_test
      - build_documentation 

  build_pypi_release:
@@ -437,6 +492,16 @@ workflows:
            branches:
              ignore: /.*/
          upload-docs: true
+      - build_linux_release:
+          filters:
+            tags:
+              only: /^v.*/
+            branches:
+              ignore: /.*/
+          matrix:
+            parameters:
+              python_version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
+              extra_env: ["PYPI_RELEASE=1"]

  prb:
    when:
@@ -455,6 +520,8 @@ workflows:
              macosx_deployment_target: ["13.5", "14.0"]
      - linux_build_and_test:
          requires: [ hold ]
+      - cuda_build_and_test:
+          requires: [ hold ]
  nightly_build:
    when:
      and:
@@ -598,3 +665,14 @@ workflows:
            parameters:
              python_version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
              extra_env: ["PYPI_RELEASE=1"]
+  cuda_test_release:
+    when:
+      and:
+        - equal: [ main, << pipeline.git.branch >> ]
+        - << pipeline.parameters.cuda_release >>
+    jobs:
+      - build_cuda_release:
+          matrix:
+            parameters:
+              python_version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
+              extra_env: ["PYPI_RELEASE=1"]
--- a/.gitignore
+++ b/.gitignore
@@ -36,6 +36,7 @@ share/python-wheels/
 .installed.cfg
 *.egg
 MANIFEST
+uv.lock

 # vim
 *.swp
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -34,6 +34,7 @@ option(MLX_BUILD_BENCHMARKS "Build benchmarks for mlx" OFF)
 option(MLX_BUILD_PYTHON_BINDINGS "Build python bindings for mlx" OFF)
 option(MLX_BUILD_METAL "Build metal backend" ON)
 option(MLX_BUILD_CPU "Build cpu backend" ON)
+option(MLX_BUILD_CUDA "Build cuda backend" OFF)
 option(MLX_METAL_DEBUG "Enhance metal debug workflow" OFF)
 option(MLX_ENABLE_X64_MAC "Enable building for x64 macOS" OFF)
 option(MLX_BUILD_GGUF "Include support for GGUF format" ON)
@@ -83,6 +84,10 @@ if(MLX_BUILD_METAL)
  set(QUARTZ_LIB "-framework QuartzCore")
 endif()

+if(MLX_BUILD_CUDA)
+  enable_language(CUDA)
+endif()
+
 if(MLX_BUILD_METAL AND NOT METAL_LIB)
  message(STATUS "Metal not found. Unable to build GPU")
  set(MLX_BUILD_METAL OFF)
@@ -226,6 +231,9 @@ target_include_directories(
  mlx PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_LIST_DIR}>
             $<INSTALL_INTERFACE:include>)

+# Do not add mlx_EXPORTS define for shared library.
+set_target_properties(mlx PROPERTIES DEFINE_SYMBOL "")
+
 FetchContent_Declare(
  fmt
  GIT_REPOSITORY https://github.com/fmtlib/fmt.git
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1,4 +1,6 @@
 include CMakeLists.txt
+include mlx.pc.in
 recursive-include mlx/ *
+include cmake/*
 include python/src/*
 include python/mlx/py.typed # support type hinting as in PEP-561
--- a/benchmarks/cpp/irregular_strides.cpp
+++ b/benchmarks/cpp/irregular_strides.cpp
@@ -1,5 +1,6 @@
 // Copyright © 2023 Apple Inc.

+#include <cstring>
 #include <iostream>
 #include <sstream>

--- a/benchmarks/cpp/single_ops.cpp
+++ b/benchmarks/cpp/single_ops.cpp
@@ -192,6 +192,22 @@ void time_reductions() {

  auto argmin_along_1 = [&a]() { return mx::argmin(a, 1, false); };
  TIME(argmin_along_1);
+
+  auto indices = mx::array({1});
+  auto updates = mx::reshape(mx::array({NAN}), {1, 1, 1});
+  std::vector<int> axes{0};
+  auto b = scatter(a, {indices}, updates, axes);
+  mx::eval(b);
+
+  auto max_along_0 = [&b]() { return mx::max(b, 0, false); };
+  TIME(max_along_0);
+  auto max_along_1 = [&b]() { return mx::max(b, 1, false); };
+  TIME(max_along_1);
+
+  auto min_along_0 = [&b]() { return mx::min(b, 0, false); };
+  TIME(min_along_0);
+  auto min_along_1 = [&b]() { return mx::min(b, 1, false); };
+  TIME(min_along_1);
 }

 void time_gather_scatter() {
--- a/benchmarks/python/comparative/bench_torch.py
+++ b/benchmarks/python/comparative/bench_torch.py
@@ -5,6 +5,7 @@ import os
 import time

 import torch
+import torch.cuda
 import torch.mps


@@ -44,8 +45,10 @@ def bench(f, *args):


 def sync_if_needed(x):
-    if x.device != torch.device("cpu"):
+    if x.device.type == "mps":
        torch.mps.synchronize()
+    elif x.device.type == "cuda":
+        torch.cuda.synchronize()


@torch.no_grad()
@@ -99,6 +102,14 @@ def reduction(op, axis, x):
    sync_if_needed(x)


+@torch.no_grad()
+def sum_and_add(axis, x, y):
+    z = x.sum(axis=axis, keepdims=True)
+    for i in range(50):
+        z = (z + y).sum(axis=axis, keepdims=True)
+    sync_if_needed(x)
+
+
@torch.no_grad()
 def softmax(axis, x):
    ys = []
@@ -340,7 +351,11 @@ if __name__ == "__main__":
        args.axis.pop(0)

    torch.set_num_threads(1)
-    device = "cpu" if args.cpu else "mps"
+    device = "mps"
+    if torch.cuda.is_available():
+        device = "cuda"
+    if args.cpu:
+        device = "cpu"

    types = args.dtype
    if not types:
@@ -460,5 +475,8 @@ if __name__ == "__main__":
    elif args.benchmark == "selu":
        print(bench(selu, x))

+    elif args.benchmark == "sum_and_add":
+        print(bench(sum_and_add, axis, *xs))
+
    else:
        raise ValueError(f"Unknown benchmark `{args.benchmark}`.")
--- a/benchmarks/python/conv_unaligned_bench.py
+++ b/benchmarks/python/conv_unaligned_bench.py
@@ -0,0 +1,107 @@
+import math
+import time
+
+import mlx.core as mx
+import numpy as np
+import torch
+
+N_warmup = 10
+N_iter_bench = 100
+N_iter_func = 5
+
+
+def bench(f, a, b):
+    for i in range(N_warmup):
+        f(a, b)
+    torch.mps.synchronize()
+
+    s = time.perf_counter_ns()
+    for i in range(N_iter_bench):
+        f(a, b)
+    e = time.perf_counter_ns()
+    return (e - s) * 1e-9
+
+
+def make_mx_conv_2D(strides=(1, 1), padding=(0, 0), groups=1):
+    def mx_conv_2D(a, b):
+        ys = []
+        for i in range(N_iter_func):
+            y = mx.conv2d(a, b, stride=strides, padding=padding, groups=groups)
+            ys.append(y)
+        mx.eval(ys)
+        return ys
+
+    return mx_conv_2D
+
+
+def make_pt_conv_2D(strides=(1, 1), padding=(0, 0), groups=1):
+    @torch.no_grad()
+    def pt_conv_2D(a, b):
+        ys = []
+        for i in range(N_iter_func):
+            y = torch.conv2d(a, b, stride=strides, padding=padding, groups=groups)
+            ys.append(y)
+        torch.mps.synchronize()
+        return ys
+
+    return pt_conv_2D
+
+
+def bench_shape(N, H, W, C, kH, kW, O, strides, padding, groups, np_dtype):
+    scale = 1.0 / math.sqrt(kH * kW * C)
+    a_np = np.random.uniform(0, 0.5, (N, H, W, C)).astype(np_dtype)
+    b_np = np.random.uniform(-scale, scale, (O, kH, kW, int(C / groups))).astype(
+        np_dtype
+    )
+
+    a_mx = mx.array(a_np)
+    b_mx = mx.array(b_np)
+
+    a_pt = torch.from_numpy(a_np.transpose((0, 3, 1, 2))).to("mps")
+    b_pt = torch.from_numpy(b_np.transpose((0, 3, 1, 2))).to("mps")
+
+    torch.mps.synchronize()
+
+    f_mx = make_mx_conv_2D(strides, padding, groups)
+    f_pt = make_pt_conv_2D(strides, padding, groups)
+
+    time_torch = bench(f_pt, a_pt, b_pt)
+    time_mlx = bench(f_mx, a_mx, b_mx)
+
+    out_mx = mx.conv2d(a_mx, b_mx, stride=strides, padding=padding, groups=groups)
+    out_pt = torch.conv2d(
+        a_pt.to("cpu"), b_pt.to("cpu"), stride=strides, padding=padding, groups=groups
+    )
+    out_pt = torch.permute(out_pt, (0, 2, 3, 1))
+    out_pt = out_pt.numpy(force=True)
+
+    atol = 2e-5 if np_dtype == np.float32 else 1e-4
+
+    if not np.allclose(out_pt, out_mx, atol=atol):
+        print(
+            f"Failed at {(N, H, W, C)}, {(O, kH, kW, C)} [strides = {strides}, padding = {padding}, groups = {groups}] with max(|a - b|) = {np.max(np.abs(out_pt - out_mx))}"
+        )
+
+    return time_mlx, time_torch
+
+
+if __name__ == "__main__":
+    dtype = "float32"
+    shapes = (
+        (4, 32, 32, 21, 3, 3, 128),
+        (4, 32, 32, 21, 3, 3, 37),
+        (4, 32, 32, 370, 3, 3, 370),
+        (4, 32, 32, 370, 7, 7, 128),
+        (2, 320, 640, 21, 7, 7, 21),
+    )
+    for N, H, W, C, kh, kw, O in shapes:
+        time_mlx, time_torch = bench_shape(
+            N, H, W, C, kh, kw, O, (1, 1), (0, 0), 1, dtype
+        )
+        diff = time_torch / time_mlx - 1.0
+
+        print(
+            f"({N}, {H:3d}, {W:3d}, {C:3d}), ({O:3d}, {kh:2d}, {kw:2d}, {C:3d}), {dtype}, {100. * diff:+5.2f}%"
+        )
+        if time_mlx >= 2.0 * time_torch:
+            print("ATTENTION ^^^^^^^")
--- a/benchmarks/python/gather_mm_bench.py
+++ b/benchmarks/python/gather_mm_bench.py
@@ -0,0 +1,74 @@
+# Copyright © 2025 Apple Inc.
+
+import mlx.core as mx
+from time_utils import time_fn
+
+N = 1024
+D = 1024
+M = 1024
+E = 32
+I = 4
+
+
+def gather_sort(x, indices):
+    N, M = indices.shape
+    indices = indices.flatten()
+    order = mx.argsort(indices)
+    inv_order = mx.argsort(order)
+    return x.flatten(0, -3)[order // M], indices[order], inv_order
+
+
+def scatter_unsort(x, inv_order, shape=None):
+    x = x[inv_order]
+    if shape is not None:
+        x = mx.unflatten(x, 0, shape)
+    return x
+
+
+def gather_mm_simulate(x, w, indices):
+    x, idx, inv_order = gather_sort(x, indices)
+    for i in range(2):
+        y = mx.concatenate([x[i] @ w[j].T for i, j in enumerate(idx.tolist())], axis=0)
+        x = y[:, None]
+    x = scatter_unsort(x, inv_order, indices.shape)
+    return x
+
+
+def time_gather_mm():
+    x = mx.random.normal((N, 1, 1, D)) / 1024**0.5
+    w1 = mx.random.normal((E, M, D)) / 1024**0.5
+    w2 = mx.random.normal((E, D, M)) / 1024**0.5
+    indices = (mx.random.uniform(shape=(N, I)) * E).astype(mx.uint32)
+    sorted_indices = mx.sort(indices.flatten()).reshape(N, I)
+    mx.eval(x, w1, w2, indices, sorted_indices)
+
+    def gather_mm(x, w1, w2, indices, sort):
+        idx = indices
+        inv_order = None
+        if sort:
+            x, idx, inv_order = gather_sort(x, indices)
+        x = mx.gather_mm(x, w1.swapaxes(-1, -2), rhs_indices=idx, sorted_indices=sort)
+        x = mx.gather_mm(x, w2.swapaxes(-1, -2), rhs_indices=idx, sorted_indices=sort)
+        if sort:
+            x = scatter_unsort(x, inv_order, indices.shape)
+        return x
+
+    time_fn(gather_mm, x, w1, w2, indices, False)
+    time_fn(gather_mm, x, w1, w2, sorted_indices, False)
+    time_fn(gather_mm, x, w1, w2, indices, True)
+
+    x = mx.random.normal((N * I, D)) / 1024**0.5
+    w1 = mx.random.normal((M, D)) / 1024**0.5
+    w2 = mx.random.normal((D, M)) / 1024**0.5
+    mx.eval(x, w1, w2)
+
+    def equivalent_matmul(x, w1, w2):
+        x = x @ w1.T
+        x = x @ w2.T
+        return x
+
+    time_fn(equivalent_matmul, x, w1, w2)
+
+
+if __name__ == "__main__":
+    time_gather_mm()
--- a/benchmarks/python/gather_qmm_bench.py
+++ b/benchmarks/python/gather_qmm_bench.py
@@ -0,0 +1,84 @@
+# Copyright © 2025 Apple Inc.
+
+import mlx.core as mx
+from time_utils import time_fn
+
+N = 1024
+D = 1024
+M = 1024
+E = 32
+I = 4
+
+
+def gather_sort(x, indices):
+    N, M = indices.shape
+    indices = indices.flatten()
+    order = mx.argsort(indices)
+    inv_order = mx.argsort(order)
+    return x.flatten(0, -3)[order // M], indices[order], inv_order
+
+
+def scatter_unsort(x, inv_order, shape=None):
+    x = x[inv_order]
+    if shape is not None:
+        x = mx.unflatten(x, 0, shape)
+    return x
+
+
+def gather_mm_simulate(x, w, indices):
+    x, idx, inv_order = gather_sort(x, indices)
+    for i in range(2):
+        y = mx.concatenate(
+            [
+                mx.quantized_matmul(x[i], w[0][j], w[1][j], w[2][j], transpose=True)
+                for i, j in enumerate(idx.tolist())
+            ],
+            axis=0,
+        )
+        x = y[:, None]
+    x = scatter_unsort(x, inv_order, indices.shape)
+    return x
+
+
+def time_gather_qmm():
+    x = mx.random.normal((N, 1, 1, D)) / 1024**0.5
+    w1 = mx.random.normal((E, M, D)) / 1024**0.5
+    w2 = mx.random.normal((E, D, M)) / 1024**0.5
+    w1 = mx.quantize(w1)
+    w2 = mx.quantize(w2)
+    indices = (mx.random.uniform(shape=(N, I)) * E).astype(mx.uint32)
+    sorted_indices = mx.sort(indices.flatten()).reshape(N, I)
+    mx.eval(x, w1, w2, indices, sorted_indices)
+
+    def gather_mm(x, w1, w2, indices, sort):
+        idx = indices
+        inv_order = None
+        if sort:
+            x, idx, inv_order = gather_sort(x, indices)
+        x = mx.gather_qmm(x, *w1, transpose=True, rhs_indices=idx, sorted_indices=sort)
+        x = mx.gather_qmm(x, *w2, transpose=True, rhs_indices=idx, sorted_indices=sort)
+        if sort:
+            x = scatter_unsort(x, inv_order, indices.shape)
+        return x
+
+    time_fn(gather_mm, x, w1, w2, indices, False)
+    time_fn(gather_mm, x, w1, w2, sorted_indices, False)
+    time_fn(gather_mm, x, w1, w2, indices, True)
+
+    x = mx.random.normal((N * I, D)) / 1024**0.5
+    w1 = mx.random.normal((M, D)) / 1024**0.5
+    w2 = mx.random.normal((D, M)) / 1024**0.5
+    w1 = mx.quantize(w1)
+    w2 = mx.quantize(w2)
+    mx.eval(x, w1, w2)
+
+    def equivalent_matmul(x, w1, w2):
+        x = mx.quantized_matmul(x, *w1, transpose=True)
+        x = mx.quantized_matmul(x, *w2, transpose=True)
+        return x
+
+    time_fn(equivalent_matmul, x, w1, w2)
+
+
+if __name__ == "__main__":
+    time_gather_qmm()
--- a/benchmarks/python/layer_norm_bench.py
+++ b/benchmarks/python/layer_norm_bench.py
@@ -1,5 +1,7 @@
 # Copyright © 2023-2024 Apple Inc.

+from functools import partial
+
 import mlx.core as mx
 import mlx.nn as nn
 from time_utils import time_fn
@@ -18,51 +20,63 @@ def layer_norm(x, w, b, eps):
    return y


-def time_layer_norm():
+def time_layer_norm(N, dt):
+    L = 1024
    f1 = lambda x, w, b, y: (layer_norm(x, w, b, 1e-5) * y).sum()
    f2 = lambda x, w, b, y: (mx.fast.layer_norm(x, w, b, 1e-5) * y).sum()
    g1 = mx.grad(f1, argnums=(0, 1, 2))
    g2 = mx.grad(f2, argnums=(0, 1, 2))

-    x = mx.random.uniform(shape=(8, 1024, 4096)).astype(mx.float16)
-    w = mx.random.uniform(shape=(4096,)).astype(mx.float16)
-    b = mx.random.uniform(shape=(4096,)).astype(mx.float16)
-    y = mx.random.uniform(shape=(8, 1024, 4096)).astype(mx.float16)
+    x = mx.random.uniform(shape=(8, L, N)).astype(dt)
+    w = mx.random.uniform(shape=(N,)).astype(dt)
+    b = mx.random.uniform(shape=(N,)).astype(dt)
+    y = mx.random.uniform(shape=(8, L, N)).astype(dt)
    mx.eval(x, w, b, y)

-    def layer_norm_loop(g, x, w, b):
+    def layer_norm_loop(f, x, w, b):
+        for _ in range(32):
+            x = f(x, w, b)
+        return x
+
+    time_fn(layer_norm_loop, partial(layer_norm, eps=1e-5), x, w, b)
+    time_fn(layer_norm_loop, partial(mx.fast.layer_norm, eps=1e-5), x, w, b)
+
+    def layer_norm_grad_loop(g, x, w, b):
        gx, gw, gb = x, w, b
        for _ in range(32):
            gx, gw, gb = g(gx, gw, gb, y)
        return gx, gw, gb

-    time_fn(layer_norm_loop, g1, x, w, b)
-    time_fn(layer_norm_loop, g2, x, w, b)
-    time_fn(layer_norm_loop, mx.compile(g1), x, w, b)
-    time_fn(layer_norm_loop, mx.compile(g2), x, w, b)
+    time_fn(layer_norm_grad_loop, g1, x, w, b)
+    time_fn(layer_norm_grad_loop, g2, x, w, b)
+    time_fn(layer_norm_grad_loop, mx.compile(g1), x, w, b)
+    time_fn(layer_norm_grad_loop, mx.compile(g2), x, w, b)

    f1 = lambda x, y: (layer_norm(x, None, None, 1e-5) * y).sum()
    f2 = lambda x, y: (mx.fast.layer_norm(x, None, None, 1e-5) * y).sum()
    g1 = mx.grad(f1, argnums=(0,))
    g2 = mx.grad(f2, argnums=(0,))

-    x = mx.random.uniform(shape=(8, 1024, 4096)).astype(mx.float16)
-    w = mx.random.uniform(shape=(4096,)).astype(mx.float16)
-    b = mx.random.uniform(shape=(4096,)).astype(mx.float16)
-    y = mx.random.uniform(shape=(8, 1024, 4096)).astype(mx.float16)
+    x = mx.random.uniform(shape=(8, L, N)).astype(dt)
+    w = mx.random.uniform(shape=(N,)).astype(dt)
+    b = mx.random.uniform(shape=(N,)).astype(dt)
+    y = mx.random.uniform(shape=(8, L, N)).astype(dt)
    mx.eval(x, w, b, y)

-    def layer_norm_loop(g, x):
+    def layer_norm_grad_x_loop(g, x):
        gx = x
        for _ in range(32):
            gx = g(gx, y)
        return gx

-    time_fn(layer_norm_loop, g1, x)
-    time_fn(layer_norm_loop, g2, x)
-    time_fn(layer_norm_loop, mx.compile(g1), x)
-    time_fn(layer_norm_loop, mx.compile(g2), x)
+    time_fn(layer_norm_grad_x_loop, g1, x)
+    time_fn(layer_norm_grad_x_loop, g2, x)
+    time_fn(layer_norm_grad_x_loop, mx.compile(g1), x)
+    time_fn(layer_norm_grad_x_loop, mx.compile(g2), x)


 if __name__ == "__main__":
-    time_layer_norm()
+    for dt in [mx.float32, mx.float16, mx.bfloat16]:
+        for n in [1024, 2048, 4096, 8192, 8192 + 1024]:
+            print(dt, n)
+            time_layer_norm(n, dt)
--- a/benchmarks/python/single_ops.py
+++ b/benchmarks/python/single_ops.py
@@ -51,6 +51,20 @@ def time_maximum():
    time_fn(mx.maximum, a, b)


+def time_max():
+    a = mx.random.uniform(shape=(32, 1024, 1024))
+    a[1, 1] = mx.nan
+    mx.eval(a)
+    time_fn(mx.max, a, 0)
+
+
+def time_min():
+    a = mx.random.uniform(shape=(32, 1024, 1024))
+    a[1, 1] = mx.nan
+    mx.eval(a)
+    time_fn(mx.min, a, 0)
+
+
 def time_negative():
    a = mx.random.uniform(shape=(10000, 1000))
    mx.eval(a)
@@ -108,6 +122,8 @@ if __name__ == "__main__":

    time_add()
    time_matmul()
+    time_min()
+    time_max()
    time_maximum()
    time_exp()
    time_negative()
--- a/cmake/extension.cmake
+++ b/cmake/extension.cmake
@@ -11,13 +11,14 @@ include(CMakeParseArguments)
 # Args: TARGET: Custom target to be added for the metal library TITLE: Name of
 # the .metallib OUTPUT_DIRECTORY: Where to place ${TITLE}.metallib SOURCES: List
 # of source files INCLUDE_DIRS: List of include dirs DEPS: List of dependency
-# files (like headers)
+# files (like headers) DEBUG: Boolean, if true, enables debug compile options
+# for this specific library. If not provided, uses global MLX_METAL_DEBUG.
 #
 # clang format on

 macro(mlx_build_metallib)
  # Parse args
-  set(oneValueArgs TARGET TITLE OUTPUT_DIRECTORY)
+  set(oneValueArgs TARGET TITLE OUTPUT_DIRECTORY DEBUG)
  set(multiValueArgs SOURCES INCLUDE_DIRS DEPS)
  cmake_parse_arguments(MTLLIB "" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})

@@ -26,6 +27,10 @@ macro(mlx_build_metallib)

  # Collect compile options
  set(MTLLIB_COMPILE_OPTIONS -Wall -Wextra -fno-fast-math -Wno-c++17-extensions)
+  if(MLX_METAL_DEBUG OR MTLLIB_DEBUG)
+    set(MTLLIB_COMPILE_OPTIONS ${MTLLIB_COMPILE_OPTIONS} -gline-tables-only
+                               -frecord-sources)
+  endif()

  # Prepare metallib build command
  add_custom_command(
--- a/docs/src/conf.py
+++ b/docs/src/conf.py
@@ -10,7 +10,7 @@ import mlx.core as mx
 # -- Project information -----------------------------------------------------

 project = "MLX"
-copyright = "2023, MLX Contributors"
+copyright = "2023, Apple"
 author = "MLX Contributors"
 version = ".".join(mx.__version__.split(".")[:3])
 release = version
--- a/docs/src/dev/custom_metal_kernels.rst
+++ b/docs/src/dev/custom_metal_kernels.rst
@@ -8,23 +8,26 @@ MLX supports writing custom Metal kernels through the Python and C++ APIs.
 Simple Example
 --------------

+.. currentmodule:: mlx.core
+
 Let's write a custom kernel that computes ``exp`` elementwise:

 .. code-block:: python

-  def exp_elementwise(a: mx.array):
-      source = """
-          uint elem = thread_position_in_grid.x;
-          T tmp = inp[elem];
-          out[elem] = metal::exp(tmp);
-      """
+  source = """
+      uint elem = thread_position_in_grid.x;
+      T tmp = inp[elem];
+      out[elem] = metal::exp(tmp);
+  """

-      kernel = mx.fast.metal_kernel(
-          name="myexp",
-          input_names=["inp"],
-          output_names=["out"],
-          source=source,
-      )
+  kernel = mx.fast.metal_kernel(
+      name="myexp",
+      input_names=["inp"],
+      output_names=["out"],
+      source=source,
+  )
+
+  def exp_elementwise(a: mx.array):
      outputs = kernel(
          inputs=[a],
          template=[("T", mx.float32)],
@@ -39,8 +42,13 @@ Let's write a custom kernel that computes ``exp`` elementwise:
  b = exp_elementwise(a)
  assert mx.allclose(b, mx.exp(a))

+Every time you make a kernel, a new Metal library is created and possibly
+JIT compiled. To reduce the overhead from that, build the kernel once with
+:func:`fast.metal_kernel` and then use it many times.
+
 .. note::
-    We are only required to pass the body of the Metal kernel in ``source``.
+   Only pass the body of the Metal kernel in ``source``. The function
+   signature is generated automatically.

 The full function signature will be generated using:

@@ -78,44 +86,51 @@ Putting this all together, the generated function signature for ``myexp`` is as

  template [[host_name("custom_kernel_myexp_float")]] [[kernel]] decltype(custom_kernel_myexp_float<float>) custom_kernel_myexp_float<float>;

-Note: ``grid`` and ``threadgroup`` are parameters to the Metal `dispatchThreads <https://developer.apple.com/documentation/metal/mtlcomputecommandencoder/2866532-dispatchthreads>`_ function.
-This means we will launch ``mx.prod(grid)`` threads, subdivided into ``threadgroup`` size threadgroups.
-For optimal performance, each thread group dimension should be less than or equal to the corresponding grid dimension.
+Note: ``grid`` and ``threadgroup`` are parameters to the Metal `dispatchThreads
+<https://developer.apple.com/documentation/metal/mtlcomputecommandencoder/2866532-dispatchthreads>`_
+function. This means we will launch ``mx.prod(grid)`` threads, subdivided into
+``threadgroup`` size threadgroups.  For optimal performance, each thread group
+dimension should be less than or equal to the corresponding grid dimension.

-Passing ``verbose=True`` to ``mx.fast.metal_kernel.__call__`` will print the generated code for debugging purposes.
+Passing ``verbose=True`` to :func:`fast.metal_kernel.__call__` will print the
+generated code for debugging purposes.

 Using Shape/Strides
 -------------------

-``mx.fast.metal_kernel`` supports an argument ``ensure_row_contiguous`` which is ``True`` by default.
-This will copy the ``mx.array`` inputs if needed before the kernel is launched to ensure that the memory layout is row contiguous.
-Generally this makes writing the kernel easier, since we don't have to worry about gaps or the ordering of the dims
-when indexing.
+:func:`fast.metal_kernel` supports an argument ``ensure_row_contiguous`` which
+is ``True`` by default. This will copy the array inputs if needed
+before the kernel is launched to ensure that the memory layout is row
+contiguous.  Generally this makes writing the kernel easier, since we don't
+have to worry about gaps or the ordering of the dims when indexing.

-If we want to avoid this copy, ``metal_kernel`` automatically passes ``a_shape``, ``a_strides`` and ``a_ndim`` for each
-input array ``a`` if any are present in ``source``.
-We can then use MLX's built in indexing utils to fetch the right elements for each thread.
+If we want to avoid this copy, :func:`fast.metal_kernel` automatically passes
+``a_shape``, ``a_strides`` and ``a_ndim`` for each input array ``a`` if any are
+present in ``source``. We can then use MLX's built in indexing utils to fetch
+the right elements for each thread.

-Let's convert ``myexp`` above to support arbitrarily strided arrays without relying on a copy from ``ensure_row_contiguous``:
+Let's convert ``myexp`` above to support arbitrarily strided arrays without
+relying on a copy from ``ensure_row_contiguous``:

 .. code-block:: python
+   
+  source = """
+      uint elem = thread_position_in_grid.x;
+      // Utils from `mlx/backend/metal/kernels/utils.h` are automatically included
+      uint loc = elem_to_loc(elem, inp_shape, inp_strides, inp_ndim);
+      T tmp = inp[loc];
+      // Output arrays are always row contiguous
+      out[elem] = metal::exp(tmp);
+  """
+
+  kernel = mx.fast.metal_kernel(
+      name="myexp_strided",
+      input_names=["inp"],
+      output_names=["out"],
+      source=source
+  )

  def exp_elementwise(a: mx.array):
-      source = """
-          uint elem = thread_position_in_grid.x;
-          // Utils from `mlx/backend/metal/kernels/utils.h` are automatically included
-          uint loc = elem_to_loc(elem, inp_shape, inp_strides, inp_ndim);
-          T tmp = inp[loc];
-          // Output arrays are always row contiguous
-          out[elem] = metal::exp(tmp);
-      """
-
-      kernel = mx.fast.metal_kernel(
-          name="myexp_strided",
-          input_names=["inp"],
-          output_names=["out"],
-          source=source
-      )
      outputs = kernel(
          inputs=[a],
          template=[("T", mx.float32)],
@@ -142,137 +157,139 @@ We'll start with the following MLX implementation using standard ops:

 .. code-block:: python

-    def grid_sample_ref(x, grid):
-        N, H_in, W_in, _ = x.shape
-        ix = ((grid[..., 0] + 1) * W_in - 1) / 2
-        iy = ((grid[..., 1] + 1) * H_in - 1) / 2
+  def grid_sample_ref(x, grid):
+      N, H_in, W_in, _ = x.shape
+      ix = ((grid[..., 0] + 1) * W_in - 1) / 2
+      iy = ((grid[..., 1] + 1) * H_in - 1) / 2

-        ix_nw = mx.floor(ix).astype(mx.int32)
-        iy_nw = mx.floor(iy).astype(mx.int32)
+      ix_nw = mx.floor(ix).astype(mx.int32)
+      iy_nw = mx.floor(iy).astype(mx.int32)

-        ix_ne = ix_nw + 1
-        iy_ne = iy_nw
+      ix_ne = ix_nw + 1
+      iy_ne = iy_nw

-        ix_sw = ix_nw
-        iy_sw = iy_nw + 1
+      ix_sw = ix_nw
+      iy_sw = iy_nw + 1

-        ix_se = ix_nw + 1
-        iy_se = iy_nw + 1
+      ix_se = ix_nw + 1
+      iy_se = iy_nw + 1

-        nw = (ix_se - ix)    * (iy_se - iy)
-        ne = (ix    - ix_sw) * (iy_sw - iy)
-        sw = (ix_ne - ix)    * (iy    - iy_ne)
-        se = (ix    - ix_nw) * (iy    - iy_nw)
+      nw = (ix_se - ix)    * (iy_se - iy)
+      ne = (ix    - ix_sw) * (iy_sw - iy)
+      sw = (ix_ne - ix)    * (iy    - iy_ne)
+      se = (ix    - ix_nw) * (iy    - iy_nw)

-        I_nw = x[mx.arange(N)[:, None, None], iy_nw, ix_nw, :]
-        I_ne = x[mx.arange(N)[:, None, None], iy_ne, ix_ne, :]
-        I_sw = x[mx.arange(N)[:, None, None], iy_sw, ix_sw, :]
-        I_se = x[mx.arange(N)[:, None, None], iy_se, ix_se, :]
+      I_nw = x[mx.arange(N)[:, None, None], iy_nw, ix_nw, :]
+      I_ne = x[mx.arange(N)[:, None, None], iy_ne, ix_ne, :]
+      I_sw = x[mx.arange(N)[:, None, None], iy_sw, ix_sw, :]
+      I_se = x[mx.arange(N)[:, None, None], iy_se, ix_se, :]

-        mask_nw = (iy_nw >= 0) & (iy_nw <= H_in - 1) & (ix_nw >= 0) & (ix_nw <= W_in - 1)
-        mask_ne = (iy_ne >= 0) & (iy_ne <= H_in - 1) & (ix_ne >= 0) & (ix_ne <= W_in - 1)
-        mask_sw = (iy_sw >= 0) & (iy_sw <= H_in - 1) & (ix_sw >= 0) & (ix_sw <= W_in - 1)
-        mask_se = (iy_se >= 0) & (iy_se <= H_in - 1) & (ix_se >= 0) & (ix_se <= W_in - 1)
+      mask_nw = (iy_nw >= 0) & (iy_nw <= H_in - 1) & (ix_nw >= 0) & (ix_nw <= W_in - 1)
+      mask_ne = (iy_ne >= 0) & (iy_ne <= H_in - 1) & (ix_ne >= 0) & (ix_ne <= W_in - 1)
+      mask_sw = (iy_sw >= 0) & (iy_sw <= H_in - 1) & (ix_sw >= 0) & (ix_sw <= W_in - 1)
+      mask_se = (iy_se >= 0) & (iy_se <= H_in - 1) & (ix_se >= 0) & (ix_se <= W_in - 1)

-        I_nw *= mask_nw[..., None]
-        I_ne *= mask_ne[..., None]
-        I_sw *= mask_sw[..., None]
-        I_se *= mask_se[..., None]
+      I_nw *= mask_nw[..., None]
+      I_ne *= mask_ne[..., None]
+      I_sw *= mask_sw[..., None]
+      I_se *= mask_se[..., None]

-        output = nw[..., None] * I_nw + ne[..., None] * I_ne + sw[..., None] * I_sw + se[..., None] * I_se
+      output = nw[..., None] * I_nw + ne[..., None] * I_ne + sw[..., None] * I_sw + se[..., None] * I_se

-        return output
+      return output

-Now let's use ``mx.custom_function`` together with ``mx.fast.metal_kernel``
+Now let's use :func:`custom_function` together with :func:`fast.metal_kernel`
 to write a fast GPU kernel for both the forward and backward passes.

 First we'll implement the forward pass as a fused kernel:

 .. code-block:: python

-    @mx.custom_function
-    def grid_sample(x, grid):
+  source = """
+      uint elem = thread_position_in_grid.x;
+      int H = x_shape[1];
+      int W = x_shape[2];
+      int C = x_shape[3];
+      int gH = grid_shape[1];
+      int gW = grid_shape[2];

-        assert x.ndim == 4, "`x` must be 4D."
-        assert grid.ndim == 4, "`grid` must be 4D."
+      int w_stride = C;
+      int h_stride = W * w_stride;
+      int b_stride = H * h_stride;

-        B, _, _, C = x.shape
-        _, gN, gM, D = grid.shape
-        out_shape = (B, gN, gM, C)
+      uint grid_idx = elem / C * 2;
+      float ix = ((grid[grid_idx] + 1) * W - 1) / 2;
+      float iy = ((grid[grid_idx + 1] + 1) * H - 1) / 2;

-        assert D == 2, "Last dim of `grid` must be size 2."
+      int ix_nw = floor(ix);
+      int iy_nw = floor(iy);

-        source = """
-            uint elem = thread_position_in_grid.x;
-            int H = x_shape[1];
-            int W = x_shape[2];
-            int C = x_shape[3];
-            int gH = grid_shape[1];
-            int gW = grid_shape[2];
+      int ix_ne = ix_nw + 1;
+      int iy_ne = iy_nw;

-            int w_stride = C;
-            int h_stride = W * w_stride;
-            int b_stride = H * h_stride;
+      int ix_sw = ix_nw;
+      int iy_sw = iy_nw + 1;

-            uint grid_idx = elem / C * 2;
-            float ix = ((grid[grid_idx] + 1) * W - 1) / 2;
-            float iy = ((grid[grid_idx + 1] + 1) * H - 1) / 2;
+      int ix_se = ix_nw + 1;
+      int iy_se = iy_nw + 1;

-            int ix_nw = floor(ix);
-            int iy_nw = floor(iy);
+      T nw = (ix_se - ix)    * (iy_se - iy);
+      T ne = (ix    - ix_sw) * (iy_sw - iy);
+      T sw = (ix_ne - ix)    * (iy    - iy_ne);
+      T se = (ix    - ix_nw) * (iy    - iy_nw);

-            int ix_ne = ix_nw + 1;
-            int iy_ne = iy_nw;
+      int batch_idx = elem / C / gH / gW * b_stride;
+      int channel_idx = elem % C;
+      int base_idx = batch_idx + channel_idx;

-            int ix_sw = ix_nw;
-            int iy_sw = iy_nw + 1;
+      T I_nw = x[base_idx + iy_nw * h_stride + ix_nw * w_stride];
+      T I_ne = x[base_idx + iy_ne * h_stride + ix_ne * w_stride];
+      T I_sw = x[base_idx + iy_sw * h_stride + ix_sw * w_stride];
+      T I_se = x[base_idx + iy_se * h_stride + ix_se * w_stride];

-            int ix_se = ix_nw + 1;
-            int iy_se = iy_nw + 1;
+      I_nw = iy_nw >= 0 && iy_nw <= H - 1 && ix_nw >= 0 && ix_nw <= W - 1 ? I_nw : 0;
+      I_ne = iy_ne >= 0 && iy_ne <= H - 1 && ix_ne >= 0 && ix_ne <= W - 1 ? I_ne : 0;
+      I_sw = iy_sw >= 0 && iy_sw <= H - 1 && ix_sw >= 0 && ix_sw <= W - 1 ? I_sw : 0;
+      I_se = iy_se >= 0 && iy_se <= H - 1 && ix_se >= 0 && ix_se <= W - 1 ? I_se : 0;

-            T nw = (ix_se - ix)    * (iy_se - iy);
-            T ne = (ix    - ix_sw) * (iy_sw - iy);
-            T sw = (ix_ne - ix)    * (iy    - iy_ne);
-            T se = (ix    - ix_nw) * (iy    - iy_nw);
+      out[elem] = nw * I_nw + ne * I_ne + sw * I_sw + se * I_se;
+  """

-            int batch_idx = elem / C / gH / gW * b_stride;
-            int channel_idx = elem % C;
-            int base_idx = batch_idx + channel_idx;
+  kernel = mx.fast.metal_kernel(
+      name="grid_sample",
+      input_names=["x", "grid"],
+      output_names=["out"],
+      source=source,
+  )

-            T I_nw = x[base_idx + iy_nw * h_stride + ix_nw * w_stride];
-            T I_ne = x[base_idx + iy_ne * h_stride + ix_ne * w_stride];
-            T I_sw = x[base_idx + iy_sw * h_stride + ix_sw * w_stride];
-            T I_se = x[base_idx + iy_se * h_stride + ix_se * w_stride];
+  @mx.custom_function
+  def grid_sample(x, grid):

-            I_nw = iy_nw >= 0 && iy_nw <= H - 1 && ix_nw >= 0 && ix_nw <= W - 1 ? I_nw : 0;
-            I_ne = iy_ne >= 0 && iy_ne <= H - 1 && ix_ne >= 0 && ix_ne <= W - 1 ? I_ne : 0;
-            I_sw = iy_sw >= 0 && iy_sw <= H - 1 && ix_sw >= 0 && ix_sw <= W - 1 ? I_sw : 0;
-            I_se = iy_se >= 0 && iy_se <= H - 1 && ix_se >= 0 && ix_se <= W - 1 ? I_se : 0;
+      assert x.ndim == 4, "`x` must be 4D."
+      assert grid.ndim == 4, "`grid` must be 4D."

-            out[elem] = nw * I_nw + ne * I_ne + sw * I_sw + se * I_se;
-        """
-        kernel = mx.fast.metal_kernel(
-            name="grid_sample",
-            input_names=["x", "grid"],
-            output_names=["out"],
-            source=source,
-        )
-        outputs = kernel(
-            inputs=[x, grid],
-            template=[("T", x.dtype)],
-            output_shapes=[out_shape],
-            output_dtypes=[x.dtype],
-            grid=(np.prod(out_shape), 1, 1),
-            threadgroup=(256, 1, 1),
-        )
-        return outputs[0]
+      B, _, _, C = x.shape
+      _, gN, gM, D = grid.shape
+      out_shape = (B, gN, gM, C)
+
+      assert D == 2, "Last dim of `grid` must be size 2."
+
+      outputs = kernel(
+          inputs=[x, grid],
+          template=[("T", x.dtype)],
+          output_shapes=[out_shape],
+          output_dtypes=[x.dtype],
+          grid=(np.prod(out_shape), 1, 1),
+          threadgroup=(256, 1, 1),
+      )
+      return outputs[0]

 For a reasonably sized input such as:

 .. code-block:: python

-    x.shape = (8, 1024, 1024, 64)
-    grid.shape = (8, 256, 256, 2)
+  x.shape = (8, 1024, 1024, 64)
+  grid.shape = (8, 256, 256, 2)

 On an M1 Max, we see a big performance improvement:

@@ -281,11 +298,11 @@ On an M1 Max, we see a big performance improvement:
 Grid Sample VJP
 ---------------

-Since we decorated ``grid_sample`` with ``mx.custom_function``, we can now define
-its custom vjp transform so MLX can differentiate it.
+Since we decorated ``grid_sample`` with :func:`custom_function`, we can now
+define its custom vjp transform so MLX can differentiate it.

 The backwards pass requires atomically updating ``x_grad``/``grid_grad`` and so
-requires a few extra ``mx.fast.metal_kernel`` features:
+requires a few extra :func:`fast.metal_kernel` features:

 * ``init_value=0``
    Initialize all of the kernel's outputs to this value before it runs. This allows us to update only part of the output arrays with the kernel.
@@ -299,128 +316,129 @@ We can then implement the backwards pass as follows:

 .. code-block:: python

-    @grid_sample.vjp
-    def grid_sample_vjp(primals, cotangent, _):
-        x, grid = primals
-        B, _, _, C = x.shape
-        _, gN, gM, D = grid.shape
+  source = """
+      uint elem = thread_position_in_grid.x;
+      int H = x_shape[1];
+      int W = x_shape[2];
+      int C = x_shape[3];
+      // Pad C to the nearest larger simdgroup size multiple
+      int C_padded = ceildiv(C, threads_per_simdgroup) * threads_per_simdgroup;

-        assert D == 2, "Last dim of `grid` must be size 2."
+      int gH = grid_shape[1];
+      int gW = grid_shape[2];

-        source = """
-            uint elem = thread_position_in_grid.x;
-            int H = x_shape[1];
-            int W = x_shape[2];
-            int C = x_shape[3];
-            // Pad C to the nearest larger simdgroup size multiple
-            int C_padded = ceildiv(C, threads_per_simdgroup) * threads_per_simdgroup;
+      int w_stride = C;
+      int h_stride = W * w_stride;
+      int b_stride = H * h_stride;

-            int gH = grid_shape[1];
-            int gW = grid_shape[2];
+      uint grid_idx = elem / C_padded * 2;
+      float ix = ((grid[grid_idx] + 1) * W - 1) / 2;
+      float iy = ((grid[grid_idx + 1] + 1) * H - 1) / 2;

-            int w_stride = C;
-            int h_stride = W * w_stride;
-            int b_stride = H * h_stride;
+      int ix_nw = floor(ix);
+      int iy_nw = floor(iy);

-            uint grid_idx = elem / C_padded * 2;
-            float ix = ((grid[grid_idx] + 1) * W - 1) / 2;
-            float iy = ((grid[grid_idx + 1] + 1) * H - 1) / 2;
+      int ix_ne = ix_nw + 1;
+      int iy_ne = iy_nw;

-            int ix_nw = floor(ix);
-            int iy_nw = floor(iy);
+      int ix_sw = ix_nw;
+      int iy_sw = iy_nw + 1;

-            int ix_ne = ix_nw + 1;
-            int iy_ne = iy_nw;
+      int ix_se = ix_nw + 1;
+      int iy_se = iy_nw + 1;

-            int ix_sw = ix_nw;
-            int iy_sw = iy_nw + 1;
+      T nw = (ix_se - ix)    * (iy_se - iy);
+      T ne = (ix    - ix_sw) * (iy_sw - iy);
+      T sw = (ix_ne - ix)    * (iy    - iy_ne);
+      T se = (ix    - ix_nw) * (iy    - iy_nw);

-            int ix_se = ix_nw + 1;
-            int iy_se = iy_nw + 1;
+      int batch_idx = elem / C_padded / gH / gW * b_stride;
+      int channel_idx = elem % C_padded;
+      int base_idx = batch_idx + channel_idx;

-            T nw = (ix_se - ix)    * (iy_se - iy);
-            T ne = (ix    - ix_sw) * (iy_sw - iy);
-            T sw = (ix_ne - ix)    * (iy    - iy_ne);
-            T se = (ix    - ix_nw) * (iy    - iy_nw);
+      T gix = T(0);
+      T giy = T(0);
+      if (channel_idx < C) {
+          int cot_index = elem / C_padded * C + channel_idx;
+          T cot = cotangent[cot_index];
+          if (iy_nw >= 0 && iy_nw <= H - 1 && ix_nw >= 0 && ix_nw <= W - 1) {
+              int offset = base_idx + iy_nw * h_stride + ix_nw * w_stride;
+              atomic_fetch_add_explicit(&x_grad[offset], nw * cot, memory_order_relaxed);

-            int batch_idx = elem / C_padded / gH / gW * b_stride;
-            int channel_idx = elem % C_padded;
-            int base_idx = batch_idx + channel_idx;
+              T I_nw = x[offset];
+              gix -= I_nw * (iy_se - iy) * cot;
+              giy -= I_nw * (ix_se - ix) * cot;
+          }
+          if (iy_ne >= 0 && iy_ne <= H - 1 && ix_ne >= 0 && ix_ne <= W - 1) {
+              int offset = base_idx + iy_ne * h_stride + ix_ne * w_stride;
+              atomic_fetch_add_explicit(&x_grad[offset], ne * cot, memory_order_relaxed);

-            T gix = T(0);
-            T giy = T(0);
-            if (channel_idx < C) {
-                int cot_index = elem / C_padded * C + channel_idx;
-                T cot = cotangent[cot_index];
-                if (iy_nw >= 0 && iy_nw <= H - 1 && ix_nw >= 0 && ix_nw <= W - 1) {
-                    int offset = base_idx + iy_nw * h_stride + ix_nw * w_stride;
-                    atomic_fetch_add_explicit(&x_grad[offset], nw * cot, memory_order_relaxed);
+              T I_ne = x[offset];
+              gix += I_ne * (iy_sw - iy) * cot;
+              giy -= I_ne * (ix - ix_sw) * cot;
+          }
+          if (iy_sw >= 0 && iy_sw <= H - 1 && ix_sw >= 0 && ix_sw <= W - 1) {
+              int offset = base_idx + iy_sw * h_stride + ix_sw * w_stride;
+              atomic_fetch_add_explicit(&x_grad[offset], sw * cot, memory_order_relaxed);

-                    T I_nw = x[offset];
-                    gix -= I_nw * (iy_se - iy) * cot;
-                    giy -= I_nw * (ix_se - ix) * cot;
-                }
-                if (iy_ne >= 0 && iy_ne <= H - 1 && ix_ne >= 0 && ix_ne <= W - 1) {
-                    int offset = base_idx + iy_ne * h_stride + ix_ne * w_stride;
-                    atomic_fetch_add_explicit(&x_grad[offset], ne * cot, memory_order_relaxed);
+              T I_sw = x[offset];
+              gix -= I_sw * (iy - iy_ne) * cot;
+              giy += I_sw * (ix_ne - ix) * cot;
+          }
+          if (iy_se >= 0 && iy_se <= H - 1 && ix_se >= 0 && ix_se <= W - 1) {
+              int offset = base_idx + iy_se * h_stride + ix_se * w_stride;
+              atomic_fetch_add_explicit(&x_grad[offset], se * cot, memory_order_relaxed);

-                    T I_ne = x[offset];
-                    gix += I_ne * (iy_sw - iy) * cot;
-                    giy -= I_ne * (ix - ix_sw) * cot;
-                }
-                if (iy_sw >= 0 && iy_sw <= H - 1 && ix_sw >= 0 && ix_sw <= W - 1) {
-                    int offset = base_idx + iy_sw * h_stride + ix_sw * w_stride;
-                    atomic_fetch_add_explicit(&x_grad[offset], sw * cot, memory_order_relaxed);
+              T I_se = x[offset];
+              gix += I_se * (iy - iy_nw) * cot;
+              giy += I_se * (ix - ix_nw) * cot;
+          }
+      }

-                    T I_sw = x[offset];
-                    gix -= I_sw * (iy - iy_ne) * cot;
-                    giy += I_sw * (ix_ne - ix) * cot;
-                }
-                if (iy_se >= 0 && iy_se <= H - 1 && ix_se >= 0 && ix_se <= W - 1) {
-                    int offset = base_idx + iy_se * h_stride + ix_se * w_stride;
-                    atomic_fetch_add_explicit(&x_grad[offset], se * cot, memory_order_relaxed);
+      T gix_mult = W / 2;
+      T giy_mult = H / 2;

-                    T I_se = x[offset];
-                    gix += I_se * (iy - iy_nw) * cot;
-                    giy += I_se * (ix - ix_nw) * cot;
-                }
-            }
+      // Reduce across each simdgroup first.
+      // This is much faster than relying purely on atomics.
+      gix = simd_sum(gix);
+      giy = simd_sum(giy);

-            T gix_mult = W / 2;
-            T giy_mult = H / 2;
+      if (thread_index_in_simdgroup == 0) {
+          atomic_fetch_add_explicit(&grid_grad[grid_idx], gix * gix_mult, memory_order_relaxed);
+          atomic_fetch_add_explicit(&grid_grad[grid_idx + 1], giy * giy_mult, memory_order_relaxed);
+      }
+  """
+  kernel = mx.fast.metal_kernel(
+      name="grid_sample_grad",
+      input_names=["x", "grid", "cotangent"],
+      output_names=["x_grad", "grid_grad"],
+      source=source,
+      atomic_outputs=True,
+  )

-            // Reduce across each simdgroup first.
-            // This is much faster than relying purely on atomics.
-            gix = simd_sum(gix);
-            giy = simd_sum(giy);
+  @grid_sample.vjp
+  def grid_sample_vjp(primals, cotangent, _):
+      x, grid = primals
+      B, _, _, C = x.shape
+      _, gN, gM, D = grid.shape

-            if (thread_index_in_simdgroup == 0) {
-                atomic_fetch_add_explicit(&grid_grad[grid_idx], gix * gix_mult, memory_order_relaxed);
-                atomic_fetch_add_explicit(&grid_grad[grid_idx + 1], giy * giy_mult, memory_order_relaxed);
-            }
-        """
-        kernel = mx.fast.metal_kernel(
-            name="grid_sample_grad",
-            input_names=["x", "grid", "cotangent"],
-            output_names=["x_grad", "grid_grad"],
-            source=source,
-            atomic_outputs=True,
-        )
-        # pad the output channels to simd group size
-        # so that our `simd_sum`s don't overlap.
-        simdgroup_size = 32
-        C_padded = (C + simdgroup_size - 1) // simdgroup_size * simdgroup_size
-        grid_size = B * gN * gM * C_padded
-        outputs = kernel(
-            inputs=[x, grid, cotangent],
-            template=[("T", x.dtype)],
-            output_shapes=[x.shape, grid.shape],
-            output_dtypes=[x.dtype, x.dtype],
-            grid=(grid_size, 1, 1),
-            threadgroup=(256, 1, 1),
-            init_value=0,
-        )
-        return outputs[0], outputs[1]
+      assert D == 2, "Last dim of `grid` must be size 2."
+
+      # pad the output channels to simd group size
+      # so that our `simd_sum`s don't overlap.
+      simdgroup_size = 32
+      C_padded = (C + simdgroup_size - 1) // simdgroup_size * simdgroup_size
+      grid_size = B * gN * gM * C_padded
+      outputs = kernel(
+          inputs=[x, grid, cotangent],
+          template=[("T", x.dtype)],
+          output_shapes=[x.shape, grid.shape],
+          output_dtypes=[x.dtype, x.dtype],
+          grid=(grid_size, 1, 1),
+          threadgroup=(256, 1, 1),
+          init_value=0,
+      )
+      return outputs[0], outputs[1]

 There's an even larger speed up for the vjp:

--- a/docs/src/dev/extensions.rst
+++ b/docs/src/dev/extensions.rst
@@ -397,11 +397,11 @@ below.
        std::ostringstream kname;
        kname << "axpby_" << "general_" << type_to_name(out);

-        // Make sure the metal library is available
-        d.register_library("mlx_ext");
+        // Load the metal library
+        auto lib = d.get_library("mlx_ext");

        // Make a kernel from this metal library
-        auto kernel = d.get_kernel(kname.str(), "mlx_ext");
+        auto kernel = d.get_kernel(kname.str(), lib);

        // Prepare to encode kernel
        auto& compute_encoder = d.get_command_encoder(s.index);
--- a/docs/src/install.rst
+++ b/docs/src/install.rst
@@ -30,6 +30,16 @@ MLX is also available on conda-forge. To install MLX with conda do:

   conda install conda-forge::mlx

+CUDA
+^^^^
+
+MLX has a CUDA backend which you can use on any Linux platform with CUDA 12
+and SM 7.0 (Volta) and up. To install MLX with CUDA support, run:
+
+.. code-block:: shell
+
+    pip install mlx-cuda
+

 Troubleshooting
 ^^^^^^^^^^^^^^^
@@ -65,6 +75,8 @@ Build Requirements
 Python API
 ^^^^^^^^^^

+.. _python install:
+
 To build and install the MLX python library from source, first, clone MLX from
 `its GitHub repo <https://github.com/ml-explore/mlx>`_:

@@ -76,20 +88,20 @@ Then simply build and install MLX using pip:

 .. code-block:: shell

-  CMAKE_BUILD_PARALLEL_LEVEL=8 pip install .
+  pip install .

 For developing, install the package with development dependencies, and use an
 editable install:

 .. code-block:: shell

-  CMAKE_BUILD_PARALLEL_LEVEL=8 pip install -e ".[dev]"
+  pip install -e ".[dev]"

 Once the development dependencies are installed, you can build faster with:

 .. code-block:: shell

- CMAKE_BUILD_PARALLEL_LEVEL=8 python setup.py build_ext --inplace
+ python setup.py build_ext --inplace

 Run the tests with:

@@ -107,6 +119,8 @@ IDE:
 C++ API
 ^^^^^^^

+.. _cpp install:
+
 Currently, MLX must be built and installed from source.

 Similarly to the python library, to build and install the MLX C++ library start
@@ -185,6 +199,7 @@ should point to the path to the built metal library.

      xcrun -sdk macosx --show-sdk-version

+
 Binary Size Minimization
 ~~~~~~~~~~~~~~~~~~~~~~~~

@@ -213,6 +228,50 @@ be anwywhere from a few hundred millisecond to a few seconds depending on the
 application. Once a kernel is compiled, it will be cached by the system. The
 Metal kernel cache persists across reboots.

+Linux
+^^^^^
+
+To build from source on Linux (CPU only), install the BLAS and LAPACK headers.
+For example on Ubuntu, run the following:
+
+.. code-block:: shell
+
+   apt-get update -y
+   apt-get install libblas-dev liblapack-dev liblapacke-dev -y
+
+From here follow the instructions to install either the :ref:`Python <python
+install>` or :ref:`C++ <cpp install>` APIs.
+
+CUDA
+^^^^
+
+To build from source on Linux with CUDA, install the BLAS and LAPACK headers
+and the CUDA toolkit. For example on Ubuntu, run the following:
+
+.. code-block:: shell
+
+   wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
+   dpkg -i cuda-keyring_1.1-1_all.deb
+   apt-get update -y
+   apt-get -y install cuda-toolkit-12-9
+   apt-get install libblas-dev liblapack-dev liblapacke-dev -y
+
+
+When building either the Python or C++ APIs make sure to pass the cmake flag
+``MLX_BUILD_CUDA=ON``. For example, to build the Python API run:
+
+.. code-block:: shell
+
+  CMAKE_ARGS="-DMLX_BUILD_CUDA=ON" pip install -e ".[dev]"
+
+To build the C++ package run:
+
+.. code-block:: shell
+
+   mkdir -p build && cd build
+   cmake .. -DMLX_BUILD_CUDA=ON && make -j
+
+
 Troubleshooting
 ^^^^^^^^^^^^^^^

--- a/docs/src/python/array.rst
+++ b/docs/src/python/array.rst
@@ -19,6 +19,8 @@ Array
    array.ndim
    array.shape
    array.size
+    array.real
+    array.imag
    array.abs
    array.all
    array.any
--- a/docs/src/python/fft.rst
+++ b/docs/src/python/fft.rst
@@ -20,3 +20,5 @@ FFT
  irfft2
  rfftn
  irfftn
+  fftshift
+  ifftshift
--- a/docs/src/python/linalg.rst
+++ b/docs/src/python/linalg.rst
@@ -16,6 +16,8 @@ Linear Algebra
    cross
    qr
    svd
+    eigvals
+    eig
    eigvalsh
    eigh
    lu
--- a/docs/src/usage/indexing.rst
+++ b/docs/src/usage/indexing.rst
@@ -107,6 +107,16 @@ same array:
  >>> a
  array([1, 2, 0], dtype=int32)

+
+Note, unlike NumPy, updates to the same location are nondeterministic:
+
+.. code-block:: shell
+
+  >>> a = mx.array([1, 2, 3])
+  >>> a[[0, 0]] = mx.array([4, 5])
+
+The first element of ``a`` could be ``4`` or ``5``.
+
 Transformations of functions which use in-place updates are allowed and work as
 expected. For example:

--- a/examples/extensions/axpby/axpby.cpp
+++ b/examples/extensions/axpby/axpby.cpp
@@ -172,11 +172,11 @@ void Axpby::eval_gpu(
  kname << (contiguous_kernel ? "contiguous_" : "general_");
  kname << type_to_name(out);

-  // Make sure the metal library is available
-  d.register_library("mlx_ext");
+  // Load the metal library
+  auto lib = d.get_library("mlx_ext");

  // Make a kernel from this metal library
-  auto kernel = d.get_kernel(kname.str(), "mlx_ext");
+  auto kernel = d.get_kernel(kname.str(), lib);

  // Prepare to encode kernel
  auto& compute_encoder = d.get_command_encoder(s.index);
--- a/mlx/CMakeLists.txt
+++ b/mlx/CMakeLists.txt
@@ -5,6 +5,7 @@ target_sources(
          ${CMAKE_CURRENT_SOURCE_DIR}/compile.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/device.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/dtype.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/dtype_utils.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/export.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/einsum.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/fast.cpp
@@ -20,7 +21,7 @@ target_sources(
          ${CMAKE_CURRENT_SOURCE_DIR}/backend/metal/metal.h)

 # Define MLX_VERSION only in the version.cpp file.
-add_library(mlx_version STATIC ${CMAKE_CURRENT_SOURCE_DIR}/version.cpp)
+add_library(mlx_version OBJECT ${CMAKE_CURRENT_SOURCE_DIR}/version.cpp)
 target_compile_definitions(mlx_version PRIVATE MLX_VERSION="${MLX_VERSION}")
 target_link_libraries(mlx PRIVATE $<BUILD_INTERFACE:mlx_version>)

@@ -48,5 +49,19 @@ add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/io)
 if(MLX_BUILD_METAL)
  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backend/metal)
 else()
-  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backend/no_metal)
+  target_sources(mlx
+                 PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/backend/metal/no_metal.cpp)
+endif()
+
+if(MLX_BUILD_CUDA)
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backend/cuda)
+else()
+  target_sources(mlx
+                 PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/backend/cuda/no_cuda.cpp)
+endif()
+
+if(MLX_BUILD_METAL OR MLX_BUILD_CUDA)
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backend/gpu)
+else()
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backend/no_gpu)
 endif()
--- a/mlx/array.h
+++ b/mlx/array.h
@@ -224,6 +224,10 @@ class array {
    // Not copyable
    Data(const Data& d) = delete;
    Data& operator=(const Data& d) = delete;
+    Data(Data&& o) : buffer(o.buffer), d(o.d) {
+      o.buffer = allocator::Buffer(nullptr);
+      o.d = [](allocator::Buffer) {};
+    }
    ~Data() {
      d(buffer);
    }
@@ -339,11 +343,11 @@ class array {
    return allocator::allocator().size(buffer());
  }

-  // Return a copy of the shared pointer
-  // to the array::Data struct
-  std::shared_ptr<Data> data_shared_ptr() const {
+  // Return the shared pointer to the array::Data struct
+  const std::shared_ptr<Data>& data_shared_ptr() const {
    return array_desc_->data;
  }
+
  // Return a raw pointer to the arrays data
  template <typename T>
  T* data() {
@@ -356,7 +360,7 @@ class array {
  }

  enum Status {
-    // The ouptut of a computation which has not been scheduled.
+    // The output of a computation which has not been scheduled.
    // For example, the status of `x` in `auto x = a + b`.
    unscheduled,

--- a/mlx/backend/common/CMakeLists.txt
+++ b/mlx/backend/common/CMakeLists.txt
@@ -1,6 +1,7 @@
 target_sources(
  mlx
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/compiled.cpp
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/broadcasting.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/compiled.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/common.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/load.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/reduce.cpp
--- a/mlx/backend/common/broadcasting.cpp
+++ b/mlx/backend/common/broadcasting.cpp
@@ -0,0 +1,30 @@
+// Copyright © 2024 Apple Inc.
+
+#include "mlx/backend/common/broadcasting.h"
+#include "mlx/backend/common/utils.h"
+
+namespace mlx::core {
+
+// Set up `out` as a broadcast of `in` through copy_shared_buffer(), using
+// stride 0 for every axis of `in` with size 1 and for every leading axis.
+void broadcast(const array& in, array& out) {
+  // An empty output gets no data at all.
+  if (out.size() == 0) {
+    out.set_data(nullptr);
+    return;
+  }
+  // Axes that only `out` has (the leading `diff` ones) keep the initial 0.
+  Strides strides(out.ndim(), 0);
+  int diff = out.ndim() - in.ndim();
+  for (int i = in.ndim() - 1; i >= 0; --i) {
+    strides[i + diff] = (in.shape()[i] == 1) ? 0 : in.strides()[i];
+  }
+  auto flags = in.flags();
+  // A larger output does not inherit the contiguity flags of the input.
+  if (out.size() > in.size()) {
+    flags.row_contiguous = flags.col_contiguous = false;
+  }
+  out.copy_shared_buffer(in, strides, flags, in.data_size());
+}
+
+} // namespace mlx::core
--- a/mlx/backend/common/broadcasting.h
+++ b/mlx/backend/common/broadcasting.h
@@ -0,0 +1,11 @@
+// Copyright © 2024 Apple Inc.
+
+#pragma once
+
+#include "mlx/array.h"
+
+namespace mlx::core {
+
+void broadcast(const array& in, array& out);
+
+} // namespace mlx::core
--- a/mlx/backend/common/buffer_cache.h
+++ b/mlx/backend/common/buffer_cache.h
@@ -0,0 +1,183 @@
+// Copyright © 2025 Apple Inc.
+
+#pragma once
+
+#include <algorithm>
+#include <cassert>
+#include <functional>
+#include <map>
+#include <utility>
+
+namespace mlx::core {
+
+// A size-indexed cache of raw buffers of type T.
+//
+// Cached buffers are indexed by size in a multimap and are also linked in a
+// doubly linked list: recycle_to_cache() inserts at the head and
+// release_cached_buffers() frees from the tail. No locking is done here.
+template <typename T>
+class BufferCache {
+ public:
+  // page_size: used by reuse_from_cache() to bound how much larger than the
+  //   request a reused buffer may be.
+  // get_size: returns the size of a buffer; this is its key in the pool.
+  // free: called on every buffer the cache releases.
+  BufferCache(
+      size_t page_size,
+      std::function<size_t(T*)> get_size,
+      std::function<void(T*)> free)
+      : page_size_(page_size),
+        get_size_(std::move(get_size)),
+        free_(std::move(free)) {}
+
+  // Releases every buffer still held by the cache.
+  ~BufferCache() {
+    clear();
+  }
+
+  BufferCache(const BufferCache&) = delete;
+  BufferCache& operator=(const BufferCache&) = delete;
+
+  // Take the smallest cached buffer whose size is at least `size` out of the
+  // cache and return it. Returns nullptr if there is none, or if that buffer
+  // is min(2 * size, size + 2 * page_size) or larger.
+  T* reuse_from_cache(size_t size) {
+    // Find the closest buffer in pool.
+    auto it = buffer_pool_.lower_bound(size);
+    if (it == buffer_pool_.end() ||
+        it->first >= std::min(2 * size, size + 2 * page_size_)) {
+      return nullptr;
+    }
+
+    // Collect from the cache.
+    T* buf = it->second->buf;
+    pool_size_ -= it->first;
+
+    // Remove from record.
+    remove_from_list(it->second);
+    buffer_pool_.erase(it);
+    return buf;
+  }
+
+  // Add `buf` to the cache as the most recently recycled buffer.
+  void recycle_to_cache(T* buf) {
+    assert(buf);
+    // Add to cache.
+    BufferHolder* bh = new BufferHolder(buf);
+    add_at_head(bh);
+    size_t size = get_size_(buf);
+    pool_size_ += size;
+    buffer_pool_.emplace(size, bh);
+  }
+
+  // Free buffers from the tail of the list until at least
+  // `min_bytes_to_free` has been freed or the cache is empty. If the request
+  // is at least 90% of the cached total, the whole cache is cleared instead.
+  // Returns the number of buffers freed.
+  int release_cached_buffers(size_t min_bytes_to_free) {
+    if (min_bytes_to_free >= 0.9 * pool_size_) {
+      return clear();
+    } else {
+      int n_release = 0;
+      size_t total_bytes_freed = 0;
+
+      while (tail_ && (total_bytes_freed < min_bytes_to_free)) {
+        // Release buffer.
+        size_t size = get_size_(tail_->buf);
+        total_bytes_freed += size;
+        free_(tail_->buf);
+        n_release++;
+
+        // Remove from record. Several buffers may share a size, so look for
+        // the entry that points at the tail holder.
+        auto its = buffer_pool_.equal_range(size);
+        auto it = std::find_if(its.first, its.second, [this](const auto& el) {
+          return el.second == tail_;
+        });
+        // find_if returns its.second on failure, which need not be end().
+        assert(it != its.second);
+        buffer_pool_.erase(it);
+        remove_from_list(tail_);
+      }
+
+      pool_size_ -= total_bytes_freed;
+      return n_release;
+    }
+  }
+
+  // Free every cached buffer and return how many were freed.
+  int clear() {
+    int n_release = 0;
+    for (auto& [size, holder] : buffer_pool_) {
+      free_(holder->buf);
+      n_release++;
+      delete holder;
+    }
+    buffer_pool_.clear();
+    pool_size_ = 0;
+    head_ = nullptr;
+    tail_ = nullptr;
+    return n_release;
+  }
+
+  // Sum of the sizes of all buffers currently in the cache.
+  size_t cache_size() const {
+    return pool_size_;
+  }
+
+  size_t page_size() const {
+    return page_size_;
+  }
+
+ private:
+  // List node wrapping one cached buffer.
+  struct BufferHolder {
+   public:
+    explicit BufferHolder(T* buf_) : buf(buf_) {}
+
+    BufferHolder* prev{nullptr};
+    BufferHolder* next{nullptr};
+    T* buf;
+  };
+
+  void add_at_head(BufferHolder* to_add) {
+    if (!head_) {
+      head_ = to_add;
+      tail_ = to_add;
+    } else {
+      head_->prev = to_add;
+      to_add->next = head_;
+      head_ = to_add;
+    }
+  }
+
+  // Unlink `to_remove` from the list and delete the holder (not its buffer).
+  void remove_from_list(BufferHolder* to_remove) {
+    if (to_remove->prev && to_remove->next) { // if middle
+      to_remove->prev->next = to_remove->next;
+      to_remove->next->prev = to_remove->prev;
+    } else if (to_remove->prev && to_remove == tail_) { // if tail
+      tail_ = to_remove->prev;
+      tail_->next = nullptr;
+    } else if (to_remove == head_ && to_remove->next) { // if head
+      head_ = to_remove->next;
+      head_->prev = nullptr;
+    } else if (to_remove == head_ && to_remove == tail_) { // if only element
+      head_ = nullptr;
+      tail_ = nullptr;
+    }
+
+    delete to_remove;
+  }
+
+  std::multimap<size_t, BufferHolder*> buffer_pool_;
+  BufferHolder* head_{nullptr};
+  BufferHolder* tail_{nullptr};
+  size_t pool_size_{0};
+
+  const size_t page_size_;
+  std::function<size_t(T*)> get_size_;
+  std::function<void(T*)> free_;
+};
+
+} // namespace mlx::core
--- a/mlx/backend/common/common.cpp
+++ b/mlx/backend/common/common.cpp
@@ -1,6 +1,7 @@
 // Copyright © 2024 Apple Inc.
 #include <cassert>

+#include "mlx/backend/common/broadcasting.h"
 #include "mlx/backend/common/utils.h"
 #include "mlx/primitives.h"

@@ -42,23 +43,6 @@ void AsStrided::eval(const std::vector<array>& inputs, array& out) {
  return out.copy_shared_buffer(in, strides_, flags, data_size, offset_);
 }

-void broadcast(const array& in, array& out) {
-  if (out.size() == 0) {
-    out.set_data(nullptr);
-    return;
-  }
-  Strides strides(out.ndim(), 0);
-  int diff = out.ndim() - in.ndim();
-  for (int i = in.ndim() - 1; i >= 0; --i) {
-    strides[i + diff] = (in.shape()[i] == 1) ? 0 : in.strides()[i];
-  }
-  auto flags = in.flags();
-  if (out.size() > in.size()) {
-    flags.row_contiguous = flags.col_contiguous = false;
-  }
-  out.copy_shared_buffer(in, strides, flags, in.data_size());
-}
-
 void Broadcast::eval(const std::vector<array>& inputs, array& out) {
  broadcast(inputs[0], out);
 }
--- a/mlx/backend/common/compiled.cpp
+++ b/mlx/backend/common/compiled.cpp
@@ -1,8 +1,7 @@
 // Copyright © 2023-2024 Apple Inc.

 #include "mlx/backend/common/compiled.h"
-#include "mlx/graph_utils.h"
-#include "mlx/primitives.h"
+#include "mlx/backend/common/utils.h"
 #include "mlx/utils.h"

 namespace mlx::core {
@@ -15,6 +14,8 @@ void print_constant(std::ostream& os, const array& x) {
      return print_float_constant<float16_t>(os, x);
    case bfloat16:
      return print_float_constant<bfloat16_t>(os, x);
+    case float64:
+      return print_float_constant<double>(os, x);
    case complex64:
      return print_complex_constant<complex64_t>(os, x);
    case int8:
@@ -51,6 +52,8 @@ std::string get_type_string(Dtype d) {
      return "float16_t";
    case bfloat16:
      return "bfloat16_t";
+    case float64:
+      return "double";
    case complex64:
      return "complex64_t";
    case bool_:
@@ -79,55 +82,6 @@ std::string get_type_string(Dtype d) {
  }
 }

-std::string build_lib_name(
-    const std::vector<array>& inputs,
-    const std::vector<array>& outputs,
-    const std::vector<array>& tape,
-    const std::unordered_set<uintptr_t>& constant_ids) {
-  NodeNamer namer;
-  std::ostringstream os;
-  std::ostringstream constant_hasher;
-
-  // Fill the input names. This is not really necessary, I just like having A,
-  // B, C, ... as the inputs.
-  for (auto& x : inputs) {
-    namer.get_name(x);
-  }
-
-  // The primitives describing the tape. For unary and binary primitives this
-  // must be enough to describe the full computation.
-  for (auto& a : tape) {
-    // name and type of output
-    os << namer.get_name(a) << kindof(a.dtype()) << a.itemsize();
-    // computation performed
-    a.primitive().print(os);
-    // name of inputs to the function
-    for (auto& inp : a.inputs()) {
-      os << namer.get_name(inp);
-    }
-  }
-  os << "_";
-
-  for (auto& x : inputs) {
-    if (constant_ids.find(x.id()) != constant_ids.end()) {
-      os << "C";
-      print_constant(constant_hasher, x);
-    } else {
-      os << (is_scalar(x) ? "S" : "V");
-    }
-  }
-  os << "_";
-  for (auto& x : inputs) {
-    if (constant_ids.find(x.id()) != constant_ids.end()) {
-      continue;
-    }
-    os << kindof(x.dtype()) << x.itemsize();
-  }
-  os << "_" << std::hash<std::string>{}(constant_hasher.str());
-
-  return os.str();
-}
-
 bool compiled_check_contiguity(
    const std::vector<array>& inputs,
    const Shape& shape) {
@@ -159,8 +113,7 @@ bool compiled_check_contiguity(
 void compiled_allocate_outputs(
     const std::vector<array>& inputs,
     std::vector<array>& outputs,
-    const std::vector<array>& inputs_,
-    const std::unordered_set<uintptr_t>& constant_ids_,
+    const std::function<bool(size_t)>& is_constant,
     bool contiguous) {
   if (contiguous) {
     int o = 0;
@@ -175,8 +128,7 @@ void compiled_allocate_outputs(
       // - Donatable
       // - Not a constant
       if (in.itemsize() == outputs[o].itemsize() && !is_scalar(in) &&
-          in.is_donatable() &&
-          constant_ids_.find(inputs_[i].id()) == constant_ids_.end()) {
+          in.is_donatable() && !is_constant(i)) {
         outputs[o++].copy_shared_buffer(in);
       }
       // Get representative input flags to properly set non-donated outputs
@@ -204,7 +156,7 @@ void compiled_allocate_outputs(
       // - Not a constant
       if (in.flags().row_contiguous && in.size() == outputs[o].size() &&
           in.itemsize() == outputs[o].itemsize() && in.is_donatable() &&
-          constant_ids_.find(inputs_[i].id()) == constant_ids_.end()) {
+          !is_constant(i)) {
         outputs[o].copy_shared_buffer(
             in, outputs[o].strides(), in.flags(), in.data_size());
         o++;
@@ -216,4 +168,74 @@ void compiled_allocate_outputs(
  }
 }

+// Computes the (possibly collapsed) iteration shape and per-array strides
+// used to run a compiled kernel over `out`.
+//
+// Returns {true, out.shape(), {}} when compiled_check_contiguity() accepts
+// the inputs. Otherwise returns {false, shape, strides} as produced by
+// collapse_contiguous_dims(), where strides[0] belongs to `out` and the
+// remaining entries belong, in order, to the inputs that are neither
+// constants (per `is_constant`) nor scalars.
+std::tuple<bool, Shape, std::vector<Strides>> compiled_collapse_contiguous_dims(
+    const std::vector<array>& inputs,
+    const array& out,
+    const std::function<bool(size_t)>& is_constant) {
+  const Shape& shape = out.shape();
+  if (compiled_check_contiguity(inputs, shape)) {
+    return {true, shape, {}};
+  }
+
+  // Stride given to an input along an output axis on which it is broadcast:
+  // the output's own stride when that axis has extent 1, and 0 otherwise.
+  auto broadcast_stride = [&](size_t axis) -> Strides::value_type {
+    return shape[axis] == 1 ? out.strides()[axis] : 0;
+  };
+
+  std::vector<Strides> strides_vec{out.strides()};
+  for (size_t idx = 0; idx < inputs.size(); ++idx) {
+    const auto& x = inputs[idx];
+    // Constants and scalars do not get a strides entry.
+    if (is_constant(idx) || is_scalar(x)) {
+      continue;
+    }
+
+    // Expand the input's strides to the rank of the output.
+    Strides xstrides;
+    const size_t lead = shape.size() - x.ndim();
+    // Output axes the input does not have at all are broadcast.
+    for (size_t axis = 0; axis < lead; ++axis) {
+      xstrides.push_back(broadcast_stride(axis));
+    }
+    // Axes of the input itself: extent 1 is broadcast, anything else keeps
+    // the input's stride.
+    for (size_t d = 0; d < x.ndim(); ++d) {
+      const size_t axis = lead + d;
+      if (x.shape(d) == 1) {
+        xstrides.push_back(broadcast_stride(axis));
+      } else {
+        xstrides.push_back(x.strides()[d]);
+      }
+    }
+    strides_vec.push_back(std::move(xstrides));
+  }
+
+  auto [collapsed_shape, collapsed_strides] =
+      collapse_contiguous_dims(shape, strides_vec, INT32_MAX);
+  return {false, std::move(collapsed_shape), std::move(collapsed_strides)};
+}
+
+// Returns true when indexing needs more than 32 bits: the largest
+// data_size() among `inputs` for a contiguous kernel, or the largest size()
+// among `outputs` otherwise, exceeds UINT32_MAX.
+bool compiled_use_large_index(
+    const std::vector<array>& inputs,
+    const std::vector<array>& outputs,
+    bool contiguous) {
+  size_t largest = 0;
+  if (contiguous) {
+    for (const auto& in : inputs) {
+      largest = std::max(largest, in.data_size());
+    }
+  } else {
+    for (const auto& o : outputs) {
+      largest = std::max(largest, o.size());
+    }
+  }
+  return largest > UINT32_MAX;
+}
+
 } // namespace mlx::core
--- a/mlx/backend/common/compiled.h
+++ b/mlx/backend/common/compiled.h
@@ -1,9 +1,8 @@
 // Copyright © 2023-2024 Apple Inc.
 #pragma once

+#include <functional>
 #include <iomanip>
-#include <sstream>
-#include <unordered_set>

 #include "mlx/array.h"
 #include "mlx/primitives.h"
@@ -14,19 +13,17 @@ inline bool is_static_cast(const Primitive& p) {
  return (typeid(p) == typeid(Broadcast) || typeid(p) == typeid(AsType));
 }

-std::string build_lib_name(
-    const std::vector<array>& inputs,
-    const std::vector<array>& outputs,
-    const std::vector<array>& tape,
-    const std::unordered_set<uintptr_t>& constant_ids);
-
 std::string get_type_string(Dtype d);

 template <typename T>
 void print_float_constant(std::ostream& os, const array& x) {
  auto old_precision = os.precision();
-  os << std::setprecision(std::numeric_limits<float>::digits10 + 1)
-     << x.item<T>() << std::setprecision(old_precision);
+  if constexpr (std::is_same_v<T, double>) {
+    os << std::setprecision(std::numeric_limits<double>::digits10 + 1);
+  } else {
+    os << std::setprecision(std::numeric_limits<float>::digits10 + 1);
+  }
+  os << x.item<T>() << std::setprecision(old_precision);
 }

 template <typename T>
@@ -60,8 +57,19 @@ bool compiled_check_contiguity(
 void compiled_allocate_outputs(
    const std::vector<array>& inputs,
    std::vector<array>& outputs,
-    const std::vector<array>& inputs_,
-    const std::unordered_set<uintptr_t>& constant_ids_,
+    const std::function<bool(size_t)>& is_constant,
+    bool contiguous);
+
+// Collapse contiguous dims ignoring scalars and constants.
+std::tuple<bool, Shape, std::vector<Strides>> compiled_collapse_contiguous_dims(
+    const std::vector<array>& inputs,
+    const array& out,
+    const std::function<bool(size_t)>& is_constant);
+
+// Return whether the kernel should use large index.
+bool compiled_use_large_index(
+    const std::vector<array>& inputs,
+    const std::vector<array>& outputs,
    bool contiguous);

 } // namespace mlx::core
--- a/mlx/backend/common/copy.h
+++ b/mlx/backend/common/copy.h
@@ -2,7 +2,7 @@

 #pragma once

-#include "mlx/array.h"
+#include "mlx/backend/common/utils.h"

 namespace mlx::core {

@@ -26,7 +26,7 @@ inline bool set_copy_output_data(const array& in, array& out, CopyType ctype) {
  if (ctype == CopyType::Vector) {
    // If the input is donateable, we are doing a vector copy and the types
    // have the same size, then the input buffer can hold the output.
-    if (in.is_donatable() && in.itemsize() == out.itemsize()) {
+    if (is_donatable(in, out)) {
      out.copy_shared_buffer(in);
      return true;
    } else {
--- a/mlx/backend/common/hadamard.h
+++ b/mlx/backend/common/hadamard.h
@@ -99,7 +99,11 @@ inline std::pair<int, int> decompose_hadamard(int n) {
          "[hadamard] Only supports n = m*2^k where m in (1, 12, 20, 28).");
    }
  }
+  if (n > (1 << 26)) {
+    throw std::invalid_argument(
+        "[hadamard] Only supports n = m*2^k where k <= 26");
+  }
  return {n, m};
 }

-} // namespace mlx::core
+} // namespace mlx::core
--- a/mlx/backend/common/matmul.h
+++ b/mlx/backend/common/matmul.h
@@ -0,0 +1,67 @@
+// Copyright © 2025 Apple Inc.
+
+#pragma once
+
+#include "mlx/backend/common/utils.h"
+#include "mlx/utils.h"
+
+#include <sstream>
+
+namespace mlx::core {
+
+// Collapses the leading (batch) dimensions of `a` and `b`, i.e. everything
+// except their last two dimensions, with collapse_contiguous_dims().
+// Returns {batch_shape, a_batch_strides, b_batch_strides}; when there is
+// nothing to batch over the result is {{1}, {0}, {0}}.
+// NOTE(review): `b` is presumably expected to have the same rank as `a`;
+// only `a.ndim()` is checked here.
+inline std::tuple<Shape, Strides, Strides> collapse_batches(
+    const array& a,
+    const array& b) {
+  if (a.ndim() == 2) {
+    return {{1}, {0}, {0}};
+  }
+
+  // Drop the trailing two dimensions of each operand.
+  Shape lead_shape{a.shape().begin(), a.shape().end() - 2};
+  Strides lead_a{a.strides().begin(), a.strides().end() - 2};
+  Strides lead_b{b.strides().begin(), b.strides().end() - 2};
+
+  auto collapsed =
+      collapse_contiguous_dims(lead_shape, std::vector{lead_a, lead_b});
+  auto& batch_shape = std::get<0>(collapsed);
+  auto& per_input = std::get<1>(collapsed);
+
+  auto a_out = per_input[0];
+  auto b_out = per_input[1];
+
+  // Always hand back at least one batch dimension.
+  if (batch_shape.empty()) {
+    batch_shape.push_back(1);
+    a_out.push_back(0);
+    b_out.push_back(0);
+  }
+
+  return std::make_tuple(batch_shape, a_out, b_out);
+}
+
+// Three-operand variant: collapses the leading (batch) dimensions of `a`,
+// `b` and `c` together with collapse_contiguous_dims(). Returns
+// {batch_shape, a_strides, b_strides, c_strides}; when there is nothing to
+// batch over the result is {{1}, {0}, {0}, {0}}.
+// NOTE(review): `b` and `c` are presumably expected to have the same rank
+// as `a`; only `a.ndim()` is checked here.
+inline std::tuple<Shape, Strides, Strides, Strides>
+collapse_batches(const array& a, const array& b, const array& c) {
+  if (a.ndim() == 2) {
+    return {{1}, {0}, {0}, {0}};
+  }
+
+  // Drop the trailing two dimensions of each operand.
+  Shape lead_shape{a.shape().begin(), a.shape().end() - 2};
+  Strides lead_a{a.strides().begin(), a.strides().end() - 2};
+  Strides lead_b{b.strides().begin(), b.strides().end() - 2};
+  Strides lead_c{c.strides().begin(), c.strides().end() - 2};
+
+  auto collapsed = collapse_contiguous_dims(
+      lead_shape, std::vector{lead_a, lead_b, lead_c});
+  auto& batch_shape = std::get<0>(collapsed);
+  auto& per_input = std::get<1>(collapsed);
+
+  auto a_out = per_input[0];
+  auto b_out = per_input[1];
+  auto c_out = per_input[2];
+
+  // Always hand back at least one batch dimension.
+  if (batch_shape.empty()) {
+    batch_shape.push_back(1);
+    a_out.push_back(0);
+    b_out.push_back(0);
+    c_out.push_back(0);
+  }
+
+  return std::make_tuple(batch_shape, a_out, b_out, c_out);
+}
+
+} // namespace mlx::core
--- a/mlx/backend/common/reduce.cpp
+++ b/mlx/backend/common/reduce.cpp
@@ -5,11 +5,9 @@
 namespace mlx::core {

 std::pair<Shape, Strides> shapes_without_reduction_axes(
-    const array& x,
+    Shape shape,
+    Strides strides,
    const std::vector<int>& axes) {
-  auto shape = x.shape();
-  auto strides = x.strides();
-
  for (int i = axes.size() - 1; i >= 0; i--) {
    int a = axes[i];
    shape.erase(shape.begin() + a);
@@ -19,6 +17,15 @@ std::pair<Shape, Strides> shapes_without_reduction_axes(
  return std::make_pair(shape, strides);
 }

+// Convenience overload: takes the shape and strides from `x` and forwards
+// copies of them to the (Shape, Strides, axes) overload.
+std::pair<Shape, Strides> shapes_without_reduction_axes(
+    const array& x,
+    const std::vector<int>& axes) {
+  return shapes_without_reduction_axes(
+      Shape(x.shape()), Strides(x.strides()), axes);
+}
+
 ReductionPlan get_reduction_plan(const array& x, const std::vector<int>& axes) {
  // The data is all there and we are reducing over everything
  if (x.size() == x.data_size() && axes.size() == x.ndim() &&
--- a/mlx/backend/common/reduce.h
+++ b/mlx/backend/common/reduce.h
@@ -51,5 +51,9 @@ ReductionPlan get_reduction_plan(const array& x, const std::vector<int>& axes);
 std::pair<Shape, Strides> shapes_without_reduction_axes(
    const array& x,
    const std::vector<int>& axes);
+std::pair<Shape, Strides> shapes_without_reduction_axes(
+    Shape shape,
+    Strides strides,
+    const std::vector<int>& axes);

 } // namespace mlx::core
--- a/mlx/backend/common/unary.h
+++ b/mlx/backend/common/unary.h
@@ -0,0 +1,26 @@
+// Copyright © 2025 Apple Inc.
+
+#pragma once
+
+#include "mlx/allocator.h"
+#include "mlx/backend/common/utils.h"
+
+namespace mlx::core {
+
+// Sets up the storage of `out` for a unary operation on `in`:
+// - non-contiguous input: allocate out.nbytes() for the output;
+// - contiguous and is_donatable(in, out): the output shares the input's
+//   buffer;
+// - contiguous otherwise: allocate in.data_size() elements of the output's
+//   item size and give the output the input's strides and flags.
+inline void set_unary_output_data(const array& in, array& out) {
+  if (!in.flags().contiguous) {
+    out.set_data(allocator::malloc(out.nbytes()));
+    return;
+  }
+
+  if (is_donatable(in, out)) {
+    out.copy_shared_buffer(in);
+    return;
+  }
+
+  const auto n_elems = in.data_size();
+  out.set_data(
+      allocator::malloc(n_elems * out.itemsize()),
+      n_elems,
+      in.strides(),
+      in.flags());
+}
+
+} // namespace mlx::core
--- a/mlx/backend/common/utils.cpp
+++ b/mlx/backend/common/utils.cpp
@@ -1,9 +1,16 @@
 // Copyright © 2023-2024 Apple Inc.

 #include "mlx/backend/common/utils.h"
+#include "mlx/primitives.h"

 namespace mlx::core {

+// Returns whatever `primitive->print()` writes, captured as a string.
+// `primitive` is dereferenced unconditionally, so it must not be null.
+std::string get_primitive_string(Primitive* primitive) {
+  std::ostringstream buffer;
+  primitive->print(buffer);
+  return buffer.str();
+}
+
 std::tuple<Shape, std::vector<Strides>> collapse_contiguous_dims(
    const Shape& shape,
    const std::vector<Strides>& strides,
@@ -101,4 +108,118 @@ std::pair<Shape, Strides> collapse_contiguous_dims(
  return collapse_contiguous_dims(a.shape(), a.strides(), size_cap);
 }

+// Picks power-of-two thread block dimensions for a problem of extent
+// (dim0, dim1, dim2).
+//
+// The axes are visited round-robin (dim0, dim1, dim2, dim0, ...). Each visit
+// doubles that axis of the block if the doubled size still fits inside the
+// corresponding extent. The search stops as soon as the block holds 2^pow2
+// threads in total, or when a full round doubles nothing.
+//
+// The limit is `pow2` at every step. The previous code compared against a
+// hard-coded 10 inside the round, so a caller passing pow2 < 10 could get a
+// block larger than 2^pow2 and a caller passing pow2 > 10 was capped at
+// 2^10. Results for the default pow2 = 10 are unchanged.
+Dims get_block_dims_common(int dim0, int dim1, int dim2, int pow2 /* = 10 */) {
+  const int dims[3] = {dim0, dim1, dim2};
+  int pows[3] = {0, 0, 0};
+  int sum = 0; // log2 of the total block size chosen so far
+  while (sum < pow2) {
+    int presum = sum;
+    // One round: give each axis, in order, the chance to double once.
+    for (int d = 0; d < 3 && sum < pow2; ++d) {
+      if (dims[d] >= (1 << (pows[d] + 1))) {
+        pows[d]++;
+        sum++;
+      }
+    }
+    // No axis could grow any further.
+    if (sum == presum) {
+      break;
+    }
+  }
+  return std::make_tuple(1ul << pows[0], 1ul << pows[1], 1ul << pows[2]);
+}
+
+// Factors the extents in `shape` into a two-dimensional grid (the third
+// component of the result is always 1).
+//
+// Axes whose stride is 0 are skipped. Every other extent is multiplied into
+// the first grid axis while that product stays below UINT32_MAX and into the
+// second axis otherwise. The larger of the two is returned first.
+// Throws std::runtime_error when either axis ends up above UINT32_MAX.
+Dims get_2d_grid_dims_common(const Shape& shape, const Strides& strides) {
+  size_t grid_x = 1;
+  size_t grid_y = 1;
+  for (size_t axis = 0; axis < shape.size(); ++axis) {
+    // Dims with strides of 0 are ignored as they
+    // correspond to broadcasted dimensions
+    if (strides[axis] == 0) {
+      continue;
+    }
+    const auto extent = shape[axis];
+    size_t& target = (grid_x * extent < UINT32_MAX) ? grid_x : grid_y;
+    target *= extent;
+  }
+  if (grid_x > UINT32_MAX || grid_y > UINT32_MAX) {
+    throw std::runtime_error("Unable to safely factor shape.");
+  }
+  if (grid_x < grid_y) {
+    std::swap(grid_x, grid_y);
+  }
+  return std::make_tuple(
+      static_cast<uint32_t>(grid_x), static_cast<uint32_t>(grid_y), 1);
+}
+
+// Variant of get_2d_grid_dims_common() that factors the extents in `shape`
+// into a 2D grid while dividing the total by `divisor`. `divisor` is taken
+// by value and is reduced as parts of it are cancelled.
+// Throws std::runtime_error when either grid axis ends up above UINT32_MAX.
+Dims get_2d_grid_dims_common(
+    const Shape& shape,
+    const Strides& strides,
+    size_t divisor) {
+  // Compute the 2d grid dimensions such that the total size of the grid is
+  // divided by divisor.
+  size_t grid_x = 1;
+  size_t grid_y = 1;
+  for (int i = 0; i < shape.size(); ++i) {
+    // Axes with a zero stride are skipped entirely.
+    if (strides[i] == 0) {
+      continue;
+    }
+
+    // No need to add this shape we can just remove it from the divisor.
+    if (divisor % shape[i] == 0) {
+      divisor /= shape[i];
+      continue;
+    }
+
+    // Fill grid_x while the product stays below UINT32_MAX; spill the rest
+    // into grid_y.
+    if (grid_x * shape[i] < UINT32_MAX) {
+      grid_x *= shape[i];
+    } else {
+      grid_y *= shape[i];
+    }
+
+    // If what is left of the divisor now divides one of the grid axes
+    // exactly, cancel it there.
+    if (divisor > 1) {
+      if (grid_x % divisor == 0) {
+        grid_x /= divisor;
+        divisor = 1;
+      } else if (grid_y % divisor == 0) {
+        grid_y /= divisor;
+        divisor = 1;
+      }
+    }
+  }
+  if (grid_y > UINT32_MAX || grid_x > UINT32_MAX) {
+    throw std::runtime_error("Unable to safely factor shape.");
+  }
+  // Return the larger axis first.
+  if (grid_y > grid_x) {
+    std::swap(grid_x, grid_y);
+  }
+  // Part of the divisor could not be cancelled: round grid_x up to the next
+  // multiple of what remains.
+  if (divisor > 1) {
+    grid_x = ((grid_x + divisor - 1) / divisor) * divisor;
+  }
+  return std::make_tuple(
+      static_cast<uint32_t>(grid_x), static_cast<uint32_t>(grid_y), 1);
+}
+
+// Returns {grid, block}: `block` comes from get_block_dims_common() with its
+// default limit, and `grid` is the number of such blocks needed along each
+// axis to cover dim0, dim1 and dim2 (rounded up).
+std::pair<Dims, Dims> get_grid_and_block_common(int dim0, int dim1, int dim2) {
+  const Dims block = get_block_dims_common(dim0, dim1, dim2);
+
+  // Number of blocks of size `per_block` needed to cover `extent`.
+  auto blocks_for = [](int extent, uint32_t per_block) {
+    return (extent + per_block - 1) / per_block;
+  };
+
+  return std::make_pair(
+      std::make_tuple(
+          blocks_for(dim0, std::get<0>(block)),
+          blocks_for(dim1, std::get<1>(block)),
+          blocks_for(dim2, std::get<2>(block))),
+      block);
+}
+
 } // namespace mlx::core
--- a/mlx/backend/common/utils.h
+++ b/mlx/backend/common/utils.h
@@ -2,12 +2,15 @@

 #pragma once

+#include <tuple>
 #include <vector>

 #include "mlx/array.h"

 namespace mlx::core {

+std::string get_primitive_string(Primitive* primitive);
+
 inline int64_t
 elem_to_loc(int elem, const Shape& shape, const Strides& strides) {
  int64_t loc = 0;
@@ -70,6 +73,31 @@ std::pair<Shape, Strides> collapse_contiguous_dims(
    const array& a,
    int64_t size_cap = std::numeric_limits<int32_t>::max());

+// Compute the thread block dimensions which fit the given
+// input dimensions.
+// - The thread block dimensions will be powers of two
+// - The thread block size will be at most 2^pow2 threads
+using Dims = std::tuple<uint32_t, uint32_t, uint32_t>;
+Dims get_block_dims_common(int dim0, int dim1, int dim2, int pow2 = 10);
+
+// Computes a 2D grid where each element is < UINT_MAX
+// Assumes:
+// - overall size (product of non-broadcasted dimensions) is < UINT_MAX^2
+// - shape and strides correspond to a contiguous (no holes) but
+//   possibly broadcasted array
+Dims get_2d_grid_dims_common(const Shape& shape, const Strides& strides);
+
+// Same as above but we do an implicit division with divisor.
+// Basically, equivalent to factorizing
+//    Prod(s \forall s in shape if strides[s] > 0) / divisor.
+Dims get_2d_grid_dims_common(
+    const Shape& shape,
+    const Strides& strides,
+    size_t divisor);
+
+// Get both the block and a grid of blocks that covers dim0, dim1 and dim2.
+std::pair<Dims, Dims> get_grid_and_block_common(int dim0, int dim1, int dim2);
+
 struct ContiguousIterator {
  inline void step() {
    int dims = shape_.size();
@@ -165,4 +193,11 @@ void shared_buffer_reshape(
    const array& in,
    const Strides& out_strides,
    array& out);
+
+// Returns a copy of `vec` without the element at position `index`. The
+// vector is taken by value, so the caller's vector is left untouched.
+// `index` must refer to an existing element.
+template <typename T>
+inline std::vector<T> remove_index(std::vector<T> vec, size_t index) {
+  auto victim = vec.begin();
+  std::advance(victim, index);
+  vec.erase(victim);
+  return vec;
+}
+
 } // namespace mlx::core
--- a/mlx/backend/cpu/CMakeLists.txt
+++ b/mlx/backend/cpu/CMakeLists.txt
@@ -40,11 +40,13 @@ add_dependencies(mlx cpu_compiled_preamble)

 target_sources(
  mlx
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/arg_reduce.cpp
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/available.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/arg_reduce.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/binary.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/conv.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/copy.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/distributed.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/eig.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/eigh.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/encoder.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/fft.cpp
--- a/mlx/backend/cpu/arg_reduce.cpp
+++ b/mlx/backend/cpu/arg_reduce.cpp
@@ -14,10 +14,8 @@ template <typename InT, typename OpT>
 void arg_reduce(const array& in, array& out, const OpT& op, int axis) {
  auto axis_size = in.shape()[axis];
  auto axis_stride = in.strides()[axis];
-  Strides strides = in.strides();
-  Shape shape = in.shape();
-  strides.erase(strides.begin() + axis);
-  shape.erase(shape.begin() + axis);
+  Strides strides = remove_index(in.strides(), axis);
+  Shape shape = remove_index(in.shape(), axis);
  auto in_ptr = in.data<InT>();
  auto out_ptr = out.data<uint32_t>();

--- a/mlx/backend/cpu/available.cpp
+++ b/mlx/backend/cpu/available.cpp
@@ -0,0 +1,11 @@
+// Copyright © 2025 Apple Inc.
+
+#include "mlx/backend/cpu/available.h"
+
+namespace mlx::core::cpu {
+
+// Reports whether the CPU backend can be used; this implementation always
+// answers true.
+bool is_available() {
+  return true;
+}
+
+} // namespace mlx::core::cpu
--- a/mlx/backend/cpu/available.h
+++ b/mlx/backend/cpu/available.h
@@ -0,0 +1,9 @@
+// Copyright © 2025 Apple Inc.
+
+#pragma once
+
+namespace mlx::core::cpu {
+
+bool is_available();
+
+} // namespace mlx::core::cpu
--- a/mlx/backend/cpu/binary.cpp
+++ b/mlx/backend/cpu/binary.cpp
@@ -172,9 +172,12 @@ void binary_float(
      case bfloat16:
        binary_op<bfloat16_t, Op>(a, b, out, bopt);
        break;
+      case complex64:
+        binary_op<complex64_t, Op>(a, b, out, bopt);
+        break;
      default:
        throw std::runtime_error(
-            "[binary_float] Only supports non-complex floating point types.");
+            "[binary_float] Only supports floating point types.");
    }
  });
 }
--- a/mlx/backend/cpu/compiled.cpp
+++ b/mlx/backend/cpu/compiled.cpp
@@ -40,7 +40,10 @@ struct CompilerCache {
  std::shared_mutex mtx;
 };

-static CompilerCache cache{};
+// Accessor for the single CompilerCache shared by this translation unit.
+// The cache is a function-local static, so it is constructed the first time
+// cache() is called rather than during static initialization.
+static CompilerCache& cache() {
+  static CompilerCache cache_;
+  return cache_;
+}

 // GPU compile is always available if the GPU is available and since we are in
 // this file CPU compile is also available.
@@ -56,14 +59,16 @@ void* compile(
    const std::string& kernel_name,
    const std::function<std::string(void)>& source_builder) {
  {
-    std::shared_lock lock(cache.mtx);
-    if (auto it = cache.kernels.find(kernel_name); it != cache.kernels.end()) {
+    std::shared_lock lock(cache().mtx);
+    if (auto it = cache().kernels.find(kernel_name);
+        it != cache().kernels.end()) {
      return it->second;
    }
  }

-  std::unique_lock lock(cache.mtx);
-  if (auto it = cache.kernels.find(kernel_name); it != cache.kernels.end()) {
+  std::unique_lock lock(cache().mtx);
+  if (auto it = cache().kernels.find(kernel_name);
+      it != cache().kernels.end()) {
    return it->second;
  }
  std::string source_code = source_builder();
@@ -120,10 +125,10 @@ void* compile(
  }

  // load library
-  cache.libs.emplace_back(shared_lib_path);
+  cache().libs.emplace_back(shared_lib_path);

  // Load function
-  void* fun = dlsym(cache.libs.back().lib, kernel_name.c_str());
+  void* fun = dlsym(cache().libs.back().lib, kernel_name.c_str());
  if (!fun) {
    std::ostringstream msg;
    msg << "[Compile::eval_cpu] Failed to load compiled function "
@@ -131,7 +136,7 @@ void* compile(
        << dlerror();
    throw std::runtime_error(msg.str());
  }
-  cache.kernels.insert({kernel_name, fun});
+  cache().kernels.insert({kernel_name, fun});
  return fun;
 }

@@ -141,18 +146,9 @@ inline void build_kernel(
    const std::vector<array>& inputs,
    const std::vector<array>& outputs,
    const std::vector<array>& tape,
-    const std::unordered_set<uintptr_t>& constant_ids,
+    const std::function<bool(size_t)>& is_constant,
    bool contiguous,
    int ndim) {
-  // All outputs should have the exact same shape and will be row contiguous
-  auto output_shape = outputs[0].shape();
-  auto output_strides = outputs[0].strides();
-
-  // Constants are scalars that are captured by value and cannot change
-  auto is_constant = [&constant_ids](const array& x) {
-    return constant_ids.find(x.id()) != constant_ids.end();
-  };
-
  NodeNamer namer;

 #ifdef _MSC_VER
@@ -165,14 +161,15 @@ inline void build_kernel(

  // Add the input arguments
  int cnt = 0;
-  for (auto& x : inputs) {
-    auto& xname = namer.get_name(x);
-
+  for (size_t i = 0; i < inputs.size(); ++i) {
    // Skip constants from the input list
-    if (is_constant(x)) {
+    if (is_constant(i)) {
      continue;
    }

+    const auto& x = inputs[i];
+    auto& xname = namer.get_name(x);
+
    auto tstr = get_type_string(x.dtype());
    os << "  " << tstr << "* " << xname << " = (" << tstr << "*)args[" << cnt++
       << "];" << std::endl;
@@ -206,10 +203,11 @@ inline void build_kernel(
  }

  // Read the inputs in tmps
-  for (auto& x : inputs) {
+  for (size_t i = 0; i < inputs.size(); ++i) {
+    const auto& x = inputs[i];
    auto& xname = namer.get_name(x);

-    if (is_constant(x)) {
+    if (is_constant(i)) {
      os << "  " << get_type_string(x.dtype()) << " tmp_" << xname << " = ";
      print_constant(os, x);
      os << ";" << std::endl;
@@ -259,8 +257,9 @@ inline void build_kernel(
  } else {
    for (int d = ndim - 1; d >= 0; --d) {
      // Update pointers
-      for (auto& x : inputs) {
-        if (is_constant(x) || is_scalar(x)) {
+      for (size_t i = 0; i < inputs.size(); ++i) {
+        const auto& x = inputs[i];
+        if (is_constant(i) || is_scalar(x)) {
          continue;
        }
        auto& xname = namer.get_name(x);
@@ -282,65 +281,37 @@ inline void build_kernel(
 void Compiled::eval_cpu(
    const std::vector<array>& inputs,
    std::vector<array>& outputs) {
-  if (kernel_lib_.empty()) {
-    kernel_lib_ = build_lib_name(inputs_, outputs_, tape_, constant_ids_);
-  }
-
-  // Figure out which kernel we are using
-  auto& shape = outputs[0].shape();
-  auto contiguous = compiled_check_contiguity(inputs, shape);
  auto& encoder = cpu::get_command_encoder(stream());

-  // Handle all broadcasting and collect function input arguments
+  // Collapse contiguous dims to route to a faster kernel if possible. Also
+  // handle all broadcasting.
+  auto [contiguous, shape, strides] =
+      compiled_collapse_contiguous_dims(inputs, outputs[0], is_constant_);
+
+  // Collect function input arguments.
  std::vector<void*> args;
-  std::vector<std::vector<size_t>> strides;
-  for (int i = 0; i < inputs.size(); i++) {
-    // Skip constants.
-    if (constant_ids_.find(inputs_[i].id()) != constant_ids_.end()) {
+  int strides_index = 1;
+  for (size_t i = 0; i < inputs.size(); ++i) {
+    if (is_constant_(i)) {
      continue;
    }
-    auto& x = inputs[i];
+    const auto& x = inputs[i];
    encoder.set_input_array(x);
    args.push_back((void*)x.data<void>());
-
-    if (contiguous || is_scalar(x)) {
-      continue;
+    if (!contiguous && !is_scalar(x)) {
+      args.push_back(strides[strides_index++].data());
    }
-
-    // Broadcast the input to the output shape.
-    std::vector<size_t> xstrides;
-    int j = 0;
-    for (; j < shape.size() - x.ndim(); j++) {
-      if (shape[j] == 1) {
-        xstrides.push_back(outputs[0].strides()[j]);
-      } else {
-        xstrides.push_back(0);
-      }
-    }
-    for (int i = 0; i < x.ndim(); i++, j++) {
-      if (x.shape(i) == 1) {
-        if (shape[j] == 1) {
-          xstrides.push_back(outputs[0].strides()[j]);
-        } else {
-          xstrides.push_back(0);
-        }
-      } else {
-        xstrides.push_back(x.strides()[i]);
-      }
-    }
-    strides.push_back(std::move(xstrides));
-    args.push_back(strides.back().data());
  }

  // Get the kernel name from the lib
  int ndim = shape.size();
  auto kernel_name = kernel_lib_ + (contiguous ? "_contiguous" : "_strided_");
  if (!contiguous) {
-    kernel_name += std::to_string(shape.size());
+    kernel_name += std::to_string(ndim);
  }

  // Get the function
-  auto fn_ptr = compile(kernel_name, [&]() {
+  auto fn_ptr = compile(kernel_name, [&, contiguous = contiguous]() {
    std::ostringstream kernel;
    kernel << get_kernel_preamble() << std::endl;
    kernel << "extern \"C\"  {" << std::endl;
@@ -350,7 +321,7 @@ void Compiled::eval_cpu(
        inputs_,
        outputs_,
        tape_,
-        constant_ids_,
+        is_constant_,
        contiguous,
        ndim);
    // Close extern "C"
@@ -358,26 +329,22 @@ void Compiled::eval_cpu(
    return kernel.str();
  });

-  compiled_allocate_outputs(
-      inputs, outputs, inputs_, constant_ids_, contiguous);
+  compiled_allocate_outputs(inputs, outputs, is_constant_, contiguous);

  for (auto& x : outputs) {
    args.push_back(x.data<void>());
    encoder.set_output_array(x);
  }
-  Shape out_shape;
  if (!contiguous) {
-    out_shape = outputs[0].shape();
-    args.push_back((void*)out_shape.data());
+    args.push_back((void*)shape.data());
  } else {
    args.push_back((void*)outputs[0].data_size());
  }
  auto fun = (void (*)(void**))fn_ptr;
-  encoder.dispatch(
-      [fun,
-       args = std::move(args),
-       strides = std::move(strides),
-       out_shape = std::move(out_shape)]() mutable { fun(args.data()); });
+  encoder.dispatch([fun,
+                    args = std::move(args),
+                    strides = std::move(strides),
+                    shape = std::move(shape)]() mutable { fun(args.data()); });
 }

 } // namespace mlx::core
--- a/mlx/backend/cpu/conv.cpp
+++ b/mlx/backend/cpu/conv.cpp
@@ -22,7 +22,8 @@ void slow_conv_1D(
    const array& in,
    const array& wt,
    array out,
-    const std::vector<int>& padding,
+    const std::vector<int>& padding_lo,
+    const std::vector<int>& padding_hi,
    const std::vector<int>& wt_strides,
    const std::vector<int>& wt_dilation,
    const std::vector<int>& in_dilation,
@@ -60,7 +61,8 @@ void slow_conv_1D(
                    out_stride_O = out.strides()[2],

                    flip,
-                    padding = padding[0],
+                    padding_lo = padding_lo[0],
+                    padding_hi = padding_hi[0],
                    wt_stride = wt_strides[0],
                    wt_dilation = wt_dilation[0],
                    in_dilation = in_dilation[0]]() mutable {
@@ -77,7 +79,7 @@ void slow_conv_1D(
              const T* wt_ptr = filter_wt_ptr + wh * wt_stride_H;

              int wh_flip = flip ? (wH - wh - 1) : wh;
-              int ih = oh * wt_stride - padding + wh_flip * wt_dilation;
+              int ih = oh * wt_stride - padding_lo + wh_flip * wt_dilation;

              auto ih_div = std::div(ih, in_dilation);

@@ -109,7 +111,8 @@ void slow_conv_2D(
    const array& in,
    const array& wt,
    array out,
-    const std::vector<int>& padding,
+    const std::vector<int>& padding_lo,
+    const std::vector<int>& padding_hi,
    const std::vector<int>& wt_strides,
    const std::vector<int>& wt_dilation,
    const std::vector<int>& in_dilation,
@@ -120,230 +123,235 @@ void slow_conv_2D(
  encoder.set_input_array(wt);
  encoder.set_output_array(out);

-  encoder.dispatch([st_wt_ptr = wt.data<T>(),
-                    st_in_ptr = in.data<T>(),
-                    st_out_ptr = out.data<T>(),
+  encoder.dispatch(
+      [st_wt_ptr = wt.data<T>(),
+       st_in_ptr = in.data<T>(),
+       st_out_ptr = out.data<T>(),

-                    N = in.shape(
-                        0), // Batch size, should be the same as out.shape(0)
-                    iH = 1 +
-                        in_dilation[0] * (in.shape(1) - 1), // Input spatial dim
-                    iW = 1 +
-                        in_dilation[1] * (in.shape(2) - 1), // Input spatial dim
-                    C = in.shape(3), // In channels
-                    oH = out.shape(1), // Output spatial dim
-                    oW = out.shape(2), // Output spatial dim
-                    O = wt.shape(0), // Out channels
-                    wH = wt.shape(1), // Weight spatial dim
-                    wW = wt.shape(2), // Weight spatial dim
+       N = in.shape(0), // Batch size, should be the same as out.shape(0)
+       iH = 1 + in_dilation[0] * (in.shape(1) - 1), // Input spatial dim
+       iW = 1 + in_dilation[1] * (in.shape(2) - 1), // Input spatial dim
+       C = in.shape(3), // In channels
+       oH = out.shape(1), // Output spatial dim
+       oW = out.shape(2), // Output spatial dim
+       O = wt.shape(0), // Out channels
+       wH = wt.shape(1), // Weight spatial dim
+       wW = wt.shape(2), // Weight spatial dim

-                    groups = in.shape(3) / wt.shape(3),
-                    C_per_group = wt.shape(3),
+       groups = in.shape(3) / wt.shape(3),
+       C_per_group = wt.shape(3),

-                    in_stride_N = in.strides()[0],
-                    in_stride_H = in.strides()[1],
-                    in_stride_W = in.strides()[2],
-                    in_stride_C = in.strides()[3],
+       in_stride_N = in.strides()[0],
+       in_stride_H = in.strides()[1],
+       in_stride_W = in.strides()[2],
+       in_stride_C = in.strides()[3],

-                    wt_stride_O = wt.strides()[0],
-                    wt_stride_H = wt.strides()[1],
-                    wt_stride_W = wt.strides()[2],
-                    wt_stride_C = wt.strides()[3],
+       wt_stride_O = wt.strides()[0],
+       wt_stride_H = wt.strides()[1],
+       wt_stride_W = wt.strides()[2],
+       wt_stride_C = wt.strides()[3],

-                    out_stride_N = out.strides()[0],
-                    out_stride_H = out.strides()[1],
-                    out_stride_W = out.strides()[2],
-                    out_stride_O = out.strides()[3],
+       out_stride_N = out.strides()[0],
+       out_stride_H = out.strides()[1],
+       out_stride_W = out.strides()[2],
+       out_stride_O = out.strides()[3],

-                    padding,
-                    wt_strides,
-                    wt_dilation,
-                    in_dilation,
-                    flip]() mutable {
-    bool is_idil_one = in_dilation[0] == 1 && in_dilation[1] == 1;
+       padding_lo,
+       padding_hi,
+       wt_strides,
+       wt_dilation,
+       in_dilation,
+       flip]() mutable {
+        bool is_idil_one = in_dilation[0] == 1 && in_dilation[1] == 1;

-    const int O_per_group = O / groups;
-    auto pt_conv_no_checks = [&](const T* in_ptr,
-                                 const T* wt_ptr,
-                                 T* out_ptr,
-                                 int oh,
-                                 int ow) {
-      out_ptr += oh * out_stride_H + ow * out_stride_W;
-      int ih_base = oh * wt_strides[0] - padding[0];
-      int iw_base = ow * wt_strides[1] - padding[1];
+        const int O_per_group = O / groups;
+        auto pt_conv_no_checks =
+            [&](const T* in_ptr, const T* wt_ptr, T* out_ptr, int oh, int ow) {
+              out_ptr += oh * out_stride_H + ow * out_stride_W;
+              int ih_base = oh * wt_strides[0] - padding_lo[0];
+              int iw_base = ow * wt_strides[1] - padding_lo[1];

-      for (int g = 0; g < groups; ++g) {
-        for (int o = g * O_per_group; o < (g + 1) * O_per_group; ++o) {
-          float r = 0.;
+              for (int g = 0; g < groups; ++g) {
+                for (int o = g * O_per_group; o < (g + 1) * O_per_group; ++o) {
+                  float r = 0.;

-          for (int wh = 0; wh < wH; ++wh) {
-            for (int ww = 0; ww < wW; ++ww) {
-              int wh_flip = flip ? wH - wh - 1 : wh;
-              int ww_flip = flip ? wW - ww - 1 : ww;
-              int ih = ih_base + wh_flip * wt_dilation[0];
-              int iw = iw_base + ww_flip * wt_dilation[1];
+                  for (int wh = 0; wh < wH; ++wh) {
+                    for (int ww = 0; ww < wW; ++ww) {
+                      int wh_flip = flip ? wH - wh - 1 : wh;
+                      int ww_flip = flip ? wW - ww - 1 : ww;
+                      int ih = ih_base + wh_flip * wt_dilation[0];
+                      int iw = iw_base + ww_flip * wt_dilation[1];

-              const T* wt_ptr_pt = wt_ptr + wh * wt_stride_H + ww * wt_stride_W;
-              const T* in_ptr_pt = in_ptr + ih * in_stride_H + iw * in_stride_W;
+                      const T* wt_ptr_pt =
+                          wt_ptr + wh * wt_stride_H + ww * wt_stride_W;
+                      const T* in_ptr_pt =
+                          in_ptr + ih * in_stride_H + iw * in_stride_W;

-              for (int c = g * C_per_group; c < (g + 1) * C_per_group; ++c) {
-                r += static_cast<float>(in_ptr_pt[c * in_stride_C]) *
-                    static_cast<float>(
-                         wt_ptr_pt[(c % C_per_group) * wt_stride_C]);
-              } // c
-            } // ww
-          } // wh
+                      for (int c = g * C_per_group; c < (g + 1) * C_per_group;
+                           ++c) {
+                        r += static_cast<float>(in_ptr_pt[c * in_stride_C]) *
+                            static_cast<float>(
+                                 wt_ptr_pt[(c % C_per_group) * wt_stride_C]);
+                      } // c
+                    } // ww
+                  } // wh

-          out_ptr[0] = static_cast<T>(r);
-          out_ptr += out_stride_O;
-          wt_ptr += wt_stride_O;
-        } // o
-      } // g
-    };
+                  out_ptr[0] = static_cast<T>(r);
+                  out_ptr += out_stride_O;
+                  wt_ptr += wt_stride_O;
+                } // o
+              } // g
+            };

-    int jump_h = flip ? -wt_dilation[0] : wt_dilation[0];
-    int jump_w = flip ? -wt_dilation[1] : wt_dilation[1];
+        int jump_h = flip ? -wt_dilation[0] : wt_dilation[0];
+        int jump_w = flip ? -wt_dilation[1] : wt_dilation[1];

-    int init_h = (flip ? (wH - 1) * wt_dilation[0] : 0);
-    int init_w = (flip ? (wW - 1) * wt_dilation[1] : 0);
+        int init_h = (flip ? (wH - 1) * wt_dilation[0] : 0);
+        int init_w = (flip ? (wW - 1) * wt_dilation[1] : 0);

-    int f_wgt_jump_h =
-        std::lcm(in_dilation[0], wt_dilation[0]) / wt_dilation[0];
-    int f_wgt_jump_w =
-        std::lcm(in_dilation[1], wt_dilation[1]) / wt_dilation[1];
+        int f_wgt_jump_h =
+            std::lcm(in_dilation[0], wt_dilation[0]) / wt_dilation[0];
+        int f_wgt_jump_w =
+            std::lcm(in_dilation[1], wt_dilation[1]) / wt_dilation[1];

-    int f_out_jump_h = std::lcm(in_dilation[0], wt_strides[0]) / wt_strides[0];
-    int f_out_jump_w = std::lcm(in_dilation[1], wt_strides[1]) / wt_strides[1];
+        int f_out_jump_h =
+            std::lcm(in_dilation[0], wt_strides[0]) / wt_strides[0];
+        int f_out_jump_w =
+            std::lcm(in_dilation[1], wt_strides[1]) / wt_strides[1];

-    std::vector<int> base_h(f_out_jump_h);
-    std::vector<int> base_w(f_out_jump_w);
+        std::vector<int> base_h(f_out_jump_h);
+        std::vector<int> base_w(f_out_jump_w);

-    for (int i = 0; i < f_out_jump_h; ++i) {
-      int ih_loop = i * wt_strides[0] - padding[0] + init_h;
+        for (int i = 0; i < f_out_jump_h; ++i) {
+          int ih_loop = i * wt_strides[0] - padding_lo[0] + init_h;

-      int wh_base = 0;
-      while (wh_base < wH && ih_loop % in_dilation[0] != 0) {
-        wh_base++;
-        ih_loop += jump_h;
-      }
+          int wh_base = 0;
+          while (wh_base < wH && ih_loop % in_dilation[0] != 0) {
+            wh_base++;
+            ih_loop += jump_h;
+          }

-      base_h[i] = wh_base;
-    }
+          base_h[i] = wh_base;
+        }

-    for (int j = 0; j < f_out_jump_w; ++j) {
-      int iw_loop = j * wt_strides[1] - padding[1] + init_w;
+        for (int j = 0; j < f_out_jump_w; ++j) {
+          int iw_loop = j * wt_strides[1] - padding_lo[1] + init_w;

-      int ww_base = 0;
-      while (ww_base < wW && iw_loop % in_dilation[1] != 0) {
-        ww_base++;
-        iw_loop += jump_w;
-      }
+          int ww_base = 0;
+          while (ww_base < wW && iw_loop % in_dilation[1] != 0) {
+            ww_base++;
+            iw_loop += jump_w;
+          }

-      base_w[j] = ww_base;
-    }
+          base_w[j] = ww_base;
+        }

-    auto pt_conv_all_checks =
-        [&](const T* in_ptr, const T* wt_ptr, T* out_ptr, int oh, int ow) {
-          out_ptr += oh * out_stride_H + ow * out_stride_W;
+        auto pt_conv_all_checks =
+            [&](const T* in_ptr, const T* wt_ptr, T* out_ptr, int oh, int ow) {
+              out_ptr += oh * out_stride_H + ow * out_stride_W;

-          int ih_base = oh * wt_strides[0] - padding[0];
-          int iw_base = ow * wt_strides[1] - padding[1];
+              int ih_base = oh * wt_strides[0] - padding_lo[0];
+              int iw_base = ow * wt_strides[1] - padding_lo[1];

-          int wh_base = base_h[oh % f_out_jump_h];
-          int ww_base = base_w[ow % f_out_jump_w];
+              int wh_base = base_h[oh % f_out_jump_h];
+              int ww_base = base_w[ow % f_out_jump_w];

-          for (int g = 0; g < groups; ++g) {
-            for (int o = g * O_per_group; o < (g + 1) * O_per_group; ++o) {
-              float r = 0.;
+              for (int g = 0; g < groups; ++g) {
+                for (int o = g * O_per_group; o < (g + 1) * O_per_group; ++o) {
+                  float r = 0.;

-              for (int wh = wh_base; wh < wH; wh += f_wgt_jump_h) {
-                for (int ww = ww_base; ww < wW; ww += f_wgt_jump_w) {
-                  int wh_flip = flip ? wH - wh - 1 : wh;
-                  int ww_flip = flip ? wW - ww - 1 : ww;
-                  int ih = ih_base + wh_flip * wt_dilation[0];
-                  int iw = iw_base + ww_flip * wt_dilation[1];
+                  for (int wh = wh_base; wh < wH; wh += f_wgt_jump_h) {
+                    for (int ww = ww_base; ww < wW; ww += f_wgt_jump_w) {
+                      int wh_flip = flip ? wH - wh - 1 : wh;
+                      int ww_flip = flip ? wW - ww - 1 : ww;
+                      int ih = ih_base + wh_flip * wt_dilation[0];
+                      int iw = iw_base + ww_flip * wt_dilation[1];

-                  if (ih >= 0 && ih < iH && iw >= 0 && iw < iW) {
-                    const T* wt_ptr_pt =
-                        wt_ptr + wh * wt_stride_H + ww * wt_stride_W;
+                      if (ih >= 0 && ih < iH && iw >= 0 && iw < iW) {
+                        const T* wt_ptr_pt =
+                            wt_ptr + wh * wt_stride_H + ww * wt_stride_W;

-                    int ih_dil = !is_idil_one ? (ih / in_dilation[0]) : ih;
-                    int iw_dil = !is_idil_one ? (iw / in_dilation[1]) : iw;
+                        int ih_dil = !is_idil_one ? (ih / in_dilation[0]) : ih;
+                        int iw_dil = !is_idil_one ? (iw / in_dilation[1]) : iw;

-                    const T* in_ptr_pt =
-                        in_ptr + ih_dil * in_stride_H + iw_dil * in_stride_W;
+                        const T* in_ptr_pt = in_ptr + ih_dil * in_stride_H +
+                            iw_dil * in_stride_W;

-                    for (int c = g * C_per_group; c < (g + 1) * C_per_group;
-                         ++c) {
-                      r += static_cast<float>(in_ptr_pt[c * in_stride_C]) *
-                          static_cast<float>(
-                               wt_ptr_pt[(c % C_per_group) * wt_stride_C]);
-                    } // c
+                        for (int c = g * C_per_group; c < (g + 1) * C_per_group;
+                             ++c) {
+                          r += static_cast<float>(in_ptr_pt[c * in_stride_C]) *
+                              static_cast<float>(
+                                   wt_ptr_pt[(c % C_per_group) * wt_stride_C]);
+                        } // c

-                  } // ih, iw check
-                } // ww
-              } // wh
+                      } // ih, iw check
+                    } // ww
+                  } // wh

-              out_ptr[0] = static_cast<T>(r);
-              out_ptr += out_stride_O;
-              wt_ptr += wt_stride_O;
-            } // o
-          } // g
-        };
+                  out_ptr[0] = static_cast<T>(r);
+                  out_ptr += out_stride_O;
+                  wt_ptr += wt_stride_O;
+                } // o
+              } // g
+            };

-    int oH_border_0 = 0;
-    int oH_border_1 =
-        is_idil_one ? ((padding[0] + wt_strides[0] - 1) / wt_strides[0]) : oH;
-    int oH_border_2 = std::max(
-        oH_border_1, (iH + padding[0] - wH * wt_dilation[0]) / wt_strides[0]);
-    int oH_border_3 = oH;
+        int oH_border_0 = 0;
+        int oH_border_1 = is_idil_one
+            ? ((padding_lo[0] + wt_strides[0] - 1) / wt_strides[0])
+            : oH;
+        int oH_border_2 = std::max(
+            oH_border_1,
+            (iH + padding_lo[0] - wH * wt_dilation[0]) / wt_strides[0]);
+        int oH_border_3 = oH;

-    int oW_border_0 = 0;
-    int oW_border_1 =
-        is_idil_one ? ((padding[1] + wt_strides[1] - 1) / wt_strides[1]) : oW;
-    int oW_border_2 = std::max(
-        oW_border_1, (iW + padding[1] - wW * wt_dilation[1]) / wt_strides[1]);
-    int oW_border_3 = oW;
+        int oW_border_0 = 0;
+        int oW_border_1 = is_idil_one
+            ? ((padding_lo[1] + wt_strides[1] - 1) / wt_strides[1])
+            : oW;
+        int oW_border_2 = std::max(
+            oW_border_1,
+            (iW + padding_lo[1] - wW * wt_dilation[1]) / wt_strides[1]);
+        int oW_border_3 = oW;

-    for (int n = 0; n < N; ++n) {
-      // Case 1: oh might put us out of bounds
-      for (int oh = oH_border_0; oh < oH_border_1; ++oh) {
-        for (int ow = 0; ow < oW; ++ow) {
-          pt_conv_all_checks(st_in_ptr, st_wt_ptr, st_out_ptr, oh, ow);
-        } // ow
-      } // oh
+        for (int n = 0; n < N; ++n) {
+          // Case 1: oh might put us out of bounds
+          for (int oh = oH_border_0; oh < oH_border_1; ++oh) {
+            for (int ow = 0; ow < oW; ++ow) {
+              pt_conv_all_checks(st_in_ptr, st_wt_ptr, st_out_ptr, oh, ow);
+            } // ow
+          } // oh

-      // Case 2: oh in bounds
-      for (int oh = oH_border_1; oh < oH_border_2; ++oh) {
-        // Case a: ow might put us out of bounds
-        for (int ow = oW_border_0; ow < oW_border_1; ++ow) {
-          pt_conv_all_checks(st_in_ptr, st_wt_ptr, st_out_ptr, oh, ow);
-        } // ow
+          // Case 2: oh in bounds
+          for (int oh = oH_border_1; oh < oH_border_2; ++oh) {
+            // Case a: ow might put us out of bounds
+            for (int ow = oW_border_0; ow < oW_border_1; ++ow) {
+              pt_conv_all_checks(st_in_ptr, st_wt_ptr, st_out_ptr, oh, ow);
+            } // ow

-        // Case b: ow in bounds
-        for (int ow = oW_border_1; ow < oW_border_2; ++ow) {
-          pt_conv_no_checks(st_in_ptr, st_wt_ptr, st_out_ptr, oh, ow);
-        } // ow
+            // Case b: ow in bounds
+            for (int ow = oW_border_1; ow < oW_border_2; ++ow) {
+              pt_conv_no_checks(st_in_ptr, st_wt_ptr, st_out_ptr, oh, ow);
+            } // ow

-        // Case c: ow might put us out of bounds
-        for (int ow = oW_border_2; ow < oW_border_3; ++ow) {
-          pt_conv_all_checks(st_in_ptr, st_wt_ptr, st_out_ptr, oh, ow);
-        } // ow
+            // Case c: ow might put us out of bounds
+            for (int ow = oW_border_2; ow < oW_border_3; ++ow) {
+              pt_conv_all_checks(st_in_ptr, st_wt_ptr, st_out_ptr, oh, ow);
+            } // ow

-      } // oh
+          } // oh

-      // Case 3: oh might put us out of bounds
-      for (int oh = oH_border_2; oh < oH_border_3; ++oh) {
-        for (int ow = 0; ow < oW; ++ow) {
-          pt_conv_all_checks(st_in_ptr, st_wt_ptr, st_out_ptr, oh, ow);
-        } // ow
-      } // oh
+          // Case 3: oh might put us out of bounds
+          for (int oh = oH_border_2; oh < oH_border_3; ++oh) {
+            for (int ow = 0; ow < oW; ++ow) {
+              pt_conv_all_checks(st_in_ptr, st_wt_ptr, st_out_ptr, oh, ow);
+            } // ow
+          } // oh

-      st_in_ptr += in_stride_N;
-      st_out_ptr += out_stride_N;
+          st_in_ptr += in_stride_N;
+          st_out_ptr += out_stride_N;

-    } // n
-  });
+        } // n
+      });
 }

 template <typename T>
@@ -351,7 +359,8 @@ void slow_conv_3D(
    const array& in,
    const array& wt,
    array out,
-    const std::vector<int>& padding,
+    const std::vector<int>& padding_lo,
+    const std::vector<int>& padding_hi,
    const std::vector<int>& wt_strides,
    const std::vector<int>& wt_dilation,
    const std::vector<int>& in_dilation,
@@ -400,7 +409,8 @@ void slow_conv_3D(
                    out_stride_H = out.strides()[2],
                    out_stride_W = out.strides()[3],
                    out_stride_O = out.strides()[4],
-                    padding,
+                    padding_lo,
+                    padding_hi,
                    wt_strides,
                    wt_dilation,
                    in_dilation,
@@ -415,9 +425,9 @@ void slow_conv_3D(
                                 int oh,
                                 int ow) {
      out_ptr += od * out_stride_D + oh * out_stride_H + ow * out_stride_W;
-      int id_base = od * wt_strides[0] - padding[0];
-      int ih_base = oh * wt_strides[1] - padding[1];
-      int iw_base = ow * wt_strides[2] - padding[2];
+      int id_base = od * wt_strides[0] - padding_lo[0];
+      int ih_base = oh * wt_strides[1] - padding_lo[1];
+      int iw_base = ow * wt_strides[2] - padding_lo[2];

      for (int o = 0; o < O; ++o) {
        float r = 0.;
@@ -478,7 +488,7 @@ void slow_conv_3D(
    std::vector<int> base_w(f_out_jump_w);

    for (int i = 0; i < f_out_jump_d; ++i) {
-      int id_loop = i * wt_strides[0] - padding[0] + init_d;
+      int id_loop = i * wt_strides[0] - padding_lo[0] + init_d;

      int wd_base = 0;
      while (wd_base < wD && id_loop % in_dilation[0] != 0) {
@@ -490,7 +500,7 @@ void slow_conv_3D(
    }

    for (int i = 0; i < f_out_jump_h; ++i) {
-      int ih_loop = i * wt_strides[1] - padding[1] + init_h;
+      int ih_loop = i * wt_strides[1] - padding_lo[1] + init_h;

      int wh_base = 0;
      while (wh_base < wH && ih_loop % in_dilation[1] != 0) {
@@ -502,7 +512,7 @@ void slow_conv_3D(
    }

    for (int j = 0; j < f_out_jump_w; ++j) {
-      int iw_loop = j * wt_strides[2] - padding[2] + init_w;
+      int iw_loop = j * wt_strides[2] - padding_lo[2] + init_w;

      int ww_base = 0;
      while (ww_base < wW && iw_loop % in_dilation[2] != 0) {
@@ -521,9 +531,9 @@ void slow_conv_3D(
                                  int ow) {
      out_ptr += od * out_stride_D + oh * out_stride_H + ow * out_stride_W;

-      int id_base = od * wt_strides[0] - padding[0];
-      int ih_base = oh * wt_strides[1] - padding[1];
-      int iw_base = ow * wt_strides[2] - padding[2];
+      int id_base = od * wt_strides[0] - padding_lo[0];
+      int ih_base = oh * wt_strides[1] - padding_lo[1];
+      int iw_base = ow * wt_strides[2] - padding_lo[2];

      int wd_base = base_d[od % f_out_jump_d];
      int wh_base = base_h[oh % f_out_jump_h];
@@ -573,24 +583,30 @@ void slow_conv_3D(
    };

    int oD_border_0 = 0;
-    int oD_border_1 =
-        is_idil_one ? ((padding[0] + wt_strides[0] - 1) / wt_strides[0]) : oD;
+    int oD_border_1 = is_idil_one
+        ? ((padding_lo[0] + wt_strides[0] - 1) / wt_strides[0])
+        : oD;
    int oD_border_2 = std::max(
-        oD_border_1, (iD + padding[0] - wD * wt_dilation[0]) / wt_strides[0]);
+        oD_border_1,
+        (iD + padding_lo[0] - wD * wt_dilation[0]) / wt_strides[0]);
    int oD_border_3 = oD;

    int oH_border_0 = 0;
-    int oH_border_1 =
-        is_idil_one ? ((padding[1] + wt_strides[1] - 1) / wt_strides[1]) : oH;
+    int oH_border_1 = is_idil_one
+        ? ((padding_lo[1] + wt_strides[1] - 1) / wt_strides[1])
+        : oH;
    int oH_border_2 = std::max(
-        oH_border_1, (iH + padding[1] - wH * wt_dilation[1]) / wt_strides[1]);
+        oH_border_1,
+        (iH + padding_lo[1] - wH * wt_dilation[1]) / wt_strides[1]);
    int oH_border_3 = oH;

    int oW_border_0 = 0;
-    int oW_border_1 =
-        is_idil_one ? ((padding[2] + wt_strides[2] - 1) / wt_strides[2]) : oW;
+    int oW_border_1 = is_idil_one
+        ? ((padding_lo[2] + wt_strides[2] - 1) / wt_strides[2])
+        : oW;
    int oW_border_2 = std::max(
-        oW_border_1, (iW + padding[2] - wW * wt_dilation[2]) / wt_strides[2]);
+        oW_border_1,
+        (iW + padding_lo[2] - wW * wt_dilation[2]) / wt_strides[2]);
    int oW_border_3 = oW;

    for (int n = 0; n < N; ++n) {
@@ -658,7 +674,8 @@ void dispatch_slow_conv_1D(
    const array& in,
    const array& wt,
    array out,
-    const std::vector<int>& padding,
+    const std::vector<int>& padding_lo,
+    const std::vector<int>& padding_hi,
    const std::vector<int>& wt_strides,
    const std::vector<int>& wt_dilation,
    const std::vector<int>& in_dilation,
@@ -669,7 +686,8 @@ void dispatch_slow_conv_1D(
        in,
        wt,
        out,
-        padding,
+        padding_lo,
+        padding_hi,
        wt_strides,
        wt_dilation,
        in_dilation,
@@ -680,7 +698,8 @@ void dispatch_slow_conv_1D(
        in,
        wt,
        out,
-        padding,
+        padding_lo,
+        padding_hi,
        wt_strides,
        wt_dilation,
        in_dilation,
@@ -691,7 +710,8 @@ void dispatch_slow_conv_1D(
        in,
        wt,
        out,
-        padding,
+        padding_lo,
+        padding_hi,
        wt_strides,
        wt_dilation,
        in_dilation,
@@ -707,7 +727,8 @@ void dispatch_slow_conv_2D(
    const array& in,
    const array& wt,
    array out,
-    const std::vector<int>& padding,
+    const std::vector<int>& padding_lo,
+    const std::vector<int>& padding_hi,
    const std::vector<int>& wt_strides,
    const std::vector<int>& wt_dilation,
    const std::vector<int>& in_dilation,
@@ -718,7 +739,8 @@ void dispatch_slow_conv_2D(
        in,
        wt,
        out,
-        padding,
+        padding_lo,
+        padding_hi,
        wt_strides,
        wt_dilation,
        in_dilation,
@@ -729,7 +751,8 @@ void dispatch_slow_conv_2D(
        in,
        wt,
        out,
-        padding,
+        padding_lo,
+        padding_hi,
        wt_strides,
        wt_dilation,
        in_dilation,
@@ -740,7 +763,8 @@ void dispatch_slow_conv_2D(
        in,
        wt,
        out,
-        padding,
+        padding_lo,
+        padding_hi,
        wt_strides,
        wt_dilation,
        in_dilation,
@@ -756,7 +780,8 @@ void dispatch_slow_conv_3D(
    const array& in,
    const array& wt,
    array out,
-    const std::vector<int>& padding,
+    const std::vector<int>& padding_lo,
+    const std::vector<int>& padding_hi,
    const std::vector<int>& wt_strides,
    const std::vector<int>& wt_dilation,
    const std::vector<int>& in_dilation,
@@ -767,7 +792,8 @@ void dispatch_slow_conv_3D(
        in,
        wt,
        out,
-        padding,
+        padding_lo,
+        padding_hi,
        wt_strides,
        wt_dilation,
        in_dilation,
@@ -778,7 +804,8 @@ void dispatch_slow_conv_3D(
        in,
        wt,
        out,
-        padding,
+        padding_lo,
+        padding_hi,
        wt_strides,
        wt_dilation,
        in_dilation,
@@ -789,7 +816,8 @@ void dispatch_slow_conv_3D(
        in,
        wt,
        out,
-        padding,
+        padding_lo,
+        padding_hi,
        wt_strides,
        wt_dilation,
        in_dilation,
@@ -829,7 +857,8 @@ void explicit_gemm_conv_1D_cpu(
    const array& in,
    const array& wt,
    array out,
-    const std::vector<int>& padding,
+    const std::vector<int>& padding_lo,
+    const std::vector<int>& padding_hi,
    const std::vector<int>& wt_strides,
    const std::vector<int>& wt_dilation,
    Stream stream) {
@@ -848,7 +877,7 @@ void explicit_gemm_conv_1D_cpu(
  auto& encoder = cpu::get_command_encoder(stream);

  // Pad input
-  Shape padded_shape = {N, iH + 2 * padding[0], C};
+  Shape padded_shape = {N, iH + padding_lo[0] + padding_hi[0], C};
  array in_padded(padded_shape, conv_dtype, nullptr, {});

  // Fill with zeros
@@ -857,7 +886,7 @@ void explicit_gemm_conv_1D_cpu(
  copy(temps.back(), in_padded, CopyType::Scalar, stream);

  // Pick input slice from padded
-  size_t data_offset = padding[0] * in_padded.strides()[1];
+  size_t data_offset = padding_lo[0] * in_padded.strides()[1];
  array in_padded_slice(in.shape(), in_padded.dtype(), nullptr, {});
  in_padded_slice.copy_shared_buffer(
      in_padded,
@@ -971,7 +1000,8 @@ void explicit_gemm_conv_2D_cpu(
    const array& in,
    const array& wt,
    array out,
-    const std::vector<int>& padding,
+    const std::vector<int>& padding_lo,
+    const std::vector<int>& padding_hi,
    const std::vector<int>& wt_strides,
    const std::vector<int>& wt_dilation,
    Stream stream) {
@@ -989,7 +1019,11 @@ void explicit_gemm_conv_2D_cpu(
  auto& encoder = cpu::get_command_encoder(stream);

  // Pad input
-  Shape padded_shape = {N, iH + 2 * padding[0], iW + 2 * padding[1], C};
+  Shape padded_shape = {
+      N,
+      iH + padding_lo[0] + padding_hi[0],
+      iW + padding_lo[1] + padding_hi[1],
+      C};
  array in_padded(padded_shape, conv_dtype, nullptr, {});

  // Fill with zeros
@@ -998,8 +1032,8 @@ void explicit_gemm_conv_2D_cpu(
  copy(temps.back(), in_padded, CopyType::Scalar, stream);

  // Pick input slice from padded
-  size_t data_offset =
-      padding[0] * in_padded.strides()[1] + padding[1] * in_padded.strides()[2];
+  size_t data_offset = padding_lo[0] * in_padded.strides()[1] +
+      padding_lo[1] * in_padded.strides()[2];
  array in_padded_slice(in.shape(), in_padded.dtype(), nullptr, {});
  in_padded_slice.copy_shared_buffer(
      in_padded,
@@ -1091,7 +1125,8 @@ void explicit_gemm_conv_ND_cpu(
    const array& in,
    const array& wt,
    array out,
-    const std::vector<int>& padding,
+    const std::vector<int>& padding_lo,
+    const std::vector<int>& padding_hi,
    const std::vector<int>& wt_strides,
    const std::vector<int>& wt_dilation,
    const bool flip,
@@ -1114,7 +1149,7 @@ void explicit_gemm_conv_ND_cpu(
  Shape padded_shape(in.shape().size());
  padded_shape.front() = N;
  for (size_t i = 0; i < iDim.size(); i++) {
-    padded_shape[i + 1] = iDim[i] + 2 * padding[i];
+    padded_shape[i + 1] = iDim[i] + padding_lo[i] + padding_hi[i];
  }
  padded_shape.back() = C;
  array in_padded(padded_shape, conv_dtype, nullptr, {});
@@ -1125,9 +1160,10 @@ void explicit_gemm_conv_ND_cpu(

  // Pick input slice from padded
  size_t data_offset = 0;
-  for (size_t i = 0; i < padding.size(); i++) {
-    data_offset += padding[i] * in_padded.strides()[i + 1];
+  for (size_t i = 0; i < padding_lo.size(); i++) {
+    data_offset += padding_lo[i] * in_padded.strides()[i + 1];
  }
+
  array in_padded_slice(in.shape(), in_padded.dtype(), nullptr, {});
  in_padded_slice.copy_shared_buffer(
      in_padded,
@@ -1261,7 +1297,8 @@ void conv_1D_cpu(
    const array& in,
    const array& wt,
    array out,
-    const std::vector<int>& padding,
+    const std::vector<int>& padding_lo,
+    const std::vector<int>& padding_hi,
    const std::vector<int>& wt_strides,
    const std::vector<int>& wt_dilation,
    const std::vector<int>& in_dilation,
@@ -1270,22 +1307,40 @@ void conv_1D_cpu(
  const int groups = in.shape().back() / wt.shape().back();
  if (wt_dilation[0] == 1 && in_dilation[0] == 1 && !flip) {
    return explicit_gemm_conv_1D_cpu(
-        in, wt, out, padding, wt_strides, wt_dilation, stream);
+        in, wt, out, padding_lo, padding_hi, wt_strides, wt_dilation, stream);
  }
  if (wt_dilation[0] == 1 && in_dilation[0] == 1 && groups == 1) {
    return explicit_gemm_conv_ND_cpu(
-        in, wt, out, padding, wt_strides, wt_dilation, flip, stream);
+        in,
+        wt,
+        out,
+        padding_lo,
+        padding_hi,
+        wt_strides,
+        wt_dilation,
+        flip,
+        stream);
  }

  return dispatch_slow_conv_1D(
-      in, wt, out, padding, wt_strides, wt_dilation, in_dilation, flip, stream);
+      in,
+      wt,
+      out,
+      padding_lo,
+      padding_hi,
+      wt_strides,
+      wt_dilation,
+      in_dilation,
+      flip,
+      stream);
 }

 void conv_2D_cpu(
    const array& in,
    const array& wt,
    array out,
-    const std::vector<int>& padding,
+    const std::vector<int>& padding_lo,
+    const std::vector<int>& padding_hi,
    const std::vector<int>& wt_strides,
    const std::vector<int>& wt_dilation,
    const std::vector<int>& in_dilation,
@@ -1295,18 +1350,35 @@ void conv_2D_cpu(
  if (wt_dilation[0] == 1 && wt_dilation[1] == 1 && in_dilation[0] == 1 &&
      in_dilation[1] == 1 && groups == 1) {
    return explicit_gemm_conv_ND_cpu(
-        in, wt, out, padding, wt_strides, wt_dilation, flip, stream);
+        in,
+        wt,
+        out,
+        padding_lo,
+        padding_hi,
+        wt_strides,
+        wt_dilation,
+        flip,
+        stream);
  }
-
  return dispatch_slow_conv_2D(
-      in, wt, out, padding, wt_strides, wt_dilation, in_dilation, flip, stream);
+      in,
+      wt,
+      out,
+      padding_lo,
+      padding_hi,
+      wt_strides,
+      wt_dilation,
+      in_dilation,
+      flip,
+      stream);
 }

 void conv_3D_cpu(
    const array& in,
    const array& wt,
    array out,
-    const std::vector<int>& padding,
+    const std::vector<int>& padding_lo,
+    const std::vector<int>& padding_hi,
    const std::vector<int>& wt_strides,
    const std::vector<int>& wt_dilation,
    const std::vector<int>& in_dilation,
@@ -1317,11 +1389,28 @@ void conv_3D_cpu(
      in_dilation[0] == 1 && in_dilation[1] == 1 && in_dilation[2] == 1 &&
      groups == 1) {
    return explicit_gemm_conv_ND_cpu(
-        in, wt, out, padding, wt_strides, wt_dilation, flip, stream);
+        in,
+        wt,
+        out,
+        padding_lo,
+        padding_hi,
+        wt_strides,
+        wt_dilation,
+        flip,
+        stream);
  }

  return dispatch_slow_conv_3D(
-      in, wt, out, padding, wt_strides, wt_dilation, in_dilation, flip, stream);
+      in,
+      wt,
+      out,
+      padding_lo,
+      padding_hi,
+      wt_strides,
+      wt_dilation,
+      in_dilation,
+      flip,
+      stream);
 }

 } // namespace
@@ -1338,7 +1427,8 @@ void Convolution::eval_cpu(const std::vector<array>& inputs, array& out) {
        in,
        wt,
        out,
-        padding_,
+        padding_lo_,
+        padding_hi_,
        kernel_strides_,
        kernel_dilation_,
        input_dilation_,
@@ -1351,7 +1441,8 @@ void Convolution::eval_cpu(const std::vector<array>& inputs, array& out) {
        in,
        wt,
        out,
-        padding_,
+        padding_lo_,
+        padding_hi_,
        kernel_strides_,
        kernel_dilation_,
        input_dilation_,
@@ -1364,7 +1455,8 @@ void Convolution::eval_cpu(const std::vector<array>& inputs, array& out) {
        in,
        wt,
        out,
-        padding_,
+        padding_lo_,
+        padding_hi_,
        kernel_strides_,
        kernel_dilation_,
        input_dilation_,
--- a/mlx/backend/cpu/eig.cpp
+++ b/mlx/backend/cpu/eig.cpp
@@ -0,0 +1,174 @@
+// Copyright © 2025 Apple Inc.
+
+#include "mlx/allocator.h"
+#include "mlx/array.h"
+#include "mlx/backend/cpu/copy.h"
+#include "mlx/backend/cpu/encoder.h"
+#include "mlx/backend/cpu/lapack.h"
+#include "mlx/linalg.h"
+#include "mlx/primitives.h"
+
+namespace mlx::core {
+
+namespace {
+
+// Computes the eigenvalues (and optionally eigenvectors) of a batch of real
+// N x N matrices with LAPACK geev and writes them out as complex numbers.
+//
+// Args:
+//   a: Input matrices. The buffer is handed to geev as its (non-const) matrix
+//     argument and is registered as a temporary with the encoder afterwards.
+//   vectors: Complex eigenvector output. Only written when
+//     compute_eigenvectors is true; its shape is always used to derive N and
+//     the number of matrices.
+//   values: Complex eigenvalue output, N entries per matrix.
+//   compute_eigenvectors: Whether eigenvectors are requested.
+//   stream: Stream whose CPU command encoder runs the work.
+//
+// The dispatched task throws std::runtime_error as soon as geev reports a
+// non-zero info for a matrix.
+template <typename T>
+void eig_impl(
+    array& a,
+    array& vectors,
+    array& values,
+    bool compute_eigenvectors,
+    Stream stream) {
+  using OT = std::complex<T>;
+  auto a_ptr = a.data<T>();
+  auto eig_ptr = values.data<OT>();
+
+  auto& encoder = cpu::get_command_encoder(stream);
+  encoder.set_input_array(a);
+  encoder.set_output_array(values);
+  OT* vec_ptr = nullptr;
+  if (compute_eigenvectors) {
+    encoder.set_output_array(vectors);
+    vec_ptr = vectors.data<OT>();
+  }
+  encoder.dispatch([a_ptr,
+                    vec_ptr,
+                    eig_ptr,
+                    compute_eigenvectors,
+                    N = vectors.shape(-1),
+                    size = vectors.size()]() mutable {
+    // Nothing to decompose for empty matrices. This also avoids dividing by
+    // N * N == 0 when counting the matrices below.
+    if (N == 0) {
+      return;
+    }
+
+    // Work query
+    char jobr = 'N';
+    char jobl = compute_eigenvectors ? 'V' : 'N';
+    int n_vecs_r = 1;
+    int n_vecs_l = compute_eigenvectors ? N : 1;
+    int lwork = -1;
+    int info;
+    {
+      // With lwork == -1 geev only reports the optimal workspace size in
+      // work.
+      T work;
+      geev<T>(
+          &jobl,
+          &jobr,
+          &N,
+          nullptr,
+          &N,
+          nullptr,
+          nullptr,
+          nullptr,
+          &n_vecs_l,
+          nullptr,
+          &n_vecs_r,
+          &work,
+          &lwork,
+          &info);
+      lwork = static_cast<int>(work);
+    }
+
+    // geev returns real and imaginary parts separately: eig_tmp holds the N
+    // real parts followed by the N imaginary parts, vec_tmp holds the real
+    // eigenvector data.
+    auto eig_tmp_data = array::Data{allocator::malloc(sizeof(T) * N * 2)};
+    auto vec_tmp_data =
+        array::Data{allocator::malloc(vec_ptr ? sizeof(T) * N * N * 2 : 0)};
+    auto eig_tmp = static_cast<T*>(eig_tmp_data.buffer.raw_ptr());
+    auto vec_tmp = static_cast<T*>(vec_tmp_data.buffer.raw_ptr());
+    auto work_buf = array::Data{allocator::malloc(sizeof(T) * lwork)};
+
+    // Count the matrices in 64-bit so that N * N cannot overflow an int.
+    const size_t num_matrices = size / (static_cast<size_t>(N) * N);
+    for (size_t m = 0; m < num_matrices; ++m) {
+      geev<T>(
+          &jobl,
+          &jobr,
+          &N,
+          a_ptr,
+          &N,
+          eig_tmp,
+          eig_tmp + N,
+          vec_tmp,
+          &n_vecs_l,
+          nullptr,
+          &n_vecs_r,
+          static_cast<T*>(work_buf.buffer.raw_ptr()),
+          &lwork,
+          &info);
+      // Check the status before touching the results so that the output of a
+      // failed decomposition is never unpacked.
+      if (info != 0) {
+        std::stringstream msg;
+        msg << "[Eig::eval_cpu] Eigenvalue decomposition failed with error code "
+            << info;
+        throw std::runtime_error(msg.str());
+      }
+      for (int k = 0; k < N; ++k) {
+        eig_ptr[k] = {eig_tmp[k], eig_tmp[N + k]};
+      }
+      if (vec_ptr) {
+        for (int k = 0; k < N; ++k) {
+          if (eig_ptr[k].imag() != 0) {
+            // This vector and the next are a pair: geev stores the shared
+            // real part in slot k and the imaginary part in slot k + 1.
+            // NOTE(review): the imaginary signs are the opposite of LAPACK's
+            // convention for left eigenvectors, presumably because the
+            // row-major input is seen by LAPACK as its transpose - confirm.
+            for (int j = 0; j < N; ++j) {
+              vec_ptr[k * N + j] = {
+                  vec_tmp[k * N + j], -vec_tmp[(k + 1) * N + j]};
+              vec_ptr[(k + 1) * N + j] = {
+                  vec_tmp[k * N + j], vec_tmp[(k + 1) * N + j]};
+            }
+            // Both members of the pair have been written.
+            k += 1;
+          } else {
+            for (int j = 0; j < N; ++j) {
+              vec_ptr[k * N + j] = {vec_tmp[k * N + j], 0};
+            }
+          }
+        }
+        vec_ptr += N * N;
+      }
+      a_ptr += N * N;
+      eig_ptr += N;
+    }
+  });
+  encoder.add_temporary(a);
+}
+
+} // namespace
+
+// CPU evaluation of the Eig primitive: takes a scratch copy of the input,
+// allocates the outputs and hands off to eig_impl. Only float32 inputs are
+// supported; any other dtype raises std::runtime_error.
+void Eig::eval_cpu(
+    const std::vector<array>& inputs,
+    std::vector<array>& outputs) {
+  const auto& src = inputs[0];
+  auto& eigvals = outputs[0];
+
+  // When eigenvectors are not requested a data-less placeholder array stands
+  // in for them.
+  auto eigvecs = compute_eigenvectors_
+      ? outputs[1]
+      : array(src.shape(), complex64, nullptr, {});
+
+  // eig_impl is given a copy rather than the input itself.
+  auto scratch = array(src.shape(), src.dtype(), nullptr, {});
+  const auto copy_type =
+      src.flags().row_contiguous ? CopyType::Vector : CopyType::General;
+  copy(src, scratch, copy_type, stream());
+
+  eigvals.set_data(allocator::malloc(eigvals.nbytes()));
+
+  if (compute_eigenvectors_) {
+    // Swap the strides of the last two axes so that the eigenvectors end up
+    // in the columns of the output, and update the contiguity flags to match.
+    auto out_flags = eigvecs.flags();
+    auto out_strides = eigvecs.strides();
+    const auto ndim = src.ndim();
+    std::swap(out_strides[ndim - 1], out_strides[ndim - 2]);
+
+    if (src.size() > 1) {
+      out_flags.row_contiguous = false;
+      out_flags.col_contiguous = !(ndim > 2);
+    }
+    eigvecs.set_data(
+        allocator::malloc(eigvecs.nbytes()),
+        eigvecs.size(),
+        out_strides,
+        out_flags);
+  }
+
+  switch (src.dtype()) {
+    case float32:
+      eig_impl<float>(
+          scratch, eigvecs, eigvals, compute_eigenvectors_, stream());
+      break;
+    default:
+      throw std::runtime_error("[Eig::eval_cpu] only supports float32.");
+  }
+}
+
+} // namespace mlx::core
--- a/mlx/backend/cpu/eigh.cpp
+++ b/mlx/backend/cpu/eigh.cpp
@@ -12,6 +12,133 @@ namespace mlx::core {

 namespace {

+// Workspace holder for the LAPACK eigensolvers used by eigh_impl, selected on
+// the element type T. The primary template is empty; the specializations
+// provide the implementation.
+template <typename T, class Enable = void>
+struct EighWork {};
+
+// Specialization for real floating point types, backed by syevd.
+//
+// The constructor runs a workspace-size query and allocates the work buffers
+// once; run() can then be called repeatedly for matrices of order N.
+template <typename T>
+struct EighWork<
+    T,
+    typename std::enable_if<std::is_floating_point<T>::value>::type> {
+  // Type of the eigenvalues (same as the element type in the real case).
+  using R = T;
+
+  // jobz and uplo are passed through to syevd unchanged.
+  char jobz;
+  char uplo;
+  // Order of the matrices; also passed to syevd as the leading dimension.
+  int N;
+  // Number of elements in the T and int work buffers respectively.
+  int lwork;
+  int liwork;
+  // Status written by the most recent syevd call; not checked in this struct.
+  int info;
+  // buffers[0] is the T workspace, buffers[1] the int workspace.
+  std::vector<array::Data> buffers;
+
+  EighWork(char jobz_, char uplo_, int N_)
+      : jobz(jobz_), uplo(uplo_), N(N_), lwork(-1), liwork(-1) {
+    // Workspace query: with lwork == liwork == -1 and null matrix pointers,
+    // syevd reports the required sizes through work and iwork.
+    T work;
+    int iwork;
+    syevd<T>(
+        &jobz,
+        &uplo,
+        &N,
+        nullptr,
+        &N,
+        nullptr,
+        &work,
+        &lwork,
+        &iwork,
+        &liwork,
+        &info);
+    lwork = static_cast<int>(work);
+    liwork = iwork;
+    buffers.emplace_back(allocator::malloc(sizeof(T) * lwork));
+    buffers.emplace_back(allocator::malloc(sizeof(int) * liwork));
+  }
+
+  // Runs syevd on one N x N matrix at `vectors`, writing the eigenvalues to
+  // `values`. The result status is left in `info` for the caller to inspect.
+  // NOTE(review): per LAPACK, `vectors` is overwritten in place (with the
+  // eigenvectors when jobz == 'V') - confirm against the syevd documentation.
+  void run(T* vectors, T* values) {
+    syevd<T>(
+        &jobz,
+        &uplo,
+        &N,
+        vectors,
+        &N,
+        values,
+        static_cast<T*>(buffers[0].buffer.raw_ptr()),
+        &lwork,
+        static_cast<int*>(buffers[1].buffer.raw_ptr()),
+        &liwork,
+        &info);
+  }
+};
+
+// Specialization for std::complex<float>, backed by heevd.
+//
+// Compared to the real case heevd needs an additional real workspace, and the
+// eigenvalues are real (R) while the matrix entries are complex (T).
+template <>
+struct EighWork<std::complex<float>> {
+  using T = std::complex<float>;
+  using R = float;
+
+  // jobz and uplo are passed through to heevd unchanged.
+  char jobz;
+  char uplo;
+  // Order of the matrices; also passed to heevd as the leading dimension.
+  int N;
+  // Number of elements in the T, R and int work buffers respectively.
+  int lwork;
+  int lrwork;
+  int liwork;
+  // Status written by the most recent heevd call; not checked in this struct.
+  int info;
+  // buffers[0] is the T workspace, buffers[1] the R workspace and buffers[2]
+  // the int workspace.
+  std::vector<array::Data> buffers;
+
+  EighWork(char jobz_, char uplo_, int N_)
+      : jobz(jobz_), uplo(uplo_), N(N_), lwork(-1), lrwork(-1), liwork(-1) {
+    // Workspace query: with all work sizes set to -1 and null matrix
+    // pointers, heevd reports the required sizes through work, rwork and
+    // iwork.
+    T work;
+    R rwork;
+    int iwork;
+    heevd<T>(
+        &jobz,
+        &uplo,
+        &N,
+        nullptr,
+        &N,
+        nullptr,
+        &work,
+        &lwork,
+        &rwork,
+        &lrwork,
+        &iwork,
+        &liwork,
+        &info);
+    lwork = static_cast<int>(work.real());
+    lrwork = static_cast<int>(rwork);
+    liwork = iwork;
+    buffers.emplace_back(allocator::malloc(sizeof(T) * lwork));
+    buffers.emplace_back(allocator::malloc(sizeof(R) * lrwork));
+    buffers.emplace_back(allocator::malloc(sizeof(int) * liwork));
+  }
+
+  // Runs heevd on one N x N matrix at `vectors`, writing the real eigenvalues
+  // to `values`. The result status is left in `info` for the caller to
+  // inspect; the conjugation below runs regardless of that status.
+  void run(T* vectors, R* values) {
+    heevd<T>(
+        &jobz,
+        &uplo,
+        &N,
+        vectors,
+        &N,
+        values,
+        static_cast<T*>(buffers[0].buffer.raw_ptr()),
+        &lwork,
+        static_cast<R*>(buffers[1].buffer.raw_ptr()),
+        &lrwork,
+        static_cast<int*>(buffers[2].buffer.raw_ptr()),
+        &liwork,
+        &info);
+    if (jobz == 'V') {
+      // We have pre-transposed the vectors but we also must conjugate them
+      // when they are complex.
+      //
+      // We could vectorize this but it is so fast in comparison to heevd that
+      // it doesn't really matter.
+      for (int i = 0; i < N; i++) {
+        for (int j = 0; j < N; j++) {
+          // Conjugate all N * N entries in place, advancing the pointer.
+          *vectors = std::conj(*vectors);
+          vectors++;
+        }
+      }
+    }
+  }
+};
+
 template <typename T>
 void eigh_impl(
    array& vectors,
@@ -19,8 +146,10 @@ void eigh_impl(
    const std::string& uplo,
    bool compute_eigenvectors,
    Stream stream) {
+  using R = typename EighWork<T>::R;
+
  auto vec_ptr = vectors.data<T>();
-  auto eig_ptr = values.data<T>();
+  auto eig_ptr = values.data<R>();
  char jobz = compute_eigenvectors ? 'V' : 'N';

  auto& encoder = cpu::get_command_encoder(stream);
@@ -33,49 +162,17 @@ void eigh_impl(
                    N = vectors.shape(-1),
                    size = vectors.size()]() mutable {
    // Work query
-    int lwork = -1;
-    int liwork = -1;
-    int info;
-    {
-      T work;
-      int iwork;
-      syevd<T>(
-          &jobz,
-          &uplo,
-          &N,
-          nullptr,
-          &N,
-          nullptr,
-          &work,
-          &lwork,
-          &iwork,
-          &liwork,
-          &info);
-      lwork = static_cast<int>(work);
-      liwork = iwork;
-    }
+    EighWork<T> work(jobz, uplo, N);

-    auto work_buf = array::Data{allocator::malloc(sizeof(T) * lwork)};
-    auto iwork_buf = array::Data{allocator::malloc(sizeof(int) * liwork)};
+    // Work loop
    for (size_t i = 0; i < size / (N * N); ++i) {
-      syevd<T>(
-          &jobz,
-          &uplo,
-          &N,
-          vec_ptr,
-          &N,
-          eig_ptr,
-          static_cast<T*>(work_buf.buffer.raw_ptr()),
-          &lwork,
-          static_cast<int*>(iwork_buf.buffer.raw_ptr()),
-          &liwork,
-          &info);
+      work.run(vec_ptr, eig_ptr);
      vec_ptr += N * N;
      eig_ptr += N;
-      if (info != 0) {
+      if (work.info != 0) {
        std::stringstream msg;
        msg << "[Eigh::eval_cpu] Eigenvalue decomposition failed with error code "
-            << info;
+            << work.info;
        throw std::runtime_error(msg.str());
      }
    }
@@ -131,6 +228,10 @@ void Eigh::eval_cpu(
      eigh_impl<double>(
          vectors, values, uplo_, compute_eigenvectors_, stream());
      break;
+    case complex64:
+      eigh_impl<std::complex<float>>(
+          vectors, values, uplo_, compute_eigenvectors_, stream());
+      break;
    default:
      throw std::runtime_error(
          "[Eigh::eval_cpu] only supports float32 or float64.");
--- a/mlx/backend/cpu/indexing.cpp
+++ b/mlx/backend/cpu/indexing.cpp
@@ -257,15 +257,11 @@ void gather_axis(
    const array& ind,
    array& out,
    const int axis) {
-  auto strides = ind.strides();
-  strides.erase(strides.begin() + axis);
-  auto shape = ind.shape();
-  shape.erase(shape.begin() + axis);
-  ContiguousIterator ind_it(shape, strides, src.ndim() - 1);
-
-  strides = src.strides();
-  strides.erase(strides.begin() + axis);
-  ContiguousIterator src_it(shape, strides, src.ndim() - 1);
+  auto shape = remove_index(ind.shape(), axis);
+  ContiguousIterator ind_it(
+      shape, remove_index(ind.strides(), axis), src.ndim() - 1);
+  ContiguousIterator src_it(
+      shape, remove_index(src.strides(), axis), src.ndim() - 1);

  auto ind_ptr = ind.data<IdxT>();
  auto src_ptr = src.data<T>();
@@ -585,15 +581,11 @@ void Scatter::eval_cpu(const std::vector<array>& inputs, array& out) {

 template <typename T, typename IdxT, typename OpT>
 void scatter_axis(array& out, const array idx, const array& upd, int axis) {
-  auto strides = idx.strides();
-  strides.erase(strides.begin() + axis);
-  auto shape = idx.shape();
-  shape.erase(shape.begin() + axis);
-  ContiguousIterator idx_it(shape, strides, upd.ndim() - 1);
-
-  strides = upd.strides();
-  strides.erase(strides.begin() + axis);
-  ContiguousIterator upd_it(shape, strides, upd.ndim() - 1);
+  auto shape = remove_index(idx.shape(), axis);
+  ContiguousIterator idx_it(
+      shape, remove_index(idx.strides(), axis), upd.ndim() - 1);
+  ContiguousIterator upd_it(
+      shape, remove_index(upd.strides(), axis), upd.ndim() - 1);

  auto idx_ptr = idx.data<IdxT>();
  auto upd_ptr = upd.data<T>();
--- a/mlx/backend/cpu/lapack.h
+++ b/mlx/backend/cpu/lapack.h
@@ -2,14 +2,14 @@

 #pragma once

-// Required for Visual Studio.
-// https://github.com/OpenMathLib/OpenBLAS/blob/develop/docs/install.md
-#ifdef _MSC_VER
 #include <complex>
 #define LAPACK_COMPLEX_CUSTOM
 #define lapack_complex_float std::complex<float>
 #define lapack_complex_double std::complex<double>
-#endif
+#define lapack_complex_float_real(z) ((z).real())
+#define lapack_complex_float_imag(z) ((z).imag())
+#define lapack_complex_double_real(z) ((z).real())
+#define lapack_complex_double_imag(z) ((z).imag())

 #ifdef MLX_USE_ACCELERATE
 #include <Accelerate/Accelerate.h>
@@ -32,7 +32,7 @@

 #endif

-#define INSTANTIATE_LAPACK_TYPES(FUNC)                       \
+#define INSTANTIATE_LAPACK_REAL(FUNC)                        \
  template <typename T, typename... Args>                    \
  void FUNC(Args... args) {                                  \
    if constexpr (std::is_same_v<T, float>) {                \
@@ -42,11 +42,24 @@
    }                                                        \
  }

-INSTANTIATE_LAPACK_TYPES(geqrf)
-INSTANTIATE_LAPACK_TYPES(orgqr)
-INSTANTIATE_LAPACK_TYPES(syevd)
-INSTANTIATE_LAPACK_TYPES(potrf)
-INSTANTIATE_LAPACK_TYPES(gesvdx)
-INSTANTIATE_LAPACK_TYPES(getrf)
-INSTANTIATE_LAPACK_TYPES(getri)
-INSTANTIATE_LAPACK_TYPES(trtri)
+INSTANTIATE_LAPACK_REAL(geqrf)
+INSTANTIATE_LAPACK_REAL(orgqr)
+INSTANTIATE_LAPACK_REAL(syevd)
+INSTANTIATE_LAPACK_REAL(geev)
+INSTANTIATE_LAPACK_REAL(potrf)
+INSTANTIATE_LAPACK_REAL(gesvdx)
+INSTANTIATE_LAPACK_REAL(getrf)
+INSTANTIATE_LAPACK_REAL(getri)
+INSTANTIATE_LAPACK_REAL(trtri)
+
+// Defines a function template FUNC<T>(args...) that forwards its arguments to
+// the single precision complex LAPACK routine (c##FUNC) when T is
+// std::complex<float> and to the double precision one (z##FUNC) when T is
+// std::complex<double>. For any other T the generated function does nothing.
+// The routine name is wrapped in MLX_LAPACK_FUNC before the call.
+#define INSTANTIATE_LAPACK_COMPLEX(FUNC)                            \
+  template <typename T, typename... Args>                           \
+  void FUNC(Args... args) {                                         \
+    if constexpr (std::is_same_v<T, std::complex<float>>) {         \
+      MLX_LAPACK_FUNC(c##FUNC)(std::forward<Args>(args)...);        \
+    } else if constexpr (std::is_same_v<T, std::complex<double>>) { \
+      MLX_LAPACK_FUNC(z##FUNC)(std::forward<Args>(args)...);        \
+    }                                                               \
+  }
+
+// Hermitian eigendecomposition: generates heevd<T>.
+INSTANTIATE_LAPACK_COMPLEX(heevd)
--- a/mlx/backend/cpu/masked_mm.cpp
+++ b/mlx/backend/cpu/masked_mm.cpp
@@ -6,6 +6,7 @@
 #include "mlx/backend/common/utils.h"
 #include "mlx/backend/cpu/copy.h"
 #include "mlx/backend/cpu/encoder.h"
+#include "mlx/backend/cpu/gemm.h"
 #include "mlx/backend/cpu/lapack.h"
 #include "mlx/primitives.h"

@@ -52,6 +53,58 @@ inline void mask_matrix(
  }
 }

+// Computes one matrix product per segment of the shared (K) dimension.
+//
+// `segments` holds `num_segments` pairs (k_start, k_end), read through
+// `segments_shape` / `segments_strides`. For segment i the product of the
+// columns [k_start, k_end) of `a` with the rows [k_start, k_end) of `b` is
+// written to the i-th M x N slab of `out`. Segments with k_end <= k_start
+// produce an all-zero slab.
+//
+// `a_transposed`, `b_transposed`, `lda` and `ldb` are forwarded to matmul<T>
+// unchanged, together with the strides of `a` and `b` and copies of their
+// shapes in which the K extent is replaced by the segment length.
+template <typename T>
+inline void segmented_mm(
+    const T* a,
+    const T* b,
+    const uint32_t* segments,
+    T* out,
+    bool a_transposed,
+    bool b_transposed,
+    size_t lda,
+    size_t ldb,
+    const Shape& a_shape,
+    const Strides& a_strides,
+    const Shape& b_shape,
+    const Strides& b_strides,
+    size_t num_segments,
+    const Shape& segments_shape,
+    const Strides& segments_strides) {
+  int ndim = a_shape.size();
+  Shape a_copy = a_shape;
+  Shape b_copy = b_shape;
+  int32_t M = a_copy[ndim - 2];
+  int32_t N = b_copy[ndim - 1];
+  // Size of one output slab, computed in 64-bit so that the slab offsets
+  // below cannot overflow a 32-bit int for large outputs.
+  const size_t out_size = static_cast<size_t>(M) * static_cast<size_t>(N);
+  for (size_t i = 0; i < num_segments; i++) {
+    T* out_i = out + i * out_size;
+    uint32_t k_start = segments[elem_to_loc(
+        static_cast<int>(2 * i), segments_shape, segments_strides)];
+    uint32_t k_end = segments[elem_to_loc(
+        static_cast<int>(2 * i + 1), segments_shape, segments_strides)];
+    if (k_end <= k_start) {
+      // Empty segment: the product over zero terms is zero.
+      std::fill_n(out_i, out_size, T(0));
+      continue;
+    }
+    // Restrict the K extent of both operands to this segment.
+    a_copy[ndim - 1] = k_end - k_start;
+    b_copy[ndim - 2] = k_end - k_start;
+    matmul<T>(
+        a + k_start * a_strides[ndim - 1],
+        b + k_start * b_strides[ndim - 2],
+        out_i,
+        a_transposed,
+        b_transposed,
+        lda,
+        ldb,
+        N,
+        1.0,
+        0.0,
+        1,
+        a_copy,
+        a_strides,
+        b_copy,
+        b_strides);
+  }
+}
+
 } // namespace

 void BlockMaskedMM::eval_cpu(const std::vector<array>& inputs, array& out) {
@@ -437,4 +490,121 @@ void GatherMM::eval_cpu(const std::vector<array>& inputs, array& out) {
  encoder.add_temporaries(std::move(temps));
 }

+// CPU evaluation of SegmentedMM.
+//
+// inputs[0] and inputs[1] are the matrices to multiply and inputs[2] holds the
+// segment boundaries (read as uint32). The output is allocated here and filled
+// by segmented_mm, which is instantiated for float64, float32, float16 and
+// bfloat16; any other dtype makes the dispatched task throw
+// std::invalid_argument.
+void SegmentedMM::eval_cpu(const std::vector<array>& inputs, array& out) {
+  out.set_data(allocator::malloc(out.nbytes()));
+
+  auto& s = stream();
+  auto& encoder = cpu::get_command_encoder(stream());
+  // Inspects the strides of the last two axes of x and returns
+  // (transposed, leading dimension, array to use). If x is neither row-major
+  // nor column-major in those axes, a general copy is made, registered as a
+  // temporary and returned as a non-transposed matrix.
+  auto check_transpose = [&s, &encoder](const array& x) {
+    auto stx = x.strides()[x.ndim() - 2];
+    auto sty = x.strides()[x.ndim() - 1];
+    if (stx == x.shape(-1) && sty == 1) {
+      return std::make_tuple(false, stx, x);
+    } else if (stx == 1 && sty == x.shape(-2)) {
+      return std::make_tuple(true, sty, x);
+    } else {
+      array xc(x.shape(), x.dtype(), nullptr, {});
+      copy(x, xc, CopyType::General, s);
+      encoder.add_temporary(xc);
+      // Shadows the outer stx: the copy's leading dimension is its last
+      // extent.
+      int64_t stx = x.shape(-1);
+      return std::make_tuple(false, stx, xc);
+    }
+  };
+
+  auto [a_transposed, lda, a] = check_transpose(inputs[0]);
+  auto [b_transposed, ldb, b] = check_transpose(inputs[1]);
+  auto& segments = inputs[2];
+
+  encoder.set_input_array(a);
+  encoder.set_input_array(b);
+  encoder.set_input_array(segments);
+  encoder.set_output_array(out);
+  // The structured bindings above are re-captured through init-captures
+  // (presumably because C++17 lambdas cannot capture them directly).
+  encoder.dispatch([a = array::unsafe_weak_copy(a),
+                    b = array::unsafe_weak_copy(b),
+                    segments = array::unsafe_weak_copy(segments),
+                    out_ptr = out.data<void>(),
+                    a_transposed = a_transposed,
+                    b_transposed = b_transposed,
+                    lda = lda,
+                    ldb = ldb]() {
+    // The four cases differ only in the element type; segments.size() / 2 is
+    // the number of (start, end) pairs.
+    switch (a.dtype()) {
+      case float64:
+        segmented_mm<double>(
+            a.data<double>(),
+            b.data<double>(),
+            segments.data<uint32_t>(),
+            static_cast<double*>(out_ptr),
+            a_transposed,
+            b_transposed,
+            lda,
+            ldb,
+            a.shape(),
+            a.strides(),
+            b.shape(),
+            b.strides(),
+            segments.size() / 2,
+            segments.shape(),
+            segments.strides());
+        break;
+      case float32:
+        segmented_mm<float>(
+            a.data<float>(),
+            b.data<float>(),
+            segments.data<uint32_t>(),
+            static_cast<float*>(out_ptr),
+            a_transposed,
+            b_transposed,
+            lda,
+            ldb,
+            a.shape(),
+            a.strides(),
+            b.shape(),
+            b.strides(),
+            segments.size() / 2,
+            segments.shape(),
+            segments.strides());
+        break;
+      case float16:
+        segmented_mm<float16_t>(
+            a.data<float16_t>(),
+            b.data<float16_t>(),
+            segments.data<uint32_t>(),
+            static_cast<float16_t*>(out_ptr),
+            a_transposed,
+            b_transposed,
+            lda,
+            ldb,
+            a.shape(),
+            a.strides(),
+            b.shape(),
+            b.strides(),
+            segments.size() / 2,
+            segments.shape(),
+            segments.strides());
+        break;
+      case bfloat16:
+        segmented_mm<bfloat16_t>(
+            a.data<bfloat16_t>(),
+            b.data<bfloat16_t>(),
+            segments.data<uint32_t>(),
+            static_cast<bfloat16_t*>(out_ptr),
+            a_transposed,
+            b_transposed,
+            lda,
+            ldb,
+            a.shape(),
+            a.strides(),
+            b.shape(),
+            b.strides(),
+            segments.size() / 2,
+            segments.shape(),
+            segments.strides());
+        break;
+      default:
+        throw std::invalid_argument(
+            "Segmented mm supports only real float types.");
+    }
+  });
+}
+
 } // namespace mlx::core
--- a/mlx/backend/cpu/matmul.cpp
+++ b/mlx/backend/cpu/matmul.cpp
@@ -132,6 +132,10 @@ void AddMM::eval_cpu(const std::vector<array>& inputs, array& out) {
    throw std::runtime_error(
        "[AddMM::eval_cpu] Currently only supports float32.");
  }
+  if (out.size() == 0) {
+    out.set_data(allocator::malloc(out.nbytes()));
+    return;
+  }

  // Fill output with C
  auto& c = inputs[2];
@@ -139,7 +143,9 @@ void AddMM::eval_cpu(const std::vector<array>& inputs, array& out) {
      ? CopyType::Scalar
      : (c.flags().row_contiguous ? CopyType::Vector : CopyType::General);
  copy(c, out, ctype, stream());
-
+  if (inputs[0].shape(-1) == 0) {
+    return;
+  }
  matmul_general(inputs[0], inputs[1], out, stream(), alpha_, beta_);
 }

--- a/mlx/backend/cpu/quantized.cpp
+++ b/mlx/backend/cpu/quantized.cpp
@@ -13,9 +13,18 @@ namespace mlx::core {

 namespace {

+// Number of quantized values stored in one pack: 8 for 3- and 5-bit values,
+// 4 for 6-bit values, and wsize / bits for every other width.
+inline constexpr short get_pack_factor(int bits, int wsize = 8) {
+  switch (bits) {
+    case 3:
+    case 5:
+      return 8;
+    case 6:
+      return 4;
+    default:
+      return wsize / bits;
+  }
+}
+
+// Number of bytes occupied by one pack: wsize / 8 when bits is a power of
+// two, 5 for 5-bit values and 3 for the remaining widths.
+inline constexpr short get_bytes_per_pack(int bits, int wsize = 8) {
+  // A power of two has a single set bit, so clearing the lowest one gives 0.
+  if ((bits & (bits - 1)) == 0) {
+    return wsize / 8;
+  }
+  if (bits == 5) {
+    return 5;
+  }
+  return 3;
+}
+
 template <typename T, int bits>
 void extract_bits(const uint8_t* w_in, T* w_out) {
-  assert(bits == 3 || bits == 6);
+  static_assert(bits == 3 || bits == 5 || bits == 6);
  if (bits == 3) {
    w_out[0] = static_cast<T>(w_in[0] & 0x7);
    w_out[1] = static_cast<T>((w_in[0] & 0x38) >> 3);
@@ -25,6 +34,16 @@ void extract_bits(const uint8_t* w_in, T* w_out) {
    w_out[5] = static_cast<T>(((w_in[1] & 0x80) >> 7) + ((w_in[2] & 0x3) << 1));
    w_out[6] = static_cast<T>((w_in[2] & 0x1c) >> 2);
    w_out[7] = static_cast<T>((w_in[2] & 0xe0) >> 5);
+  } else if (bits == 5) {
+    w_out[0] = static_cast<T>(w_in[0] & 0x1f);
+    w_out[1] = static_cast<T>(((w_in[0] & 0xe0) >> 5) + ((w_in[1] & 0x3) << 3));
+    w_out[2] = static_cast<T>((w_in[1] & 0x7c) >> 2);
+    w_out[3] = static_cast<T>(((w_in[1] & 0x80) >> 7) + ((w_in[2] & 0xf) << 1));
+    w_out[4] = static_cast<T>(((w_in[2] & 0xf0) >> 4) + ((w_in[3] & 0x1) << 4));
+    w_out[5] = static_cast<T>((w_in[3] & 0x3e) >> 1);
+    w_out[6] = static_cast<T>(((w_in[3] & 0xc0) >> 6) + ((w_in[4] & 0x7) << 2));
+    w_out[7] = static_cast<T>((w_in[4] & 0xf8) >> 3);
+
  } else if (bits == 6) {
    w_out[0] = static_cast<T>(w_in[0] & 0x3f);
    w_out[1] =
@@ -46,8 +65,8 @@ void _qmm(
    int N,
    int K) {
  constexpr int bitmask = (1 << bits) - 1;
-  constexpr int pack_factor = bits == 3 ? 8 : bits == 6 ? 4 : 8 / bits;
-  constexpr int bytes_per_pack = (bits == 3 || bits == 6) ? 3 : 1;
+  constexpr int pack_factor = get_pack_factor(bits, 8);
+  constexpr int bytes_per_pack = get_bytes_per_pack(bits);
  constexpr int packs_in_group = group_size / pack_factor;

  for (int m = 0; m < M; m++) {
@@ -65,7 +84,7 @@ void _qmm(
        T scale = *scales_local++;
        T bias = *biases_local++;
        for (int ng = 0; ng < packs_in_group; ng++) {
-          if (bits == 3 || bits == 6) {
+          if constexpr (bits == 3 || bits == 5 || bits == 6) {
            T wl[pack_factor];
            extract_bits<T, bits>(w_local, wl);
 #pragma clang loop unroll(full)
@@ -104,8 +123,9 @@ void _qmm_t(
    int N,
    int K) {
  constexpr int bitmask = (1 << bits) - 1;
-  constexpr int pack_factor = bits == 3 ? 8 : bits == 6 ? 4 : 8 / bits;
-  constexpr int bytes_per_pack = (bits == 3 || bits == 6) ? 3 : 1;
+
+  constexpr int pack_factor = get_pack_factor(bits, 8);
+  constexpr int bytes_per_pack = get_bytes_per_pack(bits);
  constexpr int packs_in_group = group_size / pack_factor;

  for (int m = 0; m < M; m++) {
@@ -121,7 +141,7 @@ void _qmm_t(
        T bias = *biases_local++;

        for (int kw = 0; kw < packs_in_group; kw++) {
-          if (bits == 3 || bits == 6) {
+          if constexpr (bits == 3 || bits == 5 || bits == 6) {
            T wl[pack_factor];
            extract_bits<T, bits>(w_local, wl);
 #pragma clang loop unroll(full)
@@ -304,6 +324,10 @@ void _qmm_dispatch_typed(
      _qmm_dispatch_group<T, 4>(
          result, x, w, scales, biases, M, N, K, group_size, transposed_w);
      break;
+    case 5:
+      _qmm_dispatch_group<T, 5>(
+          result, x, w, scales, biases, M, N, K, group_size, transposed_w);
+      break;
    case 6:
      _qmm_dispatch_group<T, 6>(
          result, x, w, scales, biases, M, N, K, group_size, transposed_w);
@@ -613,9 +637,8 @@ void quantize(
  float eps = 1e-7;

  bool power_of_2_bits = is_power_of_2(bits);
-  int el_per_int = bits == 3 ? 8 : bits == 6 ? 4 : 32 / bits;
-  // For 3/6 bits we read 3 uint8s at a time instead of 1 uint32
-  int bytes_per_pack = power_of_2_bits ? 1 : 3;
+  int el_per_int = get_pack_factor(bits, 32);
+  int bytes_per_pack = get_bytes_per_pack(bits);
  int int_per_group = group_size * bytes_per_pack / el_per_int;
  size_t n_groups = w_size / group_size;

@@ -640,15 +663,21 @@ void quantize(
    }
    size_t out_idx = i * int_per_group;
    for (int j = 0; j < int_per_group / bytes_per_pack; ++j) {
-      uint32_t out_el = 0;
+      uint64_t out_el = 0;
      for (int k = 0; k < el_per_int; ++k) {
        float w_el = w[w_idx + j * el_per_int + k];
        w_el = std::rint((w_el - bias) / scale);
        w_el = std::min(std::max(w_el, 0.0f), n_bins);
-        out_el |= static_cast<uint32_t>(w_el) << (k * bits);
+        out_el |= static_cast<uint64_t>(w_el) << (k * bits);
      }
      if (power_of_2_bits) {
        out[out_idx + j] = out_el;
+      } else if (bits == 5) {
+        out[out_idx + bytes_per_pack * j] = out_el & 0xff;
+        out[out_idx + bytes_per_pack * j + 1] = (out_el & 0xff00) >> 8;
+        out[out_idx + bytes_per_pack * j + 2] = (out_el & 0xff0000) >> 16;
+        out[out_idx + bytes_per_pack * j + 3] = (out_el & 0xff000000) >> 24;
+        out[out_idx + bytes_per_pack * j + 4] = (out_el & 0xff00000000) >> 32;
      } else {
        out[out_idx + bytes_per_pack * j] = out_el & 0xff;
        out[out_idx + bytes_per_pack * j + 1] = (out_el & 0xff00) >> 8;
--- a/mlx/backend/cpu/reduce.cpp
+++ b/mlx/backend/cpu/reduce.cpp
@@ -325,7 +325,15 @@ struct MaxReduce {
  };

  template <int N, typename T>
-  T operator()(simd::Simd<T, N> x) {
+  std::enable_if_t<std::is_integral_v<T>, T> operator()(simd::Simd<T, N> x) {
+    return simd::max(x);
+  };
+
+  template <int N, typename T>
+  std::enable_if_t<!std::is_integral_v<T>, T> operator()(simd::Simd<T, N> x) {
+    if (simd::any(x != x)) {
+      return static_cast<T>(NAN);
+    }
    return simd::max(x);
  };
 };
@@ -342,7 +350,15 @@ struct MinReduce {
  };

  template <int N, typename T>
-  T operator()(simd::Simd<T, N> x) {
+  std::enable_if_t<std::is_integral_v<T>, T> operator()(simd::Simd<T, N> x) {
+    return simd::min(x);
+  };
+
+  template <int N, typename T>
+  std::enable_if_t<!std::is_integral_v<T>, T> operator()(simd::Simd<T, N> x) {
+    if (simd::any(x != x)) {
+      return static_cast<T>(NAN);
+    }
    return simd::min(x);
  };
 };
@@ -527,10 +543,10 @@ void Reduce::eval_cpu(const std::vector<array>& inputs, array& out) {
            reduce_dispatch_min_max<uint64_t>(in, out, reduce_type_, axes_);
            break;
          case int8:
-            reduce_dispatch_min_max<uint8_t>(in, out, reduce_type_, axes_);
+            reduce_dispatch_min_max<int8_t>(in, out, reduce_type_, axes_);
            break;
          case int16:
-            reduce_dispatch_min_max<uint16_t>(in, out, reduce_type_, axes_);
+            reduce_dispatch_min_max<int16_t>(in, out, reduce_type_, axes_);
            break;
          case int32:
            reduce_dispatch_min_max<int32_t>(in, out, reduce_type_, axes_);
--- a/mlx/backend/cpu/scan.cpp
+++ b/mlx/backend/cpu/scan.cpp
@@ -330,7 +330,8 @@ void Scan::eval_cpu(const std::vector<array>& inputs, array& out) {
            reduce_type_, in, out, axis_, reverse_, inclusive_);
        break;
      case complex64:
-        throw std::runtime_error("Scan ops do not support complex types yet");
+        scan_dispatch<complex64_t, complex64_t>(
+            reduce_type_, in, out, axis_, reverse_, inclusive_);
        break;
    }
  });
--- a/mlx/backend/cpu/simd/base_simd.h
+++ b/mlx/backend/cpu/simd/base_simd.h
@@ -88,12 +88,33 @@ DEFAULT_UNARY(expm1, std::expm1)
 DEFAULT_UNARY(floor, std::floor)
 DEFAULT_UNARY(log, std::log)
 DEFAULT_UNARY(log10, std::log10)
-DEFAULT_UNARY(log1p, std::log1p)
 DEFAULT_UNARY(sinh, std::sinh)
 DEFAULT_UNARY(sqrt, std::sqrt)
 DEFAULT_UNARY(tan, std::tan)
 DEFAULT_UNARY(tanh, std::tanh)

+// log(1 + in) for a single-lane Simd value.
+//
+// Real types defer to std::log1p. For complex types the result is
+// log|1 + z| + i * arg(1 + z). When |z| < 0.5 the real part is computed as
+// 0.5 * log1p(|1 + z|^2 - 1), with |1 + z|^2 - 1 = x * (2 + x) + y^2, rather
+// than through log(hypot(x + 1, y)).
+template <typename T>
+Simd<T, 1> log1p(Simd<T, 1> in) {
+  if constexpr (is_complex<T>) {
+    auto x = in.value.real();
+    auto y = in.value.imag();
+    auto zabs = std::abs(in.value);
+    auto theta = std::atan2(y, x + 1);
+    if (zabs < 0.5) {
+      auto r = x * (2 + x) + y * y;
+      if (r == 0) { // handle underflow
+        return Simd<T, 1>{T{x, theta}};
+      }
+      // decltype is standard C++; the GNU-only typeof is not accepted by all
+      // compilers.
+      return Simd<T, 1>{
+          T{static_cast<decltype(x)>(0.5) * std::log1p(r), theta}};
+    } else {
+      auto z0 = std::hypot(x + 1, y);
+      return Simd<T, 1>{T{std::log(z0), theta}};
+    }
+  } else {
+    return Simd<T, 1>{std::log1p(in.value)};
+  }
+}
+
 template <typename T>
 Simd<T, 1> log2(Simd<T, 1> in) {
  if constexpr (is_complex<T>) {
--- a/mlx/backend/cpu/unary.h
+++ b/mlx/backend/cpu/unary.h
@@ -2,32 +2,13 @@

 #pragma once

-#include "mlx/allocator.h"
-#include "mlx/array.h"
-#include "mlx/backend/common/utils.h"
+#include "mlx/backend/common/unary.h"
 #include "mlx/backend/cpu/encoder.h"
 #include "mlx/backend/cpu/simd/simd.h"
 #include "mlx/utils.h"

 namespace mlx::core {

-void set_unary_output_data(const array& in, array& out) {
-  if (in.flags().contiguous) {
-    if (is_donatable(in, out)) {
-      out.copy_shared_buffer(in);
-    } else {
-      auto size = in.data_size();
-      out.set_data(
-          allocator::malloc(size * out.itemsize()),
-          size,
-          in.strides(),
-          in.flags());
-    }
-  } else {
-    out.set_data(allocator::malloc(out.nbytes()));
-  }
-}
-
 template <typename T, typename U = T, typename Op>
 void unary_op(const T* a, U* out, size_t shape, size_t stride) {
  for (size_t i = 0; i < shape; i += 1) {
--- a/mlx/backend/cuda/CMakeLists.txt
+++ b/mlx/backend/cuda/CMakeLists.txt
@@ -0,0 +1,127 @@
+# Filename rules in cuda backend:
+#
+# * Use .cu/.cuh if code contains device code, and .cpp/.h if not.
+# * Device-only code should be put in device/ subdir.
+# * Files in device/ subdir should not include files outside.
+target_sources(
+  mlx
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/allocator.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/arg_reduce.cu
+          ${CMAKE_CURRENT_SOURCE_DIR}/binary.cu
+          ${CMAKE_CURRENT_SOURCE_DIR}/binary_two.cu
+          ${CMAKE_CURRENT_SOURCE_DIR}/compiled.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/copy.cu
+          ${CMAKE_CURRENT_SOURCE_DIR}/copy/copy_contiguous.cu
+          ${CMAKE_CURRENT_SOURCE_DIR}/copy/copy_general.cu
+          ${CMAKE_CURRENT_SOURCE_DIR}/copy/copy_general_dynamic.cu
+          ${CMAKE_CURRENT_SOURCE_DIR}/copy/copy_general_input.cu
+          ${CMAKE_CURRENT_SOURCE_DIR}/cuda.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/device.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/eval.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/event.cu
+          ${CMAKE_CURRENT_SOURCE_DIR}/fence.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/jit_module.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/indexing.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/kernel_utils.cu
+          ${CMAKE_CURRENT_SOURCE_DIR}/matmul.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/layer_norm.cu
+          ${CMAKE_CURRENT_SOURCE_DIR}/logsumexp.cu
+          ${CMAKE_CURRENT_SOURCE_DIR}/primitives.cu
+          ${CMAKE_CURRENT_SOURCE_DIR}/random.cu
+          ${CMAKE_CURRENT_SOURCE_DIR}/reduce.cu
+          ${CMAKE_CURRENT_SOURCE_DIR}/reduce/all_reduce.cu
+          ${CMAKE_CURRENT_SOURCE_DIR}/reduce/col_reduce.cu
+          ${CMAKE_CURRENT_SOURCE_DIR}/reduce/init_reduce.cu
+          ${CMAKE_CURRENT_SOURCE_DIR}/reduce/row_reduce.cu
+          ${CMAKE_CURRENT_SOURCE_DIR}/rms_norm.cu
+          ${CMAKE_CURRENT_SOURCE_DIR}/rope.cu
+          ${CMAKE_CURRENT_SOURCE_DIR}/scan.cu
+          ${CMAKE_CURRENT_SOURCE_DIR}/slicing.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/softmax.cu
+          ${CMAKE_CURRENT_SOURCE_DIR}/sort.cu
+          ${CMAKE_CURRENT_SOURCE_DIR}/ternary.cu
+          ${CMAKE_CURRENT_SOURCE_DIR}/unary.cu
+          ${CMAKE_CURRENT_SOURCE_DIR}/utils.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/worker.cpp)
+
+target_compile_definitions(mlx PRIVATE MLX_USE_CUDA)
+
+# Embed kernel sources in binary for JIT compilation.
+#
+# All headers under device/ are collected (as paths relative to this
+# directory), joined with ":" into a single command-line argument and handed
+# to bin2h.cmake, which splits the argument on ":" again and writes
+# gen/cuda_jit_sources.h.
+file(
+  GLOB MLX_JIT_SOURCES
+  RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
+  "${CMAKE_CURRENT_SOURCE_DIR}/device/*.h"
+  "${CMAKE_CURRENT_SOURCE_DIR}/device/*.cuh")
+string(JOIN ":" MLX_JIT_SOURCES_ARG ${MLX_JIT_SOURCES})
+add_custom_command(
+  OUTPUT gen/cuda_jit_sources.h
+  COMMAND
+    ${CMAKE_COMMAND} -DMLX_SOURCE_ROOT=${CMAKE_CURRENT_SOURCE_DIR}
+    -DMLX_JIT_SOURCES=${MLX_JIT_SOURCES_ARG} -P
+    "${CMAKE_CURRENT_SOURCE_DIR}/bin2h.cmake"
+  DEPENDS bin2h.cmake ${MLX_JIT_SOURCES})
+# The generated header is produced before mlx is built and is found through
+# the gen/ include directory.
+add_custom_target(cuda_jit_sources DEPENDS gen/cuda_jit_sources.h)
+add_dependencies(mlx cuda_jit_sources)
+target_include_directories(mlx PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/gen")
+
+# Enable defining device lambda functions.
+target_compile_options(mlx
+                       PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:--extended-lambda>")
+
+# Enable calling host constexpr functions from device. This is needed because
+# the constexpr version of isnan is host only.
+target_compile_options(
+  mlx PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr>")
+
+# CUDA 12.8 emits warning #20280-D for copy kernels, which is a false positive.
+# Explicitly pass --static-global-template-stub=false to suppress the warning.
+# Setting the flag to true would also be safe, but the warning would not be
+# suppressed.
+if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8.0)
+  target_compile_options(
+    mlx
+    PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:--static-global-template-stub=false>")
+endif()
+
+# Suppress warning when building for compute capability 7 used by V100.
+target_compile_options(
+  mlx PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:--Wno-deprecated-gpu-targets>")
+
+# Compute capability 7 is required for synchronization between CPU/GPU with
+# managed memory. TODO: Add more architectures for potential performance gain.
+# The list is a cache variable, so it can be overridden at configure time.
+set(MLX_CUDA_ARCHITECTURES
+    "70;80"
+    CACHE STRING "CUDA architectures")
+message(STATUS "CUDA architectures: ${MLX_CUDA_ARCHITECTURES}")
+set_target_properties(mlx PROPERTIES CUDA_ARCHITECTURES
+                                     "${MLX_CUDA_ARCHITECTURES}")
+
+# Use fixed version of CCCL. BEFORE prepends the fetched include directory to
+# the target's include path.
+FetchContent_Declare(
+  cccl
+  URL "https://github.com/NVIDIA/cccl/releases/download/v2.8.1/cccl-v2.8.1.zip")
+FetchContent_MakeAvailable(cccl)
+target_include_directories(mlx BEFORE PRIVATE "${cccl_SOURCE_DIR}/include")
+
+# Use fixed version of NVTX.
+FetchContent_Declare(
+  nvtx3
+  GIT_REPOSITORY https://github.com/NVIDIA/NVTX.git
+  GIT_TAG v3.1.1
+  GIT_SHALLOW TRUE
+  SOURCE_SUBDIR c EXCLUDE_FROM_ALL)
+FetchContent_MakeAvailable(nvtx3)
+# BUILD_INTERFACE limits the dependency to the build tree.
+target_link_libraries(mlx PUBLIC $<BUILD_INTERFACE:nvtx3-cpp>)
+
+# Make cuda runtime APIs available in non-cuda files.
+find_package(CUDAToolkit REQUIRED)
+target_include_directories(mlx PRIVATE ${CUDAToolkit_INCLUDE_DIRS})
+
+# Use cublasLt.
+target_link_libraries(mlx PRIVATE CUDA::cublasLt)
+
+# Use NVRTC and driver APIs.
+target_link_libraries(mlx PRIVATE CUDA::nvrtc CUDA::cuda_driver)
+
+# Suppress nvcc warnings on MLX headers.
+target_compile_options(mlx PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-Xcudafe
+                                   --diag_suppress=997>)
--- a/mlx/backend/cuda/allocator.cpp
+++ b/mlx/backend/cuda/allocator.cpp
@@ -0,0 +1,215 @@
+// Copyright © 2025 Apple Inc.
+
+#include "mlx/backend/cuda/allocator.h"
+#include "mlx/backend/cuda/utils.h"
+#include "mlx/backend/cuda/worker.h"
+#include "mlx/utils.h"
+
+#include <cuda_runtime.h>
+#include <fmt/format.h>
+#include <unistd.h>
+
+#include <cassert>
+
+namespace mlx::core {
+
+namespace cu {
+
+// Allocation granularity in bytes: CudaAllocator::malloc() rounds requests of
+// at least this size up to a multiple of it, and smaller requests up to a
+// power of two.
+constexpr int page_size = 16384;
+
+// Sets up the buffer cache and derives the default memory and cache limits
+// from the total memory reported by cudaMemGetInfo().
+CudaAllocator::CudaAllocator()
+    : buffer_cache_(
+          page_size,
+          // Reports the size of a cached buffer.
+          [](CudaBuffer* buf) { return buf->size; },
+          // Releases a cached buffer together with its CUDA allocation.
+          [this](CudaBuffer* buf) {
+            cuda_free(buf->data);
+            delete buf;
+          }) {
+  // TODO: Set memory limit for multi-device.
+  size_t free, total;
+  CHECK_CUDA_ERROR(cudaMemGetInfo(&free, &total));
+  // Both limits default to 80% of the total memory.
+  memory_limit_ = total * 0.8;
+  max_pool_size_ = memory_limit_;
+}
+
+// Allocates a buffer of at least |size| bytes of CUDA managed memory.
+//
+// The request is rounded up to a power of two when it is smaller than
+// |page_size| and to a multiple of |page_size| otherwise. A buffer of the
+// rounded size is taken from the cache when available; otherwise the cache is
+// trimmed if the memory limit would be reached and a new buffer is created
+// with cudaMallocManaged().
+//
+// Returns a null Buffer when CUDA reports an out-of-memory condition and
+// throws std::runtime_error for any other CUDA failure.
+Buffer CudaAllocator::malloc(size_t size) {
+  // Find available buffer from cache.
+  std::unique_lock lock(mutex_);
+  if (size < page_size) {
+    size = next_power_of_2(size);
+  } else {
+    size = page_size * ((size + page_size - 1) / page_size);
+  }
+
+  CudaBuffer* buf = buffer_cache_.reuse_from_cache(size);
+  if (!buf) {
+    // If we have a lot of memory pressure or are over the maximum cache size,
+    // try to reclaim memory from the cache.
+    size_t mem_required = get_active_memory() + get_cache_memory() + size;
+    if (mem_required >= memory_limit_) {
+      buffer_cache_.release_cached_buffers(mem_required - memory_limit_);
+    }
+
+    // Release the lock while calling into CUDA.
+    lock.unlock();
+    void* data = nullptr;
+    cudaError_t err = cudaMallocManaged(&data, size);
+    if (err == cudaErrorMemoryAllocation) {
+      // Out of memory: report failure with a null Buffer instead of handing
+      // out a buffer without backing memory and counting it as active.
+      return Buffer{nullptr};
+    }
+    if (err != cudaSuccess) {
+      throw std::runtime_error(fmt::format(
+          "cudaMallocManaged failed: {}.", cudaGetErrorString(err)));
+    }
+    // Only create the bookkeeping object once the allocation has succeeded,
+    // so that nothing is leaked on the failure paths above.
+    buf = new CudaBuffer{data, size};
+    lock.lock();
+  }
+  active_memory_ += size;
+  peak_memory_ = std::max(active_memory_, peak_memory_);
+
+  // Maintain the cache below the requested limit.
+  if (get_cache_memory() > max_pool_size_) {
+    buffer_cache_.release_cached_buffers(get_cache_memory() - max_pool_size_);
+  }
+
+  return Buffer{buf};
+}
+
+// Returns |buffer| to the allocator; a null buffer is ignored.
+//
+// The buffer is kept in the cache for later reuse while the cache is below
+// its size limit. Otherwise its CUDA memory is released through cuda_free()
+// and the CudaBuffer object is deleted.
+void CudaAllocator::free(Buffer buffer) {
+  auto* buf = static_cast<CudaBuffer*>(buffer.ptr());
+  if (!buf) {
+    return;
+  }
+
+  std::unique_lock lock(mutex_);
+  active_memory_ -= buf->size;
+  if (get_cache_memory() < max_pool_size_) {
+    buffer_cache_.recycle_to_cache(buf);
+  } else {
+    // The allocator lock is not held while the CUDA memory is released.
+    lock.unlock();
+    cuda_free(buf->data);
+    delete buf;
+  }
+}
+
+// Returns the (rounded) size in bytes recorded for |buffer|, or 0 when the
+// buffer is null.
+size_t CudaAllocator::size(Buffer buffer) const {
+  const auto* cuda_buf = static_cast<CudaBuffer*>(buffer.ptr());
+  return cuda_buf ? cuda_buf->size : 0;
+}
+
+// Records the calling thread in |allowed_threads_|, so that cuda_free() calls
+// made from it invoke cudaFree() directly instead of being rescheduled.
+void CudaAllocator::register_this_thread() {
+  std::lock_guard lock(worker_mutex_);
+  allowed_threads_.insert(std::this_thread::get_id());
+}
+
+// Releases |buf| with cudaFree() when the calling thread has been registered
+// through register_this_thread(); otherwise the call is handed to a lazily
+// created Worker and this function returns immediately.
+// NOTE(review): the rescheduled task calls cuda_free() again, so this relies
+// on the worker's thread being a registered one — confirm in Worker.
+void CudaAllocator::cuda_free(void* buf) {
+  // If cuda_free() is called from an unregistered thread, reschedule the call
+  // to worker.
+  {
+    // |worker_mutex_| guards both |allowed_threads_| and |worker_|; it is
+    // released before cudaFree() is called below.
+    std::lock_guard lock(worker_mutex_);
+    if (allowed_threads_.count(std::this_thread::get_id()) == 0) {
+      if (!worker_) {
+        worker_.reset(new Worker);
+      }
+      worker_->add_task([this, buf]() { this->cuda_free(buf); });
+      worker_->end_batch();
+      worker_->commit();
+      return;
+    }
+  }
+  cudaFree(buf);
+}
+
+// Bytes currently handed out by malloc() and not yet returned through free().
+size_t CudaAllocator::get_active_memory() const {
+  return active_memory_;
+}
+
+// Largest value of the active memory seen since the last reset.
+size_t CudaAllocator::get_peak_memory() const {
+  return peak_memory_;
+}
+
+// Restarts peak tracking from zero.
+void CudaAllocator::reset_peak_memory() {
+  std::lock_guard guard(mutex_);
+  peak_memory_ = 0;
+}
+
+// Current memory limit in bytes.
+size_t CudaAllocator::get_memory_limit() {
+  return memory_limit_;
+}
+
+// Installs a new memory limit and returns the previous one.
+size_t CudaAllocator::set_memory_limit(size_t limit) {
+  std::lock_guard guard(mutex_);
+  return std::exchange(memory_limit_, limit);
+}
+
+// Bytes currently held by the buffer cache.
+size_t CudaAllocator::get_cache_memory() const {
+  return buffer_cache_.cache_size();
+}
+
+// Installs a new cache size limit and returns the previous one.
+size_t CudaAllocator::set_cache_limit(size_t limit) {
+  std::lock_guard guard(mutex_);
+  return std::exchange(max_pool_size_, limit);
+}
+
+// Drops every buffer held by the cache.
+void CudaAllocator::clear_cache() {
+  std::lock_guard guard(mutex_);
+  buffer_cache_.clear();
+}
+
+// Returns the process-wide CudaAllocator, creating it on first use.
+CudaAllocator& allocator() {
+  // The instance is heap-allocated and intentionally never deleted: its
+  // destructor does not run at exit, so buffers in the cache are leaked,
+  // which can save some time at program exit.
+  static auto* const instance = new CudaAllocator;
+  return *instance;
+}
+
+} // namespace cu
+
+namespace allocator {
+
+// The generic allocator entry point resolves to the CUDA allocator.
+Allocator& allocator() {
+  return cu::allocator();
+}
+
+// Returns the data pointer stored in the underlying CudaBuffer, or nullptr
+// for a null buffer.
+void* Buffer::raw_ptr() {
+  auto* cuda_buf = static_cast<cu::CudaBuffer*>(ptr_);
+  return cuda_buf ? cuda_buf->data : nullptr;
+}
+
+} // namespace allocator
+
+// Free-function memory API; each call forwards to the CUDA allocator
+// singleton.
+
+size_t get_active_memory() {
+  auto& alloc = cu::allocator();
+  return alloc.get_active_memory();
+}
+
+size_t get_peak_memory() {
+  auto& alloc = cu::allocator();
+  return alloc.get_peak_memory();
+}
+
+void reset_peak_memory() {
+  cu::allocator().reset_peak_memory();
+}
+
+size_t set_memory_limit(size_t limit) {
+  auto& alloc = cu::allocator();
+  return alloc.set_memory_limit(limit);
+}
+
+size_t get_memory_limit() {
+  auto& alloc = cu::allocator();
+  return alloc.get_memory_limit();
+}
+
+size_t get_cache_memory() {
+  auto& alloc = cu::allocator();
+  return alloc.get_cache_memory();
+}
+
+size_t set_cache_limit(size_t limit) {
+  auto& alloc = cu::allocator();
+  return alloc.set_cache_limit(limit);
+}
+
+void clear_cache() {
+  cu::allocator().clear_cache();
+}
+
+// Not supported in CUDA.
+size_t set_wired_limit(size_t) {
+  return 0;
+}
+
+} // namespace mlx::core
--- a/mlx/backend/cuda/allocator.h
+++ b/mlx/backend/cuda/allocator.h
@@ -0,0 +1,67 @@
+// Copyright © 2025 Apple Inc.
+
+#pragma once
+
+#include "mlx/allocator.h"
+#include "mlx/backend/common/buffer_cache.h"
+
+#include <mutex>
+#include <set>
+#include <thread>
+#include <utility>
+
+namespace mlx::core::cu {
+
+class Worker;
+
+using allocator::Buffer;
+
+// Stores cuda-managed unified memory.
+struct CudaBuffer {
+  // Pointer to the allocation (obtained from cudaMallocManaged()).
+  void* data;
+  // Size of the allocation in bytes, after rounding by the allocator.
+  size_t size;
+};
+
+// Allocator backed by CUDA managed memory with a cache of freed buffers.
+//
+// The single instance is obtained through cu::allocator(); the constructor is
+// private.
+// NOTE(review): std::unique_ptr is used below but <memory> is not included
+// directly by this header — confirm it is reliably pulled in transitively.
+class CudaAllocator : public allocator::Allocator {
+ public:
+  // Allocator interface.
+  Buffer malloc(size_t size) override;
+  void free(Buffer buffer) override;
+  size_t size(Buffer buffer) const override;
+
+  // Register current thread as safe to free buffers.
+  // In cuda freeing a buffer implicitly synchronizes stream, and for threads
+  // that may be waited by gpu stream (for example cpu stream threads), freeing
+  // buffers there would result in dead lock.
+  void register_this_thread();
+
+  // Call cudaFree in the safe thread.
+  void cuda_free(void* buf);
+
+  // Memory statistics and limits, all in bytes. The setters return the
+  // previous value.
+  size_t get_active_memory() const;
+  size_t get_peak_memory() const;
+  void reset_peak_memory();
+  size_t get_memory_limit();
+  size_t set_memory_limit(size_t limit);
+  size_t get_cache_memory() const;
+  size_t set_cache_limit(size_t limit);
+  void clear_cache();
+
+ private:
+  CudaAllocator();
+  friend CudaAllocator& allocator();
+
+  // State used by cuda_free() to reschedule frees from unregistered threads.
+  std::mutex worker_mutex_;
+  std::unique_ptr<Worker> worker_;
+  std::set<std::thread::id> allowed_threads_;
+
+  // Allocation bookkeeping and the buffer cache.
+  std::mutex mutex_;
+  size_t memory_limit_;
+  size_t max_pool_size_;
+  BufferCache<CudaBuffer> buffer_cache_;
+  size_t active_memory_{0};
+  size_t peak_memory_{0};
+};
+
+CudaAllocator& allocator();
+
+} // namespace mlx::core::cu
--- a/mlx/backend/cuda/arg_reduce.cu
+++ b/mlx/backend/cuda/arg_reduce.cu
@@ -0,0 +1,182 @@
+// Copyright © 2025 Apple Inc.
+#include "mlx/backend/common/utils.h"
+#include "mlx/backend/cuda/device.h"
+#include "mlx/backend/cuda/device/fp16_math.cuh"
+#include "mlx/backend/cuda/iterators/strided_iterator.cuh"
+#include "mlx/backend/cuda/kernel_utils.cuh"
+#include "mlx/dtype_utils.h"
+#include "mlx/primitives.h"
+
+#include <cooperative_groups.h>
+#include <nvtx3/nvtx3.hpp>
+#include <cub/block/block_load.cuh>
+#include <cub/block/block_reduce.cuh>
+
+#include <cassert>
+
+namespace mlx::core {
+
+namespace cu {
+
+namespace cg = cooperative_groups;
+
+// A candidate of the arg-reduction: a value together with its index along
+// the reduced axis.
+template <typename T>
+struct IndexValPair {
+  uint32_t index;
+  T val;
+};
+
+// Reduction functor that keeps the smallest value and its index.
+template <typename T>
+struct ArgMin {
+  // Starting value of the reduction.
+  constexpr __device__ T init() {
+    return Limits<T>::max();
+  }
+
+  // Combines two candidates: |current| replaces |best| when its value is
+  // smaller, or when the values are equal and its index is lower.
+  __device__ IndexValPair<T> operator()(
+      const IndexValPair<T>& best,
+      const IndexValPair<T>& current) {
+    const bool take_current = best.val > current.val ||
+        (best.val == current.val && best.index > current.index);
+    return take_current ? current : best;
+  }
+
+  // Folds N values into |best|; vals[k] has index |offset| + k.
+  template <int N>
+  __device__ IndexValPair<T>
+  reduce_many(IndexValPair<T> best, T (&vals)[N], uint32_t offset) {
+    for (int k = 0; k < N; k++) {
+      if (!(vals[k] < best.val)) {
+        continue;
+      }
+      best.val = vals[k];
+      best.index = offset + k;
+    }
+    return best;
+  }
+};
+
+// Reduction functor that keeps the largest value and its index.
+template <typename T>
+struct ArgMax {
+  // Starting value of the reduction.
+  constexpr __device__ T init() {
+    return Limits<T>::min();
+  }
+
+  // Combines two candidates: |current| replaces |best| when its value is
+  // larger, or when the values are equal and its index is lower.
+  __device__ IndexValPair<T> operator()(
+      const IndexValPair<T>& best,
+      const IndexValPair<T>& current) {
+    const bool take_current = best.val < current.val ||
+        (best.val == current.val && best.index > current.index);
+    return take_current ? current : best;
+  }
+
+  // Folds N values into |best|; vals[k] has index |offset| + k.
+  template <int N>
+  __device__ IndexValPair<T>
+  reduce_many(IndexValPair<T> best, T (&vals)[N], uint32_t offset) {
+    for (int k = 0; k < N; k++) {
+      if (!(vals[k] > best.val)) {
+        continue;
+      }
+      best.val = vals[k];
+      best.index = offset + k;
+    }
+    return best;
+  }
+};
+
+// Arg-reduction kernel along one axis. One thread block handles one output
+// element: its threads scan the reduced axis together and then combine their
+// candidates with a block-wide reduction.
+//
+// in/out:       input data and output indices.
+// size:         number of output elements.
+// shape:        input shape with the reduced axis removed.
+// in_strides:   input strides with the reduced axis removed.
+// out_strides:  output strides matching |shape|.
+// ndim:         number of dimensions in |shape|.
+// axis_stride:  input stride of the reduced axis.
+// axis_size:    length of the reduced axis.
+template <typename T, typename Op, int BLOCK_DIM, int N_READS = 4>
+__global__ void arg_reduce_general(
+    const T* in,
+    uint32_t* out,
+    size_t size,
+    const __grid_constant__ Shape shape,
+    const __grid_constant__ Strides in_strides,
+    const __grid_constant__ Strides out_strides,
+    int32_t ndim,
+    int64_t axis_stride,
+    int32_t axis_size) {
+  auto block = cg::this_thread_block();
+
+  // The block rank selects the output element.
+  int64_t index = cg::this_grid().block_rank();
+  if (index >= size) {
+    return;
+  }
+
+  int64_t in_idx = elem_to_loc(index, shape.data(), in_strides.data(), ndim);
+  int64_t out_idx = elem_to_loc(index, shape.data(), out_strides.data(), ndim);
+
+  Op op;
+  T init = op.init();
+  IndexValPair<T> best{0, init};
+
+  // Each iteration covers BLOCK_DIM * N_READS elements of the axis; every
+  // thread loads N_READS of them, with |init| passed as the default for
+  // positions past |axis_size|.
+  for (int r = 0; r < cuda::ceil_div(axis_size, BLOCK_DIM * N_READS); ++r) {
+    T vals[N_READS];
+    auto tid = r * BLOCK_DIM + block.thread_index().x;
+    cub::LoadDirectBlocked(
+        tid, strided_iterator(in + in_idx, axis_stride), vals, axis_size, init);
+    // tid * N_READS is passed as the index of vals[0] along the axis.
+    best = op.reduce_many(best, vals, tid * N_READS);
+  }
+
+  typedef cub::BlockReduce<IndexValPair<T>, BLOCK_DIM> BlockReduceT;
+  __shared__ typename BlockReduceT::TempStorage temp;
+
+  // Combine the per-thread candidates across the block.
+  best = BlockReduceT(temp).Reduce(best, op);
+
+  // Only the first thread of the block writes the result.
+  if (block.thread_rank() == 0) {
+    out[out_idx] = best.index;
+  }
+}
+
+} // namespace cu
+
+// Evaluates ArgMin/ArgMax along |axis_| on the GPU: allocates the output and
+// enqueues arg_reduce_general with one thread block per output element.
+void ArgReduce::eval_gpu(const std::vector<array>& inputs, array& out) {
+  nvtx3::scoped_range r("ArgReduce::eval_gpu");
+  assert(inputs.size() == 1);
+  auto& in = inputs[0];
+  out.set_data(allocator::malloc(out.nbytes()));
+  auto& s = stream();
+
+  // Prepare the shapes, strides and axis arguments.
+  Shape shape = remove_index(in.shape(), axis_);
+  Strides in_strides = remove_index(in.strides(), axis_);
+  // When the output has the same rank as the input, the reduced axis is
+  // removed from its strides as well so that they line up with |shape|.
+  Strides out_strides = out.ndim() == in.ndim()
+      ? remove_index(out.strides(), axis_)
+      : out.strides();
+  int64_t axis_stride = in.strides()[axis_];
+  int32_t axis_size = in.shape()[axis_];
+  int32_t ndim = shape.size();
+
+  // ArgReduce.
+  auto& encoder = cu::get_command_encoder(s);
+  encoder.set_input_array(in);
+  encoder.set_output_array(out);
+  dispatch_real_types(in.dtype(), "ArgReduce", [&](auto type_tag) {
+    using T = cuda_type_t<MLX_GET_TYPE(type_tag)>;
+    constexpr uint32_t N_READS = 4;
+    // The block size is chosen from the number of N_READS-wide chunks along
+    // the reduced axis.
+    dispatch_block_dim(cuda::ceil_div(axis_size, N_READS), [&](auto block_dim) {
+      dim3 num_blocks = get_2d_grid_dims(out.shape(), out.strides());
+      // Default to ArgMax and switch to the ArgMin instantiation if needed.
+      auto kernel =
+          cu::arg_reduce_general<T, cu::ArgMax<T>, block_dim(), N_READS>;
+      if (reduce_type_ == ArgReduce::ArgMin) {
+        kernel = cu::arg_reduce_general<T, cu::ArgMin<T>, block_dim(), N_READS>;
+      }
+      encoder.add_kernel_node(
+          kernel,
+          num_blocks,
+          block_dim(),
+          in.data<T>(),
+          out.data<uint32_t>(),
+          out.size(),
+          const_param(shape),
+          const_param(in_strides),
+          const_param(out_strides),
+          ndim,
+          axis_stride,
+          axis_size);
+    });
+  });
+}
+
+} // namespace mlx::core
--- a/mlx/backend/cuda/bin2h.cmake
+++ b/mlx/backend/cuda/bin2h.cmake
@@ -0,0 +1,150 @@
+# Based on: https://github.com/sivachandran/cmake-bin2h
+#
+# Copyright 2020 Sivachandran Paramasivam
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+include(CMakeParseArguments)
+
+# Function to wrap a given string into multiple lines at the given column
+# position. The variable is overwritten in the caller's scope with the wrapped
+# text; every produced line is preceded by a newline and one space.
+#
+# Parameters:
+#
+# * VARIABLE - The name of the CMake variable holding the string.
+# * AT_COLUMN - The column position at which string will be wrapped.
+function(WRAP_STRING)
+  set(oneValueArgs VARIABLE AT_COLUMN)
+  # NOTE(review): |options| is not set in this function, so it expands to an
+  # empty option list unless a caller scope defines it.
+  cmake_parse_arguments(WRAP_STRING "${options}" "${oneValueArgs}" "" ${ARGN})
+
+  string(LENGTH ${${WRAP_STRING_VARIABLE}} stringLength)
+  math(EXPR offset "0")
+
+  # Cut off AT_COLUMN characters per iteration until nothing is left.
+  while(stringLength GREATER 0)
+    if(stringLength GREATER ${WRAP_STRING_AT_COLUMN})
+      math(EXPR length "${WRAP_STRING_AT_COLUMN}")
+    else()
+      math(EXPR length "${stringLength}")
+    endif()
+
+    string(SUBSTRING ${${WRAP_STRING_VARIABLE}} ${offset} ${length} line)
+    set(lines "${lines}\n ${line}")
+
+    math(EXPR stringLength "${stringLength} - ${length}")
+    math(EXPR offset "${offset} + ${length}")
+  endwhile()
+
+  # Hand the wrapped text back through the variable named by VARIABLE.
+  set(${WRAP_STRING_VARIABLE}
+      "${lines}"
+      PARENT_SCOPE)
+endfunction()
+
+# Function to embed the contents of files as byte arrays in a C/C++ header
+# file (.h). The header file will contain one byte array per source file.
+#
+# Parameters:
+#
+# * SOURCE_FILES - The paths of the source files whose contents will be
+#   embedded in the header file.
+# * VARIABLE_NAME - The name prefix of the byte array variables. Each array is
+#   named "<VARIABLE_NAME>_<file name without extension, as a C identifier>".
+# * HEADER_FILE - The path of the header file.
+# * APPEND - If specified, appends to the header file instead of overwriting
+#   it.
+# * HEADER_NAMESPACE - The namespace in which the arrays should be located.
+# * NULL_TERMINATE - If specified, a null byte (zero) will be appended to each
+#   byte array.
+#
+# Usage:
+#
+# bin2h(SOURCE_FILES "Logo.png" HEADER_FILE "Logo.h" VARIABLE_NAME "LOGO_PNG")
+# Writes one `constexpr char` array per file in SOURCE_FILES to HEADER_FILE.
+# Each array is named "<VARIABLE_NAME>_<file name as C identifier>" and the
+# arrays are wrapped in HEADER_NAMESPACE when that argument is given.
+function(BIN2H)
+  set(options APPEND NULL_TERMINATE)
+  set(oneValueArgs VARIABLE_NAME HEADER_FILE HEADER_NAMESPACE)
+  set(multiValueArgs SOURCE_FILES)
+  cmake_parse_arguments(BIN2H "${options}" "${oneValueArgs}"
+                        "${multiValueArgs}" ${ARGN})
+
+  set(arrayDefinition "")
+  foreach(SOURCE_FILE IN LISTS BIN2H_SOURCE_FILES)
+    # get filename without extension
+    get_filename_component(FILE_NAME_WE ${SOURCE_FILE} NAME_WE)
+    # convert the filename to a valid C identifier
+    string(MAKE_C_IDENTIFIER "${FILE_NAME_WE}" VALID_FILE_NAME)
+
+    # reads source file contents as hex string
+    file(READ ${SOURCE_FILE} hexString HEX)
+
+    # append null
+    if(BIN2H_NULL_TERMINATE)
+      string(APPEND hexString "00")
+    endif()
+
+    # wraps the hex string into multiple lines
+    wrap_string(VARIABLE hexString AT_COLUMN 24)
+
+    # strip the © in source code
+    # NOTE(review): this matches on the wrapped hex text, so it is not
+    # byte-aligned and does not see a sequence split across a line wrap.
+    string(REGEX REPLACE "c2a9" "2020" arrayValues ${hexString})
+
+    # turn every hex byte into a "0xNN," array element
+    string(REGEX REPLACE "([0-9a-f][0-9a-f])" " 0x\\1," arrayValues
+                         ${arrayValues})
+
+    # make a full variable name for the array
+    set(FULL_VARIABLE_NAME "${BIN2H_VARIABLE_NAME}_${VALID_FILE_NAME}")
+
+    # declares byte array and the length variables
+    string(APPEND arrayDefinition
+           "constexpr char ${FULL_VARIABLE_NAME}[] = {${arrayValues}\n};\n\n")
+  endforeach()
+
+  # add namespace wrapper if defined; otherwise emit the arrays at global
+  # scope instead of silently dropping them from the header
+  if(DEFINED BIN2H_HEADER_NAMESPACE)
+    set(namespaceStart "namespace ${BIN2H_HEADER_NAMESPACE} {")
+    set(namespaceEnd "} // namespace ${BIN2H_HEADER_NAMESPACE}")
+    set(declarations "${namespaceStart}\n\n${arrayDefinition}${namespaceEnd}\n")
+  else()
+    set(declarations "${arrayDefinition}")
+  endif()
+
+  set(arrayIncludes "#pragma once")
+  string(PREPEND declarations "${arrayIncludes}\n\n")
+
+  if(BIN2H_APPEND)
+    file(APPEND ${BIN2H_HEADER_FILE} "${declarations}")
+  else()
+    file(WRITE ${BIN2H_HEADER_FILE} "${declarations}")
+  endif()
+endfunction()
+
+# ----------------------------- CLI args -----------------------------
+#
+# Script entry point (run with `cmake -P`). Expects two variables:
+#
+# * MLX_JIT_SOURCES - ":"-separated list of source paths relative to
+#   MLX_SOURCE_ROOT.
+# * MLX_SOURCE_ROOT - The directory the relative paths are resolved against.
+
+# Turn the ":"-separated argument back into a CMake list and make every path
+# absolute.
+string(REPLACE ":" ";" MLX_JIT_SOURCES_LIST ${MLX_JIT_SOURCES})
+foreach(source ${MLX_JIT_SOURCES_LIST})
+  list(APPEND MLX_JIT_SOURCES_ABS "${MLX_SOURCE_ROOT}/${source}")
+endforeach()
+
+# Generate null-terminated mlx::core::jit_source_<name> arrays.
+bin2h(
+  SOURCE_FILES
+  ${MLX_JIT_SOURCES_ABS}
+  NULL_TERMINATE
+  VARIABLE_NAME
+  "jit_source"
+  HEADER_NAMESPACE
+  "mlx::core"
+  HEADER_FILE
+  "${CMAKE_CURRENT_BINARY_DIR}/gen/cuda_jit_sources.h")
--- a/mlx/backend/cuda/binary.cu
+++ b/mlx/backend/cuda/binary.cu
@@ -0,0 +1,362 @@
+// Copyright © 2025 Apple Inc.
+
+#include "mlx/backend/common/binary.h"
+#include "mlx/backend/cuda/device.h"
+#include "mlx/backend/cuda/device/binary_ops.cuh"
+#include "mlx/backend/cuda/device/cucomplex_math.cuh"
+#include "mlx/backend/cuda/kernel_utils.cuh"
+#include "mlx/dtype_utils.h"
+#include "mlx/primitives.h"
+
+#include <cooperative_groups.h>
+#include <nvtx3/nvtx3.hpp>
+
+namespace mlx::core {
+
+namespace cu {
+
+namespace cg = cooperative_groups;
+
+// Elementwise binary kernel for the scalar-scalar case: every output element
+// is Op applied to a[0] and b[0].
+//
+// Each thread writes N_READS consecutive outputs starting at
+// index * N_READS. Threads whose range would run past |size| write the
+// remaining elements one by one; the others write their range with a single
+// vector store.
+template <typename Op, typename In, typename Out, typename IdxT, int N_READS>
+__global__ void binary_ss(const In* a, const In* b, Out* out, IdxT size) {
+  IdxT index = cg::this_grid().thread_rank();
+
+  if ((index + 1) * N_READS > size) {
+    // The loop index has type IdxT, like in the sibling kernels, so that it
+    // cannot overflow `int` when IdxT is a 64-bit type.
+    for (IdxT i = index * N_READS; i < size; ++i) {
+      out[i] = Op{}(a[0], b[0]);
+    }
+  } else {
+    AlignedVector<Out, N_READS> out_vec;
+#pragma unroll
+    for (int i = 0; i < N_READS; ++i) {
+      out_vec.val[i] = Op{}(a[0], b[0]);
+    }
+
+    store_vector<N_READS>(out, index, out_vec);
+  }
+}
+
+// Elementwise binary kernel for the scalar-vector case:
+// out[i] = Op(a[0], b[i]). Each thread handles N_READS consecutive elements,
+// vectorized when its whole range lies inside |size|.
+template <typename Op, typename In, typename Out, typename IdxT, int N_READS>
+__global__ void binary_sv(const In* a, const In* b, Out* out, IdxT size) {
+  IdxT tid = cg::this_grid().thread_rank();
+
+  if (!((tid + 1) * N_READS > size)) {
+    // Full range: vector load, elementwise op, vector store.
+    auto rhs = load_vector<N_READS>(b, tid);
+
+    AlignedVector<Out, N_READS> result;
+#pragma unroll
+    for (int k = 0; k < N_READS; ++k) {
+      result.val[k] = Op{}(a[0], rhs.val[k]);
+    }
+
+    store_vector<N_READS>(out, tid, result);
+    return;
+  }
+
+  // Partial range at the end of the data: go element by element.
+  for (IdxT pos = tid * N_READS; pos < size; ++pos) {
+    out[pos] = Op{}(a[0], b[pos]);
+  }
+}
+
+// Elementwise binary kernel for the vector-scalar case:
+// out[i] = Op(a[i], b[0]). Each thread handles N_READS consecutive elements,
+// vectorized when its whole range lies inside |size|.
+template <typename Op, typename In, typename Out, typename IdxT, int N_READS>
+__global__ void binary_vs(const In* a, const In* b, Out* out, IdxT size) {
+  IdxT tid = cg::this_grid().thread_rank();
+
+  if (!((tid + 1) * N_READS > size)) {
+    // Full range: vector load, elementwise op, vector store.
+    auto lhs = load_vector<N_READS>(a, tid);
+
+    AlignedVector<Out, N_READS> result;
+#pragma unroll
+    for (int k = 0; k < N_READS; ++k) {
+      result.val[k] = Op{}(lhs.val[k], b[0]);
+    }
+
+    store_vector<N_READS>(out, tid, result);
+    return;
+  }
+
+  // Partial range at the end of the data: go element by element.
+  for (IdxT pos = tid * N_READS; pos < size; ++pos) {
+    out[pos] = Op{}(a[pos], b[0]);
+  }
+}
+
+// Elementwise binary kernel for the vector-vector case:
+// out[i] = Op(a[i], b[i]). Each thread handles N_READS consecutive elements,
+// vectorized when its whole range lies inside |size|.
+template <typename Op, typename In, typename Out, typename IdxT, int N_READS>
+__global__ void binary_vv(const In* a, const In* b, Out* out, IdxT size) {
+  IdxT tid = cg::this_grid().thread_rank();
+
+  if (!((tid + 1) * N_READS > size)) {
+    // Full range: vector loads, elementwise op, vector store.
+    auto lhs = load_vector<N_READS>(a, tid);
+    auto rhs = load_vector<N_READS>(b, tid);
+
+    AlignedVector<Out, N_READS> result;
+#pragma unroll
+    for (int k = 0; k < N_READS; ++k) {
+      result.val[k] = Op{}(lhs.val[k], rhs.val[k]);
+    }
+
+    store_vector<N_READS>(out, tid, result);
+    return;
+  }
+
+  // Partial range at the end of the data: go element by element.
+  for (IdxT pos = tid * N_READS; pos < size; ++pos) {
+    out[pos] = Op{}(a[pos], b[pos]);
+  }
+}
+
+// General (strided) binary kernel for a compile-time number of dimensions.
+// One thread per output element: the flat output index is mapped to the
+// locations in |a| and |b| through the shape and the per-input strides.
+template <typename Op, typename In, typename Out, typename IdxT, int NDIM>
+__global__ void binary_g_nd(
+    const In* a,
+    const In* b,
+    Out* out,
+    IdxT size,
+    const __grid_constant__ cuda::std::array<int32_t, NDIM> shape,
+    const __grid_constant__ cuda::std::array<int64_t, NDIM> a_strides,
+    const __grid_constant__ cuda::std::array<int64_t, NDIM> b_strides) {
+  IdxT index = cg::this_grid().thread_rank();
+  if (!(index < size)) {
+    // Surplus thread past the end of the output.
+    return;
+  }
+  auto [a_loc, b_loc] = elem_to_loc_nd<NDIM>(
+      index, shape.data(), a_strides.data(), b_strides.data());
+  out[index] = Op{}(a[a_loc], b[b_loc]);
+}
+
+// General (strided) binary kernel for a runtime number of dimensions.
+// One thread per output element: the flat output index is mapped to the
+// locations in |a| and |b| through the shape and the per-input strides.
+template <typename Op, typename In, typename Out, typename IdxT>
+__global__ void binary_g(
+    const In* a,
+    const In* b,
+    Out* out,
+    IdxT size,
+    const __grid_constant__ Shape shape,
+    const __grid_constant__ Strides a_strides,
+    const __grid_constant__ Strides b_strides,
+    int ndim) {
+  IdxT index = cg::this_grid().thread_rank();
+  if (!(index < size)) {
+    // Surplus thread past the end of the output.
+    return;
+  }
+  auto [a_loc, b_loc] = elem_to_loc_4d(
+      index, shape.data(), a_strides.data(), b_strides.data(), ndim);
+  out[index] = Op{}(a[a_loc], b[b_loc]);
+}
+
+// Compile-time table of the (Op, In, Out) combinations for which a binary
+// kernel may be instantiated. Ops that are not listed are unsupported.
+template <typename Op, typename In, typename Out>
+constexpr bool supports_binary_op() {
+  constexpr bool same_type = std::is_same_v<In, Out>;
+  constexpr bool bool_out = std::is_same_v<Out, bool>;
+  constexpr bool bool_in = std::is_same_v<In, bool>;
+
+  // Arithmetic ops keep the input type.
+  if (std::is_same_v<Op, Add> || std::is_same_v<Op, Divide> ||
+      std::is_same_v<Op, Maximum> || std::is_same_v<Op, Minimum> ||
+      std::is_same_v<Op, Multiply> || std::is_same_v<Op, Subtract> ||
+      std::is_same_v<Op, Power> || std::is_same_v<Op, Remainder>) {
+    return same_type;
+  }
+  // Comparisons produce bool for any input type.
+  if (std::is_same_v<Op, Equal> || std::is_same_v<Op, Greater> ||
+      std::is_same_v<Op, GreaterEqual> || std::is_same_v<Op, Less> ||
+      std::is_same_v<Op, LessEqual> || std::is_same_v<Op, NotEqual>) {
+    return bool_out;
+  }
+  // Logical ops are bool to bool.
+  if (std::is_same_v<Op, LogicalAnd> || std::is_same_v<Op, LogicalOr>) {
+    return bool_out && bool_in;
+  }
+  // NaN-aware equality: inexact inputs, bool output.
+  if (std::is_same_v<Op, NaNEqual>) {
+    return bool_out && is_inexact_v<In>;
+  }
+  if (std::is_same_v<Op, LogAddExp>) {
+    return same_type && is_inexact_v<In>;
+  }
+  if (std::is_same_v<Op, ArcTan2>) {
+    return same_type && is_floating_v<In>;
+  }
+  // Bitwise ops: integral types, bool included.
+  if (std::is_same_v<Op, BitwiseAnd> || std::is_same_v<Op, BitwiseOr> ||
+      std::is_same_v<Op, BitwiseXor>) {
+    return same_type && std::is_integral_v<In>;
+  }
+  // Shifts: integral types, bool excluded.
+  if (std::is_same_v<Op, LeftShift> || std::is_same_v<Op, RightShift>) {
+    return same_type && std::is_integral_v<In> && !bool_in;
+  }
+  return false;
+}
+
+} // namespace cu
+
+// Enqueues the binary op |Op| on inputs[0] and inputs[1], writing into |out|.
+//
+// The kernel is selected from the input/output dtypes and from the layout
+// classification returned by get_binary_op_type(). Nothing is enqueued for an
+// empty output.
+//
+// Throws std::runtime_error when |Op| does not support the dtype combination;
+// |op| is only used in that error message.
+template <typename Op>
+void binary_op_gpu_inplace(
+    const std::vector<array>& inputs,
+    array& out,
+    std::string_view op,
+    const Stream& s) {
+  assert(inputs.size() > 1);
+  const auto& a = inputs[0];
+  const auto& b = inputs[1];
+  if (out.size() == 0) {
+    return;
+  }
+
+  auto& encoder = cu::get_command_encoder(s);
+  encoder.set_input_array(a);
+  encoder.set_input_array(b);
+  encoder.set_output_array(out);
+  // Dispatch on the dtype of |a| and of |out|.
+  dispatch_all_types(a.dtype(), [&](auto in_type_tag) {
+    dispatch_all_types(out.dtype(), [&](auto out_type_tag) {
+      using CTYPE_IN = MLX_GET_TYPE(in_type_tag);
+      using CTYPE_OUT = MLX_GET_TYPE(out_type_tag);
+      // Kernels are only instantiated for supported type combinations.
+      if constexpr (cu::supports_binary_op<Op, CTYPE_IN, CTYPE_OUT>()) {
+        using InType = cuda_type_t<CTYPE_IN>;
+        using OutType = cuda_type_t<CTYPE_OUT>;
+        auto bopt = get_binary_op_type(a, b);
+        if (bopt == BinaryOpType::General) {
+          // Strided path: use 64-bit indices when any buffer has more than
+          // INT32_MAX elements.
+          dispatch_bool(
+              a.data_size() > INT32_MAX || b.data_size() > INT32_MAX ||
+                  out.data_size() > INT32_MAX,
+              [&](auto large) {
+                using IdxT = std::conditional_t<large(), int64_t, int32_t>;
+                Shape shape;
+                std::vector<Strides> strides;
+                std::tie(shape, strides) = collapse_contiguous_dims(a, b, out);
+                auto& a_strides = strides[0];
+                auto& b_strides = strides[1];
+                int ndim = shape.size();
+                if (ndim <= 3) {
+                  // Kernel specialized on the number of dimensions.
+                  dispatch_1_2_3(ndim, [&](auto dims_constant) {
+                    auto kernel = cu::
+                        binary_g_nd<Op, InType, OutType, IdxT, dims_constant()>;
+                    auto [num_blocks, block_dims] =
+                        get_launch_args(kernel, out, large());
+                    encoder.add_kernel_node(
+                        kernel,
+                        num_blocks,
+                        block_dims,
+                        a.data<InType>(),
+                        b.data<InType>(),
+                        out.data<OutType>(),
+                        out.size(),
+                        const_param<dims_constant()>(shape),
+                        const_param<dims_constant()>(a_strides),
+                        const_param<dims_constant()>(b_strides));
+                  });
+                } else {
+                  // Kernel taking the number of dimensions at runtime.
+                  auto kernel = cu::binary_g<Op, InType, OutType, IdxT>;
+                  auto [num_blocks, block_dims] =
+                      get_launch_args(kernel, out, large());
+                  encoder.add_kernel_node(
+                      kernel,
+                      num_blocks,
+                      block_dims,
+                      a.data<InType>(),
+                      b.data<InType>(),
+                      out.data<OutType>(),
+                      out.size(),
+                      const_param(shape),
+                      const_param(a_strides),
+                      const_param(b_strides),
+                      ndim);
+                }
+              });
+        } else {
+          // Scalar/vector path: use 64-bit indices when the output has more
+          // than UINT32_MAX elements.
+          dispatch_bool(out.data_size() > UINT32_MAX, [&](auto large) {
+            using IdxT = std::conditional_t<large(), int64_t, uint32_t>;
+            // TODO: Choose optimized value based on type size.
+            constexpr int N_READS = 4;
+            // Start from the scalar-scalar kernel and replace it according to
+            // the op type.
+            auto kernel = cu::binary_ss<Op, InType, OutType, IdxT, N_READS>;
+            if (bopt == BinaryOpType::ScalarVector) {
+              kernel = cu::binary_sv<Op, InType, OutType, IdxT, N_READS>;
+            } else if (bopt == BinaryOpType::VectorScalar) {
+              kernel = cu::binary_vs<Op, InType, OutType, IdxT, N_READS>;
+            } else if (bopt == BinaryOpType::VectorVector) {
+              kernel = cu::binary_vv<Op, InType, OutType, IdxT, N_READS>;
+            }
+            auto [num_blocks, block_dims] = get_launch_args(
+                kernel,
+                out.data_size(),
+                out.shape(),
+                out.strides(),
+                large(),
+                N_READS);
+            encoder.add_kernel_node(
+                kernel,
+                num_blocks,
+                block_dims,
+                a.data<InType>(),
+                b.data<InType>(),
+                out.data<OutType>(),
+                out.data_size());
+          });
+        }
+      } else {
+        throw std::runtime_error(fmt::format(
+            "Can not do binary op {} on inputs of {} with result of {}.",
+            op,
+            dtype_to_string(a.dtype()),
+            dtype_to_string(out.dtype())));
+      }
+    });
+  });
+}
+
+// Out-of-place entry point for single-output element-wise binary ops: sets up
+// `out` with set_binary_op_output_data() and then forwards to
+// binary_op_gpu_inplace<Op>().
+template <typename Op>
+void binary_op_gpu(
+    const std::vector<array>& inputs,
+    array& out,
+    std::string_view op,
+    const Stream& s) {
+  const array& lhs = inputs[0];
+  const array& rhs = inputs[1];
+  // The binary op type of the two operands drives the output setup.
+  set_binary_op_output_data(lhs, rhs, out, get_binary_op_type(lhs, rhs));
+  binary_op_gpu_inplace<Op>(inputs, out, op, s);
+}
+
+// Defines `func::eval_gpu` for a binary primitive `func` whose device functor
+// is `cu::func`: opens an NVTX range named "<func>::eval_gpu", takes the
+// stream from the output's primitive and forwards to binary_op_gpu<cu::func>.
+#define BINARY_GPU(func)                                                 \
+  void func::eval_gpu(const std::vector<array>& inputs, array& out) {    \
+    nvtx3::scoped_range r(#func "::eval_gpu");                           \
+    auto& s = out.primitive().stream();                                  \
+    binary_op_gpu<cu::func>(inputs, out, get_primitive_string(this), s); \
+  }
+
+// Primitives whose eval_gpu is generated by BINARY_GPU.
+BINARY_GPU(Add)
+BINARY_GPU(ArcTan2)
+BINARY_GPU(Divide)
+BINARY_GPU(Remainder)
+BINARY_GPU(Greater)
+BINARY_GPU(GreaterEqual)
+BINARY_GPU(Less)
+BINARY_GPU(LessEqual)
+BINARY_GPU(LogicalAnd)
+BINARY_GPU(LogicalOr)
+BINARY_GPU(LogAddExp)
+BINARY_GPU(Maximum)
+BINARY_GPU(Minimum)
+BINARY_GPU(Multiply)
+BINARY_GPU(NotEqual)
+BINARY_GPU(Power)
+BINARY_GPU(Subtract)
+
+// GPU evaluation of Equal: dispatches to the cu::NaNEqual functor when
+// equal_nan_ is set and to cu::Equal otherwise.
+void Equal::eval_gpu(const std::vector<array>& inputs, array& out) {
+  nvtx3::scoped_range r("Equal::eval_gpu");
+  auto& stream = out.primitive().stream();
+  auto name = get_primitive_string(this);
+  if (!equal_nan_) {
+    binary_op_gpu<cu::Equal>(inputs, out, name, stream);
+    return;
+  }
+  binary_op_gpu<cu::NaNEqual>(inputs, out, name, stream);
+}
+
+// GPU evaluation of BitwiseBinary: selects the device functor that matches
+// op_ and runs it through binary_op_gpu. The switch has no default case, so
+// an op_ value outside the listed enumerators launches nothing.
+void BitwiseBinary::eval_gpu(const std::vector<array>& inputs, array& out) {
+  nvtx3::scoped_range r("BitwiseBinary::eval_gpu");
+  auto& s = out.primitive().stream();
+  // Name of this primitive, forwarded for use in error messages.
+  auto op = get_primitive_string(this);
+  switch (op_) {
+    case BitwiseBinary::And:
+      binary_op_gpu<cu::BitwiseAnd>(inputs, out, op, s);
+      break;
+    case BitwiseBinary::Or:
+      binary_op_gpu<cu::BitwiseOr>(inputs, out, op, s);
+      break;
+    case BitwiseBinary::Xor:
+      binary_op_gpu<cu::BitwiseXor>(inputs, out, op, s);
+      break;
+    case BitwiseBinary::LeftShift:
+      binary_op_gpu<cu::LeftShift>(inputs, out, op, s);
+      break;
+    case BitwiseBinary::RightShift:
+      binary_op_gpu<cu::RightShift>(inputs, out, op, s);
+      break;
+  }
+}
+
+} // namespace mlx::core
--- a/mlx/backend/cuda/binary_two.cu
+++ b/mlx/backend/cuda/binary_two.cu
@@ -0,0 +1,335 @@
+// Copyright © 2025 Apple Inc.
+
+#include "mlx/backend/common/binary.h"
+#include "mlx/backend/cuda/device.h"
+#include "mlx/backend/cuda/device/binary_ops.cuh"
+#include "mlx/backend/cuda/device/cucomplex_math.cuh"
+#include "mlx/backend/cuda/kernel_utils.cuh"
+#include "mlx/dtype_utils.h"
+#include "mlx/primitives.h"
+
+#include <cooperative_groups.h>
+#include <nvtx3/nvtx3.hpp>
+
+namespace mlx::core {
+
+namespace cu {
+
+namespace cg = cooperative_groups;
+
+// Scalar-scalar kernel for binary ops with two outputs: every element is
+// computed from a[0] and b[0]; element [0] of the op result goes to out_a and
+// element [1] to out_b. Each thread covers one chunk of N_READS outputs,
+// numbered by its rank in the grid.
+template <typename Op, typename In, typename Out, typename IdxT, int N_READS>
+__global__ void
+binary_two_ss(const In* a, const In* b, Out* out_a, Out* out_b, IdxT size) {
+  IdxT index = cg::this_grid().thread_rank();
+
+  // The chunk would run past `size`: write whatever is left one element at a
+  // time (the loop is empty for threads that start at or beyond `size`).
+  if ((index + 1) * N_READS > size) {
+    for (IdxT i = index * N_READS; i < size; ++i) {
+      auto out = Op{}(a[0], b[0]);
+      out_a[i] = out[0];
+      out_b[i] = out[1];
+    }
+  } else {
+    // Full chunk: collect N_READS results per output and store each group
+    // with a single store_vector call.
+    AlignedVector<Out, N_READS> out_a_vec;
+    AlignedVector<Out, N_READS> out_b_vec;
+#pragma unroll
+    for (int i = 0; i < N_READS; ++i) {
+      auto out = Op{}(a[0], b[0]);
+      out_a_vec.val[i] = out[0];
+      out_b_vec.val[i] = out[1];
+    }
+
+    store_vector<N_READS>(out_a, index, out_a_vec);
+    store_vector<N_READS>(out_b, index, out_b_vec);
+  }
+}
+
+// Scalar-vector kernel for binary ops with two outputs: the left operand is
+// always a[0], the right operand is read per element from b. Element [0] of
+// the op result goes to out_a and element [1] to out_b. Each thread covers
+// one chunk of N_READS outputs, numbered by its rank in the grid.
+template <typename Op, typename In, typename Out, typename IdxT, int N_READS>
+__global__ void
+binary_two_sv(const In* a, const In* b, Out* out_a, Out* out_b, IdxT size) {
+  IdxT index = cg::this_grid().thread_rank();
+
+  // The chunk would run past `size`: handle the remaining elements one at a
+  // time instead of using the vector load/store helpers.
+  if ((index + 1) * N_READS > size) {
+    for (IdxT i = index * N_READS; i < size; ++i) {
+      auto out = Op{}(a[0], b[i]);
+      out_a[i] = out[0];
+      out_b[i] = out[1];
+    }
+  } else {
+    // Full chunk: load N_READS values of b at once.
+    auto b_vec = load_vector<N_READS>(b, index);
+
+    AlignedVector<Out, N_READS> out_a_vec;
+    AlignedVector<Out, N_READS> out_b_vec;
+#pragma unroll
+    for (int i = 0; i < N_READS; ++i) {
+      auto out = Op{}(a[0], b_vec.val[i]);
+      out_a_vec.val[i] = out[0];
+      out_b_vec.val[i] = out[1];
+    }
+
+    store_vector<N_READS>(out_a, index, out_a_vec);
+    store_vector<N_READS>(out_b, index, out_b_vec);
+  }
+}
+
+// Vector-scalar kernel for binary ops with two outputs: the left operand is
+// read per element from a, the right operand is always b[0]. Element [0] of
+// the op result goes to out_a and element [1] to out_b. Each thread covers
+// one chunk of N_READS outputs, numbered by its rank in the grid.
+template <typename Op, typename In, typename Out, typename IdxT, int N_READS>
+__global__ void
+binary_two_vs(const In* a, const In* b, Out* out_a, Out* out_b, IdxT size) {
+  IdxT index = cg::this_grid().thread_rank();
+
+  // The chunk would run past `size`: handle the remaining elements one at a
+  // time instead of using the vector load/store helpers.
+  if ((index + 1) * N_READS > size) {
+    for (IdxT i = index * N_READS; i < size; ++i) {
+      auto out = Op{}(a[i], b[0]);
+      out_a[i] = out[0];
+      out_b[i] = out[1];
+    }
+  } else {
+    // Full chunk: load N_READS values of a at once.
+    auto a_vec = load_vector<N_READS>(a, index);
+
+    AlignedVector<Out, N_READS> out_a_vec;
+    AlignedVector<Out, N_READS> out_b_vec;
+#pragma unroll
+    for (int i = 0; i < N_READS; ++i) {
+      auto out = Op{}(a_vec.val[i], b[0]);
+      out_a_vec.val[i] = out[0];
+      out_b_vec.val[i] = out[1];
+    }
+
+    store_vector<N_READS>(out_a, index, out_a_vec);
+    store_vector<N_READS>(out_b, index, out_b_vec);
+  }
+}
+
+// Vector-vector kernel for binary ops with two outputs: both operands are
+// read per element at the same flat index. Element [0] of the op result goes
+// to out_a and element [1] to out_b. Each thread covers one chunk of N_READS
+// outputs, numbered by its rank in the grid.
+template <typename Op, typename In, typename Out, typename IdxT, int N_READS>
+__global__ void
+binary_two_vv(const In* a, const In* b, Out* out_a, Out* out_b, IdxT size) {
+  IdxT index = cg::this_grid().thread_rank();
+
+  // The chunk would run past `size`: handle the remaining elements one at a
+  // time instead of using the vector load/store helpers.
+  if ((index + 1) * N_READS > size) {
+    for (IdxT i = index * N_READS; i < size; ++i) {
+      auto out = Op{}(a[i], b[i]);
+      out_a[i] = out[0];
+      out_b[i] = out[1];
+    }
+  } else {
+    // Full chunk: load N_READS values of each operand at once.
+    auto a_vec = load_vector<N_READS>(a, index);
+    auto b_vec = load_vector<N_READS>(b, index);
+
+    AlignedVector<Out, N_READS> out_a_vec;
+    AlignedVector<Out, N_READS> out_b_vec;
+#pragma unroll
+    for (int i = 0; i < N_READS; ++i) {
+      auto out = Op{}(a_vec.val[i], b_vec.val[i]);
+      out_a_vec.val[i] = out[0];
+      out_b_vec.val[i] = out[1];
+    }
+
+    store_vector<N_READS>(out_a, index, out_a_vec);
+    store_vector<N_READS>(out_b, index, out_b_vec);
+  }
+}
+
+// General (strided) kernel for two-output binary ops with a compile-time
+// number of dimensions. One thread per output element: the flat element
+// index is mapped to the two input offsets with elem_to_loc_nd<NDIM>() and
+// both results are written at that flat index.
+template <typename Op, typename In, typename Out, typename IdxT, int NDIM>
+__global__ void binary_two_g_nd(
+    const In* a,
+    const In* b,
+    Out* out_a,
+    Out* out_b,
+    IdxT size,
+    const __grid_constant__ cuda::std::array<int32_t, NDIM> shape,
+    const __grid_constant__ cuda::std::array<int64_t, NDIM> a_strides,
+    const __grid_constant__ cuda::std::array<int64_t, NDIM> b_strides) {
+  IdxT elem = cg::this_grid().thread_rank();
+  // Threads past the end of the output have nothing to do.
+  if (elem >= size) {
+    return;
+  }
+  auto [a_loc, b_loc] = elem_to_loc_nd<NDIM>(
+      elem, shape.data(), a_strides.data(), b_strides.data());
+  auto result = Op{}(a[a_loc], b[b_loc]);
+  out_a[elem] = result[0];
+  out_b[elem] = result[1];
+}
+
+// General (strided) kernel for two-output binary ops where the number of
+// dimensions is a runtime value. One thread per output element: the flat
+// element index is mapped to the two input offsets with elem_to_loc_4d() and
+// both results are written at that flat index.
+template <typename Op, typename In, typename Out, typename IdxT>
+__global__ void binary_two_g(
+    const In* a,
+    const In* b,
+    Out* out_a,
+    Out* out_b,
+    IdxT size,
+    const __grid_constant__ Shape shape,
+    const __grid_constant__ Strides a_strides,
+    const __grid_constant__ Strides b_strides,
+    int ndim) {
+  IdxT elem = cg::this_grid().thread_rank();
+  // Threads past the end of the output have nothing to do.
+  if (elem >= size) {
+    return;
+  }
+  auto [a_loc, b_loc] = elem_to_loc_4d(
+      elem, shape.data(), a_strides.data(), b_strides.data(), ndim);
+  auto result = Op{}(a[a_loc], b[b_loc]);
+  out_a[elem] = result[0];
+  out_b[elem] = result[1];
+}
+
+// Compile-time predicate: true only for DivMod with identical input and
+// output types where that type is integral or satisfies is_floating_v.
+// Every other op is reported as unsupported.
+template <typename Op, typename In, typename Out>
+constexpr bool supports_binary_two_op() {
+  constexpr bool is_divmod = std::is_same_v<Op, DivMod>;
+  constexpr bool same_types = std::is_same_v<In, Out>;
+  constexpr bool numeric_out = std::is_integral_v<Out> || is_floating_v<Out>;
+  return is_divmod && same_types && numeric_out;
+}
+
+} // namespace cu
+
+// Launches the CUDA kernel for a binary op that produces two outputs
+// (outputs[0] and outputs[1]) from inputs[0] and inputs[1].
+//
+// Both outputs are first set up with set_binary_op_output_data(). Nothing is
+// launched when outputs[0] has zero elements. The kernel is chosen from the
+// input and output dtypes, the binary op type reported by
+// get_binary_op_type(), and whether 32-bit indexing is sufficient.
+//
+// Throws std::runtime_error when cu::supports_binary_two_op() rejects the
+// (Op, input type, output type) combination; `op` is only used to build that
+// message.
+template <typename Op>
+void binary_two_op_gpu_inplace(
+    const std::vector<array>& inputs,
+    std::vector<array>& outputs,
+    std::string_view op,
+    const Stream& s) {
+  assert(inputs.size() > 1);
+  const auto& a = inputs[0];
+  const auto& b = inputs[1];
+  auto& out_a = outputs[0];
+  auto& out_b = outputs[1];
+  auto bopt = get_binary_op_type(a, b);
+  set_binary_op_output_data(a, b, out_a, bopt);
+  set_binary_op_output_data(a, b, out_b, bopt);
+
+  // Empty output: nothing to compute.
+  if (out_a.size() == 0) {
+    return;
+  }
+
+  // Register the arrays touched by the kernel with the stream's encoder.
+  auto& encoder = cu::get_command_encoder(s);
+  encoder.set_input_array(a);
+  encoder.set_input_array(b);
+  encoder.set_output_array(out_a);
+  encoder.set_output_array(out_b);
+  // Only a's dtype and out_a's dtype take part in the dispatch; b and out_b
+  // are accessed with the same InType / OutType below.
+  dispatch_all_types(a.dtype(), [&](auto in_type_tag) {
+    dispatch_all_types(out_a.dtype(), [&](auto out_type_tag) {
+      using CTYPE_IN = MLX_GET_TYPE(in_type_tag);
+      using CTYPE_OUT = MLX_GET_TYPE(out_type_tag);
+      if constexpr (cu::supports_binary_two_op<Op, CTYPE_IN, CTYPE_OUT>()) {
+        using InType = cuda_type_t<CTYPE_IN>;
+        using OutType = cuda_type_t<CTYPE_OUT>;
+
+        // Shadows the outer `bopt`; computed from the same arguments.
+        auto bopt = get_binary_op_type(a, b);
+        if (bopt == BinaryOpType::General) {
+          // Strided path: one thread per output element. 64-bit indices are
+          // used as soon as any of the three data sizes exceeds INT32_MAX.
+          dispatch_bool(
+              a.data_size() > INT32_MAX || b.data_size() > INT32_MAX ||
+                  out_a.data_size() > INT32_MAX,
+              [&](auto large) {
+                using IdxT = std::conditional_t<large(), int64_t, int32_t>;
+                Shape shape;
+                std::vector<Strides> strides;
+                std::tie(shape, strides) =
+                    collapse_contiguous_dims(a, b, out_a);
+                // strides[0] and strides[1] are the collapsed strides of a
+                // and b respectively.
+                auto& a_strides = strides[0];
+                auto& b_strides = strides[1];
+                int ndim = shape.size();
+                if (ndim <= 3) {
+                  // Up to three dims: the kernel is specialised on the
+                  // number of dimensions.
+                  dispatch_1_2_3(ndim, [&](auto dims_constant) {
+                    auto kernel = cu::binary_two_g_nd<
+                        Op,
+                        InType,
+                        OutType,
+                        IdxT,
+                        dims_constant()>;
+                    auto [num_blocks, block_dims] =
+                        get_launch_args(kernel, out_a, large());
+                    encoder.add_kernel_node(
+                        kernel,
+                        num_blocks,
+                        block_dims,
+                        a.data<InType>(),
+                        b.data<InType>(),
+                        out_a.data<OutType>(),
+                        out_b.data<OutType>(),
+                        out_a.size(),
+                        const_param<dims_constant()>(shape),
+                        const_param<dims_constant()>(a_strides),
+                        const_param<dims_constant()>(b_strides));
+                  });
+                } else {
+                  // More than three dims: ndim is passed at runtime.
+                  auto kernel = cu::binary_two_g<Op, InType, OutType, IdxT>;
+                  auto [num_blocks, block_dims] =
+                      get_launch_args(kernel, out_a, large());
+                  encoder.add_kernel_node(
+                      kernel,
+                      num_blocks,
+                      block_dims,
+                      a.data<InType>(),
+                      b.data<InType>(),
+                      out_a.data<OutType>(),
+                      out_b.data<OutType>(),
+                      out_a.size(),
+                      const_param(shape),
+                      const_param(a_strides),
+                      const_param(b_strides),
+                      ndim);
+                }
+              });
+        } else {
+          // Non-General path: each thread handles N_READS elements. 64-bit
+          // indices are used only when the output data size exceeds
+          // UINT32_MAX.
+          dispatch_bool(out_a.data_size() > UINT32_MAX, [&](auto large) {
+            using IdxT = std::conditional_t<large(), int64_t, uint32_t>;
+            // TODO: Choose optimized value based on type size.
+            constexpr int N_READS = 4;
+            // Scalar-scalar is the default; it is replaced when bopt names
+            // one of the other three variants.
+            auto kernel = cu::binary_two_ss<Op, InType, OutType, IdxT, N_READS>;
+            if (bopt == BinaryOpType::ScalarVector) {
+              kernel = cu::binary_two_sv<Op, InType, OutType, IdxT, N_READS>;
+            } else if (bopt == BinaryOpType::VectorScalar) {
+              kernel = cu::binary_two_vs<Op, InType, OutType, IdxT, N_READS>;
+            } else if (bopt == BinaryOpType::VectorVector) {
+              kernel = cu::binary_two_vv<Op, InType, OutType, IdxT, N_READS>;
+            }
+            auto [num_blocks, block_dims] = get_launch_args(
+                kernel,
+                out_a.data_size(),
+                out_a.shape(),
+                out_a.strides(),
+                large(),
+                N_READS);
+            encoder.add_kernel_node(
+                kernel,
+                num_blocks,
+                block_dims,
+                a.data<InType>(),
+                b.data<InType>(),
+                out_a.data<OutType>(),
+                out_b.data<OutType>(),
+                out_a.data_size());
+          });
+        }
+      } else {
+        // Unsupported (Op, input type, output type) combination.
+        throw std::runtime_error(fmt::format(
+            "Can not do binary op {} on inputs of {} with result of {}.",
+            op,
+            dtype_to_string(a.dtype()),
+            dtype_to_string(out_a.dtype())));
+      }
+    });
+  });
+}
+
+// Out-of-place entry point for binary ops that produce two outputs.
+//
+// binary_two_op_gpu_inplace<Op>() already calls set_binary_op_output_data()
+// for outputs[0] and outputs[1] before it launches anything, so the outputs
+// are not set up here as well: doing it in both places ran the output setup
+// twice per output on every call.
+//
+// `op` is the primitive name used in error messages; `s` is the stream the
+// kernel is encoded on.
+template <typename Op>
+void binary_two_op_gpu(
+    const std::vector<array>& inputs,
+    std::vector<array>& outputs,
+    std::string_view op,
+    const Stream& s) {
+  binary_two_op_gpu_inplace<Op>(inputs, outputs, op, s);
+}
+
+// GPU evaluation of DivMod: runs the cu::DivMod functor through the
+// two-output binary path on the stream of the first output's primitive.
+void DivMod::eval_gpu(
+    const std::vector<array>& inputs,
+    std::vector<array>& outputs) {
+  nvtx3::scoped_range r("DivMod::eval_gpu");
+  auto& stream = outputs[0].primitive().stream();
+  auto name = get_primitive_string(this);
+  binary_two_op_gpu<cu::DivMod>(inputs, outputs, name, stream);
+}
+
+} // namespace mlx::core
--- a/mlx/backend/cuda/compiled.cpp
+++ b/mlx/backend/cuda/compiled.cpp
@@ -0,0 +1,233 @@
+// Copyright © 2025 Apple Inc.
+
+#include "mlx/backend/common/compiled.h"
+#include "mlx/backend/cuda/device.h"
+#include "mlx/backend/cuda/jit_module.h"
+#include "mlx/backend/cuda/kernel_utils.cuh"
+#include "mlx/graph_utils.h"
+#include "mlx/primitives.h"
+
+#include <fmt/format.h>
+#include <nvtx3/nvtx3.hpp>
+
+namespace mlx::core {
+
+namespace cu {
+
+// Generates the CUDA source text of a fused element-wise kernel.
+//
+// `os` accumulates the generated source. The remaining members are
+// references to the kernel's base name, its input and output arrays, the
+// tape of intermediate arrays to compute, and a predicate telling which
+// input indices are compile-time constants.
+struct FusedKernelBuilder {
+  std::string os;
+  const std::string& kernel_name;
+  const std::vector<array>& inputs;
+  const std::vector<array>& outputs;
+  const std::vector<array>& tape;
+  const std::function<bool(size_t)>& is_constant;
+
+  // Appends one `__global__` function named `kernel_name + name` to `os`.
+  //
+  // With `contiguous` set, inputs are read at the flat thread index and the
+  // kernel is templated on IdxT only. Otherwise the kernel is also templated
+  // on NDIM, takes a shape plus per-input strides, and non-scalar inputs are
+  // read through elem_to_loc_nd<NDIM>().
+  void build(const char* name, bool contiguous) {
+    NodeNamer namer;
+
+    // Function parameters.
+    // Order: each non-constant input (followed by its strides when it is not
+    // a scalar and the kernel is strided), every output, the shape (strided
+    // only), then the element count.
+    std::vector<std::string> params;
+    for (size_t i = 0; i < inputs.size(); ++i) {
+      if (is_constant(i)) {
+        continue;
+      }
+      const auto& x = inputs[i];
+      const std::string& xname = namer.get_name(x);
+      params.push_back(
+          fmt::format("const {}* {}", dtype_to_cuda_type(x.dtype()), xname));
+      if (!is_scalar(x) && !contiguous) {
+        params.push_back(fmt::format(
+            "const __grid_constant__ cuda::std::array<int64_t, NDIM> {}_strides",
+            xname));
+      }
+    }
+    for (const auto& x : outputs) {
+      params.push_back(fmt::format(
+          "{}* {}", dtype_to_cuda_type(x.dtype()), namer.get_name(x)));
+    }
+    if (!contiguous) {
+      params.push_back(
+          "const __grid_constant__ cuda::std::array<int32_t, NDIM> shape");
+    }
+    params.push_back("IdxT size");
+
+    // Build function signature.
+    if (contiguous) {
+      os += "template <typename IdxT = uint32_t>\n";
+    } else {
+      os += "template <int NDIM, typename IdxT = uint32_t>\n";
+    }
+    os += fmt::format("__global__ void {}(\n", kernel_name + name);
+    // One parameter per line, comma-separated, no trailing comma.
+    for (size_t i = 0; i < params.size(); ++i) {
+      os += "    ";
+      os += params[i];
+      if (i != params.size() - 1) {
+        os += ",\n";
+      }
+    }
+    os += ") {\n";
+
+    // Index.
+    // The generated kernel returns early for threads at or past `size`.
+    os +=
+        "  IdxT index = cg::this_grid().thread_rank();\n"
+        "  if (index >= size) {\n"
+        "    return;\n"
+        "  }\n";
+
+    // Read inputs.
+    // Every input becomes a local `tmp_<name>`: constants are emitted inline,
+    // scalars read element 0, contiguous inputs read at `index`, and strided
+    // inputs read at the location computed from shape and strides.
+    for (size_t i = 0; i < inputs.size(); ++i) {
+      const auto& x = inputs[i];
+      const std::string& xname = namer.get_name(x);
+      std::string type = dtype_to_cuda_type(x.dtype());
+      std::string value;
+      if (is_constant(i)) {
+        std::ostringstream ss;
+        print_constant(ss, x);
+        value = fmt::format("static_cast<{}>({})", type, ss.str());
+      } else if (is_scalar(x)) {
+        value = fmt::format("{}[0]", xname);
+      } else if (contiguous) {
+        value = fmt::format("{}[index]", xname);
+      } else {
+        std::string index = fmt::format(
+            "elem_to_loc_nd<NDIM>(index, shape.data(), {}_strides.data())",
+            xname);
+        value = fmt::format("{}[{}]", xname, index);
+      }
+      os += fmt::format("  {} tmp_{} = {};\n", type, xname, value);
+    }
+
+    // Write tape.
+    // Each tape entry becomes `tmp_<name>`: either a static_cast of its first
+    // input, or `<printed primitive>{}(tmp_<in0>, ..., tmp_<inN>)`.
+    for (const auto& x : tape) {
+      const std::string& xname = namer.get_name(x);
+      std::string type = dtype_to_cuda_type(x.dtype());
+      std::string value;
+      if (is_static_cast(x.primitive())) {
+        value = fmt::format(
+            "static_cast<{}>(tmp_{})", type, namer.get_name(x.inputs()[0]));
+      } else {
+        std::ostringstream ss;
+        x.primitive().print(ss);
+        value = ss.str();
+        value += "{}(";
+        // NOTE(review): size() - 1 and back() assume the tape entry has at
+        // least one input -- confirm this holds for every fused primitive.
+        for (size_t i = 0; i < x.inputs().size() - 1; ++i) {
+          value += fmt::format("tmp_{}, ", namer.get_name(x.inputs()[i]));
+        }
+        value += fmt::format("tmp_{})", namer.get_name(x.inputs().back()));
+      }
+      os += fmt::format("  {} tmp_{} = {};\n", type, xname, value);
+    }
+
+    // Write output.
+    // Outputs are always stored at the flat `index`.
+    for (const auto& x : outputs) {
+      os += fmt::format("  {0}[index] = tmp_{0};\n", namer.get_name(x));
+    }
+
+    os += "}\n";
+  }
+};
+
+} // namespace cu
+
+// Preamble placed at the start of every generated fused-kernel source: the
+// device op headers, cooperative groups, and an `inf` macro.
+// NOTE(review): `inf` is presumably the token print_constant() emits for
+// infinite constants -- confirm.
+constexpr const char* g_jit_includes = R"(
+#include "mlx/backend/cuda/device/binary_ops.cuh"
+#include "mlx/backend/cuda/device/ternary_ops.cuh"
+#include "mlx/backend/cuda/device/unary_ops.cuh"
+#include "mlx/backend/cuda/device/utils.cuh"
+
+#include <cooperative_groups.h>
+
+#define inf cuda::std::numeric_limits<float>::infinity()
+)";
+
+// GPU evaluation of a Compiled (fused) primitive.
+//
+// Obtains the JIT module for lib_name() (supplying a callback that builds the
+// source and the list of kernel instantiations), collapses contiguous
+// dimensions, packs the kernel arguments in the order the generated signature
+// expects, and launches the contiguous or strided instantiation with 32- or
+// 64-bit indexing.
+void Compiled::eval_gpu(
+    const std::vector<array>& inputs,
+    std::vector<array>& outputs) {
+  nvtx3::scoped_range r("Compiled::eval_gpu");
+  auto& s = stream();
+
+  // The callback returns the source text together with the kernel names.
+  // NOTE(review): presumably it only runs when the module is not cached yet
+  // -- confirm against get_jit_module.
+  cu::JitModule& mod = cu::get_jit_module(s.device, lib_name(), [&]() {
+    // Build source code.
+    cu::FusedKernelBuilder builder{
+        g_jit_includes, lib_name(), inputs_, outputs_, tape_, is_constant_};
+    builder.os +=
+        "namespace mlx::core::cu {\n\n"
+        "namespace cg = cooperative_groups;\n\n";
+    builder.build("_contiguous", true);
+    builder.os += "\n";
+    builder.build("_strided", false);
+    builder.os += "\n} // namespace mlx::core::cu\n";
+    // Build kernel names.
+    // Contiguous: one per index type. Strided: one per (NDIM, index type)
+    // for NDIM in 1..MAX_NDIM.
+    std::vector<std::string> kernel_names = {
+        fmt::format("mlx::core::cu::{}_contiguous<uint32_t>", lib_name()),
+        fmt::format("mlx::core::cu::{}_contiguous<int64_t>", lib_name()),
+    };
+    for (int i = 1; i <= MAX_NDIM; ++i) {
+      kernel_names.push_back(fmt::format(
+          "mlx::core::cu::{}_strided<{}, uint32_t>", lib_name(), i));
+      kernel_names.push_back(
+          fmt::format("mlx::core::cu::{}_strided<{}, int64_t>", lib_name(), i));
+    }
+    return std::make_pair(std::move(builder.os), std::move(kernel_names));
+  });
+
+  // Collapse contiguous dims to route to a faster kernel if possible. Also
+  // handle all broadcasting.
+  auto [contiguous, shape, strides_vec] =
+      compiled_collapse_contiguous_dims(inputs, outputs[0], is_constant_);
+
+  // Whether to use large index.
+  bool large = compiled_use_large_index(inputs, outputs, contiguous);
+
+  cu::KernelArgs args;
+  // Put inputs.
+  // Constant inputs are baked into the source and take no argument slot.
+  // NOTE(review): strides_index starts at 1, presumably because
+  // strides_vec[0] belongs to the output -- confirm against
+  // compiled_collapse_contiguous_dims.
+  int strides_index = 1;
+  for (size_t i = 0; i < inputs.size(); ++i) {
+    if (is_constant_(i)) {
+      continue;
+    }
+    const auto& x = inputs[i];
+    args.append(x);
+    if (!contiguous && !is_scalar(x)) {
+      args.append_ptr(strides_vec[strides_index++].data());
+    }
+  }
+
+  // Put outputs.
+  compiled_allocate_outputs(inputs, outputs, is_constant_, contiguous);
+  for (auto& x : outputs) {
+    args.append(x);
+  }
+
+  // Put shape and size.
+  if (!contiguous) {
+    args.append_ptr(shape.data());
+  }
+  // The width of `size` must match the IdxT of the selected instantiation.
+  if (large) {
+    args.append<int64_t>(outputs[0].data_size());
+  } else {
+    args.append<uint32_t>(outputs[0].data_size());
+  }
+
+  // Launch kernel.
+  // The name must be one of the instantiations registered in the callback.
+  const char* index_type = large ? "int64_t" : "uint32_t";
+  std::string kernel_name = fmt::format("mlx::core::cu::{}", lib_name());
+  if (contiguous) {
+    kernel_name += fmt::format("_contiguous<{}>", index_type);
+  } else {
+    kernel_name += fmt::format("_strided<{}, {}>", shape.size(), index_type);
+  }
+  auto& encoder = cu::get_command_encoder(s);
+  for (const auto& in : inputs) {
+    encoder.set_input_array(in);
+  }
+  for (const auto& out : outputs) {
+    encoder.set_output_array(out);
+  }
+
+  auto kernel = mod.get_kernel(kernel_name);
+  auto [num_blocks, block_dims] = get_launch_args(kernel, outputs[0], large);
+  encoder.add_kernel_node(kernel, num_blocks, block_dims, args.args());
+}
+
+} // namespace mlx::core
--- a/mlx/backend/cuda/copy.cu
+++ b/mlx/backend/cuda/copy.cu
@@ -0,0 +1,87 @@
+// Copyright © 2025 Apple Inc.
+
+#include "mlx/backend/common/utils.h"
+#include "mlx/backend/cuda/copy/copy.cuh"
+
+namespace mlx::core {
+
+// Copies `in` into the already-set-up `out` on stream `s`, choosing the
+// kernel family from `ctype`:
+//   - Scalar / Vector          -> copy_contiguous
+//   - General                  -> copy_general_input (input strides only)
+//   - GeneralGeneral           -> copy_general, or copy_general_dynamic when
+//                                 at least one dynamic offset is supplied
+// `offset_in` / `offset_out` are forwarded to the selected copy routine. Does
+// nothing when `out` has zero elements; any other `ctype` value falls through
+// without launching a kernel.
+void copy_gpu_inplace(
+    const array& in,
+    array& out,
+    const Shape& shape,
+    const Strides& strides_in,
+    const Strides& strides_out,
+    int64_t offset_in,
+    int64_t offset_out,
+    CopyType ctype,
+    const Stream& s,
+    const std::optional<array>& dynamic_offset_in,
+    const std::optional<array>& dynamic_offset_out) {
+  if (out.size() == 0) {
+    return;
+  }
+
+  auto& encoder = cu::get_command_encoder(s);
+  encoder.set_input_array(in);
+  encoder.set_output_array(out);
+  if (ctype == CopyType::Scalar || ctype == CopyType::Vector) {
+    copy_contiguous(encoder, ctype, in, out, offset_in, offset_out);
+    return;
+  }
+
+  if (ctype == CopyType::General || ctype == CopyType::GeneralGeneral) {
+    // Collapse the input and output strides together so both kernels index
+    // with the same collapsed shape.
+    // NOTE(review): INT32_MAX is presumably the size limit of a collapsed
+    // dimension -- confirm against collapse_contiguous_dims.
+    auto [shape_collapsed, strides_vec] = collapse_contiguous_dims(
+        shape, std::vector{strides_in, strides_out}, INT32_MAX);
+    if (ctype == CopyType::General) {
+      copy_general_input(
+          encoder,
+          ctype,
+          in,
+          out,
+          offset_in,
+          offset_out,
+          shape_collapsed,
+          strides_vec[0]);
+    } else {
+      if (dynamic_offset_in || dynamic_offset_out) {
+        // A missing dynamic offset is replaced by an int64 array holding 0.
+        copy_general_dynamic(
+            encoder,
+            ctype,
+            in,
+            out,
+            offset_in,
+            offset_out,
+            shape_collapsed,
+            strides_vec[0],
+            strides_vec[1],
+            dynamic_offset_in ? *dynamic_offset_in : array(0, int64),
+            dynamic_offset_out ? *dynamic_offset_out : array(0, int64));
+      } else {
+        copy_general(
+            encoder,
+            ctype,
+            in,
+            out,
+            offset_in,
+            offset_out,
+            shape_collapsed,
+            strides_vec[0],
+            strides_vec[1]);
+      }
+    }
+    return;
+  }
+}
+
+// Allocates `out` and fills it through the CopyType::Scalar contiguous copy
+// with `in` as the source. Does nothing (and allocates nothing) when `out`
+// has zero elements.
+void fill_gpu(const array& in, array& out, const Stream& s) {
+  if (out.size() != 0) {
+    out.set_data(allocator::malloc(out.nbytes()));
+    auto& enc = cu::get_command_encoder(s);
+    enc.set_input_array(in);
+    enc.set_output_array(out);
+    copy_contiguous(enc, CopyType::Scalar, in, out, 0, 0);
+  }
+}
+
+} // namespace mlx::core
--- a/mlx/backend/cuda/copy/copy.cuh
+++ b/mlx/backend/cuda/copy/copy.cuh
@@ -0,0 +1,55 @@
+// Copyright © 2025 Apple Inc.
+
+#pragma once
+
+#include "mlx/backend/cuda/device.h"
+#include "mlx/backend/cuda/device/cast_op.cuh"
+#include "mlx/backend/cuda/kernel_utils.cuh"
+#include "mlx/backend/gpu/copy.h"
+#include "mlx/dtype_utils.h"
+
+namespace mlx::core {
+
+// Contiguous copy used for CopyType::Scalar and CopyType::Vector.
+// `offset_in` / `offset_out` are element offsets added to the data pointers
+// of `in` and `out`.
+void copy_contiguous(
+    cu::CommandEncoder& encoder,
+    CopyType ctype,
+    const array& in,
+    array& out,
+    int64_t offset_in,
+    int64_t offset_out);
+
+// Strided copy where both the source and the destination are addressed
+// through `shape` with their own strides (`strides_in`, `strides_out`).
+void copy_general(
+    cu::CommandEncoder& encoder,
+    CopyType ctype,
+    const array& in,
+    array& out,
+    int64_t offset_in,
+    int64_t offset_out,
+    const Shape& shape,
+    const Strides& strides_in,
+    const Strides& strides_out);
+
+// Same parameters as copy_general plus two arrays carrying additional
+// offsets.
+// NOTE(review): the offsets are presumably read on the device at kernel run
+// time -- confirm in copy_general_dynamic.cu.
+void copy_general_dynamic(
+    cu::CommandEncoder& encoder,
+    CopyType ctype,
+    const array& in,
+    array& out,
+    int64_t offset_in,
+    int64_t offset_out,
+    const Shape& shape,
+    const Strides& strides_in,
+    const Strides& strides_out,
+    const array& dynamic_offset_in,
+    const array& dynamic_offset_out);
+
+// Strided copy that takes strides for the input only.
+// NOTE(review): presumably the output is written contiguously -- confirm in
+// the definition.
+void copy_general_input(
+    cu::CommandEncoder& encoder,
+    CopyType ctype,
+    const array& in,
+    array& out,
+    int64_t offset_in,
+    int64_t offset_out,
+    const Shape& shape,
+    const Strides& strides_in);
+
+} // namespace mlx::core
--- a/mlx/backend/cuda/copy/copy_contiguous.cu
+++ b/mlx/backend/cuda/copy/copy_contiguous.cu
@@ -0,0 +1,93 @@
+// Copyright © 2025 Apple Inc.
+
+#include "mlx/backend/cuda/copy/copy.cuh"
+
+#include <cooperative_groups.h>
+
+namespace mlx::core {
+
+namespace cu {
+
+namespace cg = cooperative_groups;
+
+// Scalar copy kernel: writes cast_to<Out>(in[0]) to every one of the `size`
+// output elements. Each thread covers one chunk of N_READS outputs, numbered
+// by its rank in the grid.
+template <typename In, typename Out, typename IdxT, int N_READS>
+__global__ void copy_s(const In* in, Out* out, IdxT size) {
+  IdxT index = cg::this_grid().thread_rank();
+
+  // The chunk would run past `size`: write the remaining elements one at a
+  // time (the loop is empty for threads that start at or beyond `size`).
+  if ((index + 1) * N_READS > size) {
+    for (IdxT i = index * N_READS; i < size; ++i) {
+      out[i] = cast_to<Out>(in[0]);
+    }
+  } else {
+    // Full chunk: fill a vector with the converted scalar and store it with
+    // a single store_vector call.
+    AlignedVector<Out, N_READS> out_vec;
+#pragma unroll
+    for (int i = 0; i < N_READS; ++i) {
+      out_vec.val[i] = cast_to<Out>(in[0]);
+    }
+
+    store_vector<N_READS>(out, index, out_vec);
+  }
+}
+
+// Vector copy kernel: out[i] = cast_to<Out>(in[i]) for the `size` elements.
+// Each thread covers one chunk of N_READS elements, numbered by its rank in
+// the grid.
+template <typename In, typename Out, typename IdxT, int N_READS>
+__global__ void copy_v(const In* in, Out* out, IdxT size) {
+  IdxT index = cg::this_grid().thread_rank();
+
+  // The chunk would run past `size`: copy the remaining elements one at a
+  // time instead of using the vector load/store helpers.
+  if ((index + 1) * N_READS > size) {
+    for (IdxT i = index * N_READS; i < size; ++i) {
+      out[i] = cast_to<Out>(in[i]);
+    }
+  } else {
+    // Full chunk: one vector load, element-wise conversion, one vector store.
+    auto in_vec = load_vector<N_READS>(in, index);
+
+    AlignedVector<Out, N_READS> out_vec;
+#pragma unroll
+    for (int i = 0; i < N_READS; ++i) {
+      out_vec.val[i] = cast_to<Out>(in_vec.val[i]);
+    }
+
+    store_vector<N_READS>(out, index, out_vec);
+  }
+}
+
+} // namespace cu
+
+// Launches the contiguous copy kernel for `in` -> `out`.
+//
+// cu::copy_v is used when `ctype` is CopyType::Vector and cu::copy_s for
+// every other value. The element type pair comes from the two dtypes, and
+// 64-bit indexing is selected only when out.data_size() exceeds UINT32_MAX.
+// `in_offset` / `out_offset` are element offsets applied to the typed data
+// pointers.
+void copy_contiguous(
+    cu::CommandEncoder& encoder,
+    CopyType ctype,
+    const array& in,
+    array& out,
+    int64_t in_offset,
+    int64_t out_offset) {
+  dispatch_all_types(in.dtype(), [&](auto in_type_tag) {
+    dispatch_all_types(out.dtype(), [&](auto out_type_tag) {
+      dispatch_bool(out.data_size() > UINT32_MAX, [&](auto large) {
+        using InType = cuda_type_t<MLX_GET_TYPE(in_type_tag)>;
+        using OutType = cuda_type_t<MLX_GET_TYPE(out_type_tag)>;
+        using IdxT = std::conditional_t<large(), int64_t, uint32_t>;
+        // TODO: Choose optimized value based on type size.
+        constexpr int N_READS = 4;
+        // Scalar kernel by default; replaced for vector copies.
+        auto kernel = cu::copy_s<InType, OutType, IdxT, N_READS>;
+        if (ctype == CopyType::Vector) {
+          kernel = cu::copy_v<InType, OutType, IdxT, N_READS>;
+        }
+        // N_READS is passed so the launch accounts for the elements handled
+        // per thread.
+        auto [num_blocks, block_dims] = get_launch_args(
+            kernel,
+            out.data_size(),
+            out.shape(),
+            out.strides(),
+            large(),
+            N_READS);
+        encoder.add_kernel_node(
+            kernel,
+            num_blocks,
+            block_dims,
+            in.data<InType>() + in_offset,
+            out.data<OutType>() + out_offset,
+            out.data_size());
+      });
+    });
+  });
+}
+
+} // namespace mlx::core
--- a/mlx/backend/cuda/copy/copy_general.cu
+++ b/mlx/backend/cuda/copy/copy_general.cu
@@ -0,0 +1,110 @@
+// Copyright © 2025 Apple Inc.
+
+#include "mlx/backend/cuda/copy/copy.cuh"
+
+#include <cooperative_groups.h>
+
+namespace mlx::core {
+
+namespace cu {
+
+namespace cg = cooperative_groups;
+
+// General-to-general copy kernel with a compile-time number of dimensions.
+// One thread per element: the flat element index is mapped to a source and a
+// destination offset with elem_to_loc_nd<NDIM>() and the converted value is
+// stored at the destination offset.
+template <typename In, typename Out, typename IdxT, int NDIM>
+__global__ void copy_gg_nd(
+    const In* in,
+    Out* out,
+    IdxT size,
+    const __grid_constant__ cuda::std::array<int32_t, NDIM> shape,
+    const __grid_constant__ cuda::std::array<int64_t, NDIM> strides_in,
+    const __grid_constant__ cuda::std::array<int64_t, NDIM> strides_out) {
+  IdxT elem = cg::this_grid().thread_rank();
+  // Threads past the last element have nothing to copy.
+  if (elem >= size) {
+    return;
+  }
+  auto [src_loc, dst_loc] = elem_to_loc_nd<NDIM>(
+      elem, shape.data(), strides_in.data(), strides_out.data());
+  out[dst_loc] = CastOp<In, Out>{}(in[src_loc]);
+}
+
+// General-to-general copy kernel where the number of dimensions is a runtime
+// value. One thread per element: the flat element index is mapped to a
+// source and a destination offset with elem_to_loc_4d() and the converted
+// value is stored at the destination offset.
+template <typename In, typename Out, typename IdxT>
+__global__ void copy_gg(
+    const In* in,
+    Out* out,
+    IdxT size,
+    const __grid_constant__ Shape shape,
+    const __grid_constant__ Strides strides_in,
+    const __grid_constant__ Strides strides_out,
+    int ndim) {
+  IdxT elem = cg::this_grid().thread_rank();
+  // Threads past the last element have nothing to copy.
+  if (elem >= size) {
+    return;
+  }
+  auto [src_loc, dst_loc] = elem_to_loc_4d(
+      elem, shape.data(), strides_in.data(), strides_out.data(), ndim);
+  out[dst_loc] = CastOp<In, Out>{}(in[src_loc]);
+}
+
+} // namespace cu
+
+// Launches the general-to-general copy kernel: every element described by
+// `shape` is read from `in` through `strides_in` and written to `out` through
+// `strides_out`, with conversion between the two dtypes.
+//
+// `offset_in` / `offset_out` are element offsets applied to the typed data
+// pointers. 64-bit indexing is used when either array's data_size() exceeds
+// INT32_MAX. `ctype` is not used in this function.
+void copy_general(
+    cu::CommandEncoder& encoder,
+    CopyType ctype,
+    const array& in,
+    array& out,
+    int64_t offset_in,
+    int64_t offset_out,
+    const Shape& shape,
+    const Strides& strides_in,
+    const Strides& strides_out) {
+  dispatch_all_types(in.dtype(), [&](auto in_type_tag) {
+    dispatch_all_types(out.dtype(), [&](auto out_type_tag) {
+      dispatch_bool(
+          in.data_size() > INT32_MAX || out.data_size() > INT32_MAX,
+          [&](auto large) {
+            using InType = cuda_type_t<MLX_GET_TYPE(in_type_tag)>;
+            using OutType = cuda_type_t<MLX_GET_TYPE(out_type_tag)>;
+            using IdxT = std::conditional_t<large(), int64_t, int32_t>;
+            const InType* in_ptr = in.data<InType>() + offset_in;
+            OutType* out_ptr = out.data<OutType>() + offset_out;
+            int ndim = shape.size();
+            // Number of elements to copy is the product of `shape`, not the
+            // size of either array.
+            size_t data_size = 1;
+            for (auto& s : shape)
+              data_size *= s;
+            // NOTE(review): get_launch_args below receives the `shape`
+            // argument together with out.strides(); if the caller passes a
+            // collapsed shape the two may differ in rank -- confirm that
+            // get_launch_args tolerates this.
+            if (ndim <= 3) {
+              // Up to three dims: kernel specialised on the dimension count.
+              dispatch_1_2_3(ndim, [&](auto ndim_constant) {
+                auto kernel =
+                    cu::copy_gg_nd<InType, OutType, IdxT, ndim_constant()>;
+                auto [num_blocks, block_dims] = get_launch_args(
+                    kernel, data_size, shape, out.strides(), large());
+                encoder.add_kernel_node(
+                    kernel,
+                    num_blocks,
+                    block_dims,
+                    in_ptr,
+                    out_ptr,
+                    data_size,
+                    const_param<ndim_constant()>(shape),
+                    const_param<ndim_constant()>(strides_in),
+                    const_param<ndim_constant()>(strides_out));
+              });
+            } else { // ndim >= 4
+              auto kernel = cu::copy_gg<InType, OutType, IdxT>;
+              auto [num_blocks, block_dims] = get_launch_args(
+                  kernel, data_size, shape, out.strides(), large());
+              encoder.add_kernel_node(
+                  kernel,
+                  num_blocks,
+                  block_dims,
+                  in_ptr,
+                  out_ptr,
+                  data_size,
+                  const_param(shape),
+                  const_param(strides_in),
+                  const_param(strides_out),
+                  ndim);
+            }
+          });
+    });
+  });
+}
+
+} // namespace mlx::core
--- a/mlx/backend/cuda/copy/copy_general_dynamic.cu
+++ b/mlx/backend/cuda/copy/copy_general_dynamic.cu
@@ -0,0 +1,117 @@
+// Copyright © 2025 Apple Inc.
+
+#include "mlx/backend/cuda/copy/copy.cuh"
+
+#include <cooperative_groups.h>
+
+namespace mlx::core {
+
+namespace cu {
+
+namespace cg = cooperative_groups;
+
+// Strided-to-strided element copy for a compile-time rank NDIM. On top of the
+// strided locations, the input and output positions are shifted by the values
+// that `offset_in` and `offset_out` point to.
+template <typename In, typename Out, typename IdxT, int NDIM>
+__global__ void copy_gg_dynamic_nd(
+    const In* in,
+    Out* out,
+    IdxT size,
+    const __grid_constant__ cuda::std::array<int32_t, NDIM> shape,
+    const __grid_constant__ cuda::std::array<int64_t, NDIM> strides_in,
+    const __grid_constant__ cuda::std::array<int64_t, NDIM> strides_out,
+    const int64_t* offset_in,
+    const int64_t* offset_out) {
+  IdxT elem = cg::this_grid().thread_rank();
+  // Threads whose rank falls outside the element range do nothing.
+  if (elem >= size) {
+    return;
+  }
+  auto [src_loc, dst_loc] = elem_to_loc_nd<NDIM>(
+      elem, shape.data(), strides_in.data(), strides_out.data());
+  out[dst_loc + *offset_out] = CastOp<In, Out>{}(in[src_loc + *offset_in]);
+}
+
+// Strided-to-strided element copy for a rank given at run time (`ndim`). On
+// top of the strided locations, the input and output positions are shifted by
+// the values that `offset_in` and `offset_out` point to.
+template <typename In, typename Out, typename IdxT>
+__global__ void copy_gg_dynamic(
+    const In* in,
+    Out* out,
+    IdxT size,
+    const __grid_constant__ Shape shape,
+    const __grid_constant__ Strides strides_in,
+    const __grid_constant__ Strides strides_out,
+    int ndim,
+    const int64_t* offset_in,
+    const int64_t* offset_out) {
+  IdxT elem = cg::this_grid().thread_rank();
+  // Threads whose rank falls outside the element range do nothing.
+  if (elem >= size) {
+    return;
+  }
+  auto [src_loc, dst_loc] = elem_to_loc_4d(
+      elem, shape.data(), strides_in.data(), strides_out.data(), ndim);
+  out[dst_loc + *offset_out] = CastOp<In, Out>{}(in[src_loc + *offset_in]);
+}
+
+} // namespace cu
+
+// Add a kernel node to `encoder` that copies `in` to `out` using the given
+// shape and strides, with extra offsets supplied through `dynamic_offset_in`
+// and `dynamic_offset_out`, which are passed to the kernel as pointers to
+// int64_t values.
+//
+// The kernel is selected by input dtype, output dtype and index width: 64-bit
+// indices are used when either array's data_size() exceeds INT32_MAX. Shapes
+// with at most 3 dimensions use the fixed-rank kernel, all others use the
+// kernel that takes `ndim` at run time. `offset_in` / `offset_out` are applied
+// to the data pointers here, before the launch.
+//
+// Note: `ctype` is not read by this function.
+void copy_general_dynamic(
+    cu::CommandEncoder& encoder,
+    CopyType ctype,
+    const array& in,
+    array& out,
+    int64_t offset_in,
+    int64_t offset_out,
+    const Shape& shape,
+    const Strides& strides_in,
+    const Strides& strides_out,
+    const array& dynamic_offset_in,
+    const array& dynamic_offset_out) {
+  dispatch_all_types(in.dtype(), [&](auto in_type_tag) {
+    dispatch_all_types(out.dtype(), [&](auto out_type_tag) {
+      dispatch_bool(
+          in.data_size() > INT32_MAX || out.data_size() > INT32_MAX,
+          [&](auto large) {
+            using InType = cuda_type_t<MLX_GET_TYPE(in_type_tag)>;
+            using OutType = cuda_type_t<MLX_GET_TYPE(out_type_tag)>;
+            using IdxT = std::conditional_t<large(), int64_t, int32_t>;
+            // Static offsets are folded into the base pointers.
+            const InType* in_ptr = in.data<InType>() + offset_in;
+            OutType* out_ptr = out.data<OutType>() + offset_out;
+            int ndim = shape.size();
+            if (ndim <= 3) {
+              // Rank is a template argument of the kernel.
+              dispatch_1_2_3(ndim, [&](auto dims_constant) {
+                auto kernel = cu::
+                    copy_gg_dynamic_nd<InType, OutType, IdxT, dims_constant()>;
+                auto [num_blocks, block_dims] =
+                    get_launch_args(kernel, out, large());
+                encoder.add_kernel_node(
+                    kernel,
+                    num_blocks,
+                    block_dims,
+                    in_ptr,
+                    out_ptr,
+                    out.size(),
+                    const_param<dims_constant()>(shape),
+                    const_param<dims_constant()>(strides_in),
+                    const_param<dims_constant()>(strides_out),
+                    dynamic_offset_in.data<int64_t>(),
+                    dynamic_offset_out.data<int64_t>());
+              });
+            } else { // ndim >= 4
+              // Rank is passed to the kernel as a regular argument.
+              auto kernel = cu::copy_gg_dynamic<InType, OutType, IdxT>;
+              auto [num_blocks, block_dims] =
+                  get_launch_args(kernel, out, large());
+              encoder.add_kernel_node(
+                  kernel,
+                  num_blocks,
+                  block_dims,
+                  in_ptr,
+                  out_ptr,
+                  out.size(),
+                  const_param(shape),
+                  const_param(strides_in),
+                  const_param(strides_out),
+                  ndim,
+                  dynamic_offset_in.data<int64_t>(),
+                  dynamic_offset_out.data<int64_t>());
+            }
+          });
+    });
+  });
+}
+
+} // namespace mlx::core
--- a/mlx/backend/cuda/copy/copy_general_input.cu
+++ b/mlx/backend/cuda/copy/copy_general_input.cu
@@ -0,0 +1,100 @@
+// Copyright © 2025 Apple Inc.
+
+#include "mlx/backend/cuda/copy/copy.cuh"
+
+#include <cooperative_groups.h>
+
+namespace mlx::core {
+
+namespace cu {
+
+namespace cg = cooperative_groups;
+
+// Copy with a strided input and a linearly indexed output, for a compile-time
+// rank NDIM: element `i` of `out` receives the input element located through
+// `shape` and `strides_in`.
+template <typename In, typename Out, typename IdxT, int NDIM>
+__global__ void copy_g_nd(
+    const In* in,
+    Out* out,
+    IdxT size,
+    const __grid_constant__ cuda::std::array<int32_t, NDIM> shape,
+    const __grid_constant__ cuda::std::array<int64_t, NDIM> strides_in) {
+  IdxT elem = cg::this_grid().thread_rank();
+  // Threads whose rank falls outside the element range do nothing.
+  if (elem >= size) {
+    return;
+  }
+  IdxT src_loc = elem_to_loc_nd<NDIM>(elem, shape.data(), strides_in.data());
+  out[elem] = CastOp<In, Out>{}(in[src_loc]);
+}
+
+// Copy with a strided input and a linearly indexed output, for a rank given
+// at run time (`ndim`): element `i` of `out` receives the input element
+// located through `shape` and `strides_in`.
+template <typename In, typename Out, typename IdxT>
+__global__ void copy_g(
+    const In* in,
+    Out* out,
+    IdxT size,
+    const __grid_constant__ Shape shape,
+    const __grid_constant__ Strides strides_in,
+    int ndim) {
+  IdxT elem = cg::this_grid().thread_rank();
+  // Threads whose rank falls outside the element range do nothing.
+  if (elem >= size) {
+    return;
+  }
+  IdxT src_loc = elem_to_loc_4d(elem, shape.data(), strides_in.data(), ndim);
+  out[elem] = CastOp<In, Out>{}(in[src_loc]);
+}
+
+} // namespace cu
+
+// Add a kernel node to `encoder` that copies a strided `in` into `out`, where
+// the output is indexed linearly by element number.
+//
+// The kernel is selected by input dtype, output dtype and index width: 64-bit
+// indices are used when either array's data_size() exceeds INT32_MAX. Shapes
+// with at most 3 dimensions use the fixed-rank kernel, all others use the
+// kernel that takes `ndim` at run time. `offset_in` / `offset_out` are applied
+// to the data pointers here, before the launch.
+//
+// Note: `ctype` is not read by this function.
+void copy_general_input(
+    cu::CommandEncoder& encoder,
+    CopyType ctype,
+    const array& in,
+    array& out,
+    int64_t offset_in,
+    int64_t offset_out,
+    const Shape& shape,
+    const Strides& strides_in) {
+  dispatch_all_types(in.dtype(), [&](auto in_type_tag) {
+    dispatch_all_types(out.dtype(), [&](auto out_type_tag) {
+      dispatch_bool(
+          in.data_size() > INT32_MAX || out.data_size() > INT32_MAX,
+          [&](auto large) {
+            using InType = cuda_type_t<MLX_GET_TYPE(in_type_tag)>;
+            using OutType = cuda_type_t<MLX_GET_TYPE(out_type_tag)>;
+            using IdxT = std::conditional_t<large(), int64_t, int32_t>;
+            // Static offsets are folded into the base pointers.
+            const InType* in_ptr = in.data<InType>() + offset_in;
+            OutType* out_ptr = out.data<OutType>() + offset_out;
+            int ndim = shape.size();
+            if (ndim <= 3) {
+              // Rank is a template argument of the kernel.
+              dispatch_1_2_3(ndim, [&](auto dims_constant) {
+                auto kernel =
+                    cu::copy_g_nd<InType, OutType, IdxT, dims_constant()>;
+                auto [num_blocks, block_dims] =
+                    get_launch_args(kernel, out, large());
+                encoder.add_kernel_node(
+                    kernel,
+                    num_blocks,
+                    block_dims,
+                    in_ptr,
+                    out_ptr,
+                    out.size(),
+                    const_param<dims_constant()>(shape),
+                    const_param<dims_constant()>(strides_in));
+              });
+            } else { // ndim >= 4
+              // Rank is passed to the kernel as a regular argument.
+              auto kernel = cu::copy_g<InType, OutType, IdxT>;
+              auto [num_blocks, block_dims] =
+                  get_launch_args(kernel, out, large());
+              encoder.add_kernel_node(
+                  kernel,
+                  num_blocks,
+                  block_dims,
+                  in_ptr,
+                  out_ptr,
+                  out.size(),
+                  const_param(shape),
+                  const_param(strides_in),
+                  ndim);
+            }
+          });
+    });
+  });
+}
+
+} // namespace mlx::core
--- a/mlx/backend/cuda/cuda.cpp
+++ b/mlx/backend/cuda/cuda.cpp
@@ -0,0 +1,11 @@
+// Copyright © 2025 Apple Inc.
+
+#include "mlx/backend/cuda/cuda.h"
+
+namespace mlx::core::cu {
+
+// Unconditionally reports the CUDA backend as available; no runtime probing
+// is done in this implementation.
+bool is_available() {
+  return true;
+}
+
+} // namespace mlx::core::cu
--- a/mlx/backend/cuda/cuda.h
+++ b/mlx/backend/cuda/cuda.h
@@ -0,0 +1,10 @@
+// Copyright © 2025 Apple Inc.
+
+#pragma once
+
+namespace mlx::core::cu {
+
+/* Check if the CUDA backend is available. */
+bool is_available();
+
+} // namespace mlx::core::cu
--- a/mlx/backend/cuda/device.cpp
+++ b/mlx/backend/cuda/device.cpp
@@ -0,0 +1,339 @@
+// Copyright © 2025 Apple Inc.
+
+#include "mlx/backend/cuda/device.h"
+#include "mlx/backend/cuda/worker.h"
+#include "mlx/utils.h"
+
+#include <fmt/format.h>
+#include <nvtx3/nvtx3.hpp>
+#include <future>
+#include <unordered_set>
+
+namespace mlx::core {
+
+// Can be tuned with MLX_MAX_OPS_PER_BUFFER
+// This should be less than 255
+constexpr int default_max_nodes_per_graph = 20;
+
+// Size limit for the executable graph cache. The MLX_CUDA_GRAPH_CACHE_SIZE
+// variable is looked up once, on first call (default 100), and the value is
+// reused afterwards.
+int cuda_graph_cache_size() {
+  static const int cache_size =
+      env::get_var("MLX_CUDA_GRAPH_CACHE_SIZE", 100);
+  return cache_size;
+}
+
+namespace cu {
+
+// Query the compute capability of CUDA device `device`, verify that it
+// supports concurrent managed access, and create the cublasLt handle.
+//
+// Throws std::runtime_error when the device lacks concurrent managed access
+// or when the cublasLt handle cannot be created.
+Device::Device(int device) : device_(device) {
+  CHECK_CUDA_ERROR(cudaDeviceGetAttribute(
+      &compute_capability_major_, cudaDevAttrComputeCapabilityMajor, device_));
+  CHECK_CUDA_ERROR(cudaDeviceGetAttribute(
+      &compute_capability_minor_, cudaDevAttrComputeCapabilityMinor, device_));
+  // Validate the requirements of device.
+  int attr = 0;
+  CHECK_CUDA_ERROR(cudaDeviceGetAttribute(
+      &attr, cudaDevAttrConcurrentManagedAccess, device_));
+  if (attr != 1) {
+    throw std::runtime_error(fmt::format(
+        "Device {} does not support synchronization in managed memory.",
+        device_));
+  }
+  // The cublasLt handle is used by matmul.
+  make_current();
+  // Do not ignore the status: a failed creation would leave lt_ unusable and
+  // only surface later, far from the cause.
+  if (cublasLtCreate(&lt_) != CUBLAS_STATUS_SUCCESS) {
+    throw std::runtime_error(fmt::format(
+        "Failed to create the cublasLt handle for device {}.", device_));
+  }
+}
+
+// Release the cublasLt handle created in the constructor.
+Device::~Device() {
+  cublasLtDestroy(lt_);
+}
+
+// Make this device the current CUDA device, skipping the CUDA call when the
+// cached value says it is already current.
+void Device::make_current() {
+  // The current device is queried/set very often, so the last value set here
+  // is remembered to avoid redundant CUDA API calls. The cache is a plain
+  // function-local static, so this assumes a single host thread.
+  static int current = 0;
+  if (current == device_) {
+    return;
+  }
+  CHECK_CUDA_ERROR(cudaSetDevice(device_));
+  current = device_;
+}
+
+// Return the encoder associated with stream `s`, constructing it on first
+// use. Encoders are keyed by the stream index.
+CommandEncoder& Device::get_command_encoder(Stream s) {
+  // try_emplace only constructs a new encoder when the key is absent.
+  auto [entry, inserted] = encoders_.try_emplace(s.index, *this);
+  (void)inserted;
+  return entry->second;
+}
+
+// Create an empty graph and start capturing on the encoder's stream in global
+// capture mode. The capture is ended in the destructor.
+CommandEncoder::CaptureContext::CaptureContext(CommandEncoder& enc) : enc(enc) {
+  CHECK_CUDA_ERROR(cudaGraphCreate(&graph, 0));
+  CHECK_CUDA_ERROR(
+      cudaStreamBeginCapture(enc.stream(), cudaStreamCaptureModeGlobal));
+}
+
+// End the stream capture and add the captured work to the encoder's graph.
+//
+// A capture that produced exactly one node is re-added as a kernel node
+// (type 'K') using that node's kernel parameters; any other capture is added
+// as a child graph node (type 'G'). The captured graph is destroyed afterwards.
+CommandEncoder::CaptureContext::~CaptureContext() {
+  CHECK_CUDA_ERROR(cudaStreamEndCapture(enc.stream(), &graph));
+  // First call with a NULL array only retrieves the node count.
+  size_t num_nodes;
+  CHECK_CUDA_ERROR(cudaGraphGetNodes(graph, NULL, &num_nodes));
+  if (num_nodes == 1) {
+    // NOTE(review): the single captured node is treated as a kernel node
+    // without checking its type — confirm callers only capture kernels here.
+    cudaGraphNode_t captured_node;
+    CHECK_CUDA_ERROR(cudaGraphGetNodes(graph, &captured_node, &num_nodes));
+    CUDA_KERNEL_NODE_PARAMS params;
+    CHECK_CUDA_ERROR(cuGraphKernelNodeGetParams(captured_node, &params));
+    cudaGraphNode_t node;
+    CHECK_CUDA_ERROR(cuGraphAddKernelNode(&node, enc.graph_, NULL, 0, &params));
+    enc.insert_graph_dependencies(GraphNode{node, 'K'});
+  } else {
+    cudaGraphNode_t node;
+    CHECK_CUDA_ERROR(
+        cudaGraphAddChildGraphNode(&node, enc.graph_, NULL, 0, graph));
+    enc.insert_graph_dependencies(GraphNode{node, 'G'});
+  }
+  CHECK_CUDA_ERROR(cudaGraphDestroy(graph));
+}
+
+// Put the encoder in concurrent mode: nodes inserted while the flag is set
+// are collected in concurrent_nodes_ instead of being wired immediately.
+CommandEncoder::ConcurrentContext::ConcurrentContext(CommandEncoder& enc)
+    : enc(enc) {
+  enc.in_concurrent_ = true;
+}
+
+// Leave concurrent mode and wire the nodes collected during it.
+//
+// An empty node is added after all collected nodes, the collected nodes get
+// their input dependencies, and every output registered during the concurrent
+// section is mapped to the empty node rather than to an individual node.
+CommandEncoder::ConcurrentContext::~ConcurrentContext() {
+  enc.in_concurrent_ = false;
+
+  // Use an empty graph node for synchronization
+  CommandEncoder::GraphNode empty{NULL, 'E', std::to_string(enc.node_count_++)};
+  enc.empty_node_count_++;
+  CHECK_CUDA_ERROR(cudaGraphAddEmptyNode(&empty.node, enc.graph_, NULL, 0));
+
+  // Insert the concurrent -> empty node dependencies
+  for (auto& from : enc.concurrent_nodes_) {
+    enc.from_nodes_.push_back(from.node);
+    enc.to_nodes_.push_back(empty.node);
+    // Each recorded edge also contributes to the graph cache key.
+    enc.graph_key_ += from.id;
+    enc.graph_key_ += from.node_type;
+    enc.graph_key_ += empty.id;
+    enc.graph_key_ += empty.node_type;
+  }
+
+  // Insert the input -> concurrent node dependencies without updating output
+  // nodes
+  // (active_outputs_ is moved out first so the call below sees none.)
+  auto outputs = std::move(enc.active_outputs_);
+  enc.insert_graph_dependencies(std::move(enc.concurrent_nodes_));
+
+  // Update output node to be the empty node
+  for (auto o : outputs) {
+    enc.node_map_.emplace(o, empty).first->second = empty;
+  }
+}
+
+// Register a newly added graph node. The node is assigned the next id from
+// node_count_ (child graph nodes are also counted in graph_node_count_). In
+// concurrent mode the node is only queued; otherwise its dependencies are
+// wired right away through the vector overload.
+void CommandEncoder::insert_graph_dependencies(GraphNode node) {
+  if (node.node_type == 'G') {
+    graph_node_count_++;
+  }
+  node.id = std::to_string(node_count_++);
+  if (in_concurrent_) {
+    concurrent_nodes_.push_back(std::move(node));
+  } else {
+    std::vector<GraphNode> nodes;
+    nodes.push_back(std::move(node));
+    insert_graph_dependencies(std::move(nodes));
+  }
+}
+
+// Wire `nodes` into the graph using the arrays registered since the previous
+// call: every node previously recorded for a registered dependency becomes a
+// predecessor of every node in `nodes`, and the registered outputs are mapped
+// to `nodes` for later lookups. Both registration lists are cleared.
+void CommandEncoder::insert_graph_dependencies(std::vector<GraphNode> nodes) {
+  std::vector<GraphNode> deps;
+  {
+    // Dependencies must be added in the same order to produce a consistent
+    // topology
+    // (the set only deduplicates; `deps` keeps first-seen order).
+    std::unordered_set<cudaGraphNode_t> set_deps;
+    for (auto d : active_deps_) {
+      if (auto it = node_map_.find(d); it != node_map_.end()) {
+        auto [_, inserted] = set_deps.insert(it->second.node);
+        if (inserted) {
+          deps.push_back(it->second);
+        }
+      }
+    }
+  }
+  active_deps_.clear();
+
+  // Insert-or-overwrite: with several nodes, the last one ends up stored for
+  // each output.
+  for (auto o : active_outputs_) {
+    for (auto& node : nodes) {
+      node_map_.emplace(o, node).first->second = node;
+    }
+  }
+  active_outputs_.clear();
+
+  // Record the edges (added to the graph at commit time) and extend the graph
+  // cache key with each edge.
+  for (auto& from : deps) {
+    for (auto& to : nodes) {
+      from_nodes_.push_back(from.node);
+      to_nodes_.push_back(to.node);
+      graph_key_ += from.id;
+      graph_key_ += from.node_type;
+      graph_key_ += to.id;
+      graph_key_ += to.node_type;
+    }
+  }
+}
+
+// Bind the encoder to device `d`, construct its stream from `d`, and create
+// the initial empty graph that nodes are recorded into.
+CommandEncoder::CommandEncoder(Device& d) : device_(d), stream_(d) {
+  CHECK_CUDA_ERROR(cudaGraphCreate(&graph_, 0));
+}
+
+// Destroy every executable graph stored in `graphs` and empty the map.
+void clear_graphs(std::unordered_map<std::string, cudaGraphExec_t>& graphs) {
+  for (auto& [_, graph_exec] : graphs) {
+    CHECK_CUDA_ERROR(cudaGraphExecDestroy(graph_exec));
+  }
+  graphs.clear();
+}
+
+// Destroy all cached executable graphs.
+CommandEncoder::~CommandEncoder() {
+  clear_graphs(graph_cache_);
+}
+
+// Queue `task` on the encoder's worker.
+void CommandEncoder::add_completed_handler(std::function<void()> task) {
+  worker_.add_task(std::move(task));
+}
+
+// Register `arr` as an input of the next inserted node. Arrays are identified
+// by the address of their underlying buffer.
+void CommandEncoder::set_input_array(const array& arr) {
+  auto id = reinterpret_cast<std::uintptr_t>(arr.buffer().ptr());
+  active_deps_.push_back(id);
+}
+
+// Register `arr` as an output of the next inserted node. The buffer is also
+// added to the dependency list, so the next node is ordered after whichever
+// node was last recorded for this buffer.
+void CommandEncoder::set_output_array(const array& arr) {
+  auto id = reinterpret_cast<std::uintptr_t>(arr.buffer().ptr());
+  active_deps_.push_back(id);
+  active_outputs_.push_back(id);
+}
+
+// Commit once the number of recorded nodes reaches the limit returned by
+// env::max_ops_per_buffer (called with default_max_nodes_per_graph).
+void CommandEncoder::maybe_commit() {
+  if (node_count_ >= env::max_ops_per_buffer(default_max_nodes_per_graph)) {
+    commit();
+  }
+}
+
+// Add a kernel node for a runtime-API kernel `func` with the given launch
+// dimensions and argument pointer array, then register the node so that its
+// dependencies get wired.
+void CommandEncoder::add_kernel_node(
+    void* func,
+    dim3 grid_dim,
+    dim3 block_dim,
+    void** params) {
+  // Zero-initialize so fields not set below (e.g. shared memory size) are 0.
+  cudaKernelNodeParams kernel_params = {0};
+  kernel_params.func = func;
+  kernel_params.gridDim = grid_dim;
+  kernel_params.blockDim = block_dim;
+  kernel_params.kernelParams = params;
+  cudaGraphNode_t node;
+  CHECK_CUDA_ERROR(
+      cudaGraphAddKernelNode(&node, graph_, NULL, 0, &kernel_params));
+  insert_graph_dependencies(GraphNode{node, 'K'});
+}
+
+// Add a kernel node for a driver-API function handle `func` with the given
+// launch dimensions and argument pointer array, then register the node so
+// that its dependencies get wired.
+void CommandEncoder::add_kernel_node(
+    CUfunction func,
+    dim3 grid_dim,
+    dim3 block_dim,
+    void** params) {
+  // Zero-initialize so fields not set below are 0.
+  CUDA_KERNEL_NODE_PARAMS kernel_params = {0};
+  kernel_params.func = func;
+  kernel_params.gridDimX = grid_dim.x;
+  kernel_params.gridDimY = grid_dim.y;
+  kernel_params.gridDimZ = grid_dim.z;
+  kernel_params.blockDimX = block_dim.x;
+  kernel_params.blockDimY = block_dim.y;
+  kernel_params.blockDimZ = block_dim.z;
+  kernel_params.kernelParams = params;
+  CUgraphNode node;
+  CHECK_CUDA_ERROR(
+      cuGraphAddKernelNode(&node, graph_, NULL, 0, &kernel_params));
+  insert_graph_dependencies(GraphNode{node, 'K'});
+}
+
+// Submit everything recorded since the previous commit.
+//
+// Temporaries are handed to a completion handler, the recorded graph (if it
+// has any node) is finalized, instantiated or updated from the cache and
+// launched on the encoder's stream, the per-graph bookkeeping is reset, and
+// the queued completion handlers are committed to the worker.
+void CommandEncoder::commit() {
+  if (!temporaries_.empty()) {
+    // The handler body is intentionally empty: the buffers are moved into the
+    // capture and live exactly as long as the handler does.
+    add_completed_handler([temporaries = std::move(temporaries_)]() {});
+  }
+  if (node_count_ > 0) {
+    if (!from_nodes_.empty()) {
+      CHECK_CUDA_ERROR(cudaGraphAddDependencies(
+          graph_, from_nodes_.data(), to_nodes_.data(), from_nodes_.size()));
+    }
+
+    // Complete the cache key: the edge list recorded so far plus the node
+    // counts per category.
+    graph_key_ += ".";
+    graph_key_ += std::to_string(node_count_);
+    graph_key_ += ".";
+    graph_key_ += std::to_string(graph_node_count_);
+    graph_key_ += ".";
+    graph_key_ += std::to_string(empty_node_count_);
+
+    // operator[] inserts a null entry when the key is new.
+    cudaGraphExec_t& graph_exec = graph_cache_[graph_key_];
+
+    if (graph_exec != nullptr) {
+      // Try to reuse the cached executable graph by updating it in place.
+      cudaGraphExecUpdateResult update_result;
+#if CUDART_VERSION >= 12000
+      cudaGraphExecUpdateResultInfo info;
+      cudaGraphExecUpdate(graph_exec, graph_, &info);
+      update_result = info.result;
+#else
+      cudaGraphNode_t error_node;
+      cudaGraphExecUpdate(graph_exec, graph_, &error_node, &update_result);
+#endif // CUDART_VERSION >= 12000
+      if (update_result != cudaGraphExecUpdateSuccess) {
+        // The update failed: drop the stale executable and instantiate anew.
+        cudaGetLastError(); // reset error
+        CHECK_CUDA_ERROR(cudaGraphExecDestroy(graph_exec));
+        graph_exec = nullptr;
+      }
+    }
+    if (graph_exec == nullptr) {
+      CHECK_CUDA_ERROR(
+          cudaGraphInstantiate(&graph_exec, graph_, NULL, NULL, 0));
+    }
+    device_.make_current();
+    CHECK_CUDA_ERROR(cudaGraphLaunch(graph_exec, stream_));
+
+    // TODO smarter cache policy
+    // `graph_exec` refers into the cache and must not be used after this.
+    if (graph_cache_.size() > cuda_graph_cache_size()) {
+      clear_graphs(graph_cache_);
+    }
+
+    // Reset state
+    node_count_ = 0;
+    graph_node_count_ = 0;
+    // The empty-node count is part of the cache key, so it has to restart for
+    // each graph; otherwise structurally identical graphs would get different
+    // keys after any concurrent section and never hit the cache.
+    empty_node_count_ = 0;
+    from_nodes_.clear();
+    to_nodes_.clear();
+    graph_key_.clear();
+    node_map_.clear();
+    CHECK_CUDA_ERROR(cudaGraphDestroy(graph_));
+    CHECK_CUDA_ERROR(cudaGraphCreate(&graph_, 0));
+  }
+
+  // Put completion handlers in a batch.
+  worker_.end_batch();
+  worker_.commit(stream_);
+}
+
+// Block the calling thread until the stream has drained and a completion
+// handler queued here has run.
+void CommandEncoder::synchronize() {
+  // NOTE(review): the return status of cudaStreamSynchronize is not checked.
+  cudaStreamSynchronize(stream_);
+  // The promise is shared because the handler is stored in a std::function.
+  auto p = std::make_shared<std::promise<void>>();
+  std::future<void> f = p->get_future();
+  add_completed_handler([p = std::move(p)]() { p->set_value(); });
+  // NOTE(review): commit() also calls worker_.end_batch(); confirm the extra
+  // call here is intended.
+  worker_.end_batch();
+  commit();
+  f.wait();
+}
+
+// Return the cu::Device for the given MLX device, constructing it on first
+// use. Devices are kept in a function-local static map keyed by device index.
+Device& device(mlx::core::Device device) {
+  static std::unordered_map<int, Device> devices;
+  // try_emplace only constructs a new Device when the index is absent.
+  auto [entry, inserted] = devices.try_emplace(device.index, device.index);
+  (void)inserted;
+  return entry->second;
+}
+
+// Convenience lookup: the encoder of stream `s` on the stream's own device.
+CommandEncoder& get_command_encoder(Stream s) {
+  return device(s.device).get_command_encoder(s);
+}
+
+} // namespace cu
+
+} // namespace mlx::core
--- a/mlx/backend/cuda/device.h
+++ b/mlx/backend/cuda/device.h
@@ -0,0 +1,159 @@
+// Copyright © 2025 Apple Inc.
+
+#pragma once
+
+#include "mlx/array.h"
+#include "mlx/backend/cuda/worker.h"
+#include "mlx/stream.h"
+
+#include <cublasLt.h>
+#include <cuda.h>
+#include <thrust/execution_policy.h>
+
+#include <unordered_map>
+
+namespace mlx::core::cu {
+
+// Records kernel launches as nodes of a CUDA graph for one stream and
+// launches the graph on commit(). Dependencies between nodes are derived from
+// the input/output arrays registered before each node is added.
+class CommandEncoder {
+ public:
+  // Scope object: starts a stream capture on construction and adds the
+  // captured work to the encoder's graph on destruction.
+  struct CaptureContext {
+    CaptureContext(CommandEncoder& enc);
+    ~CaptureContext();
+    cudaGraph_t graph;
+    CommandEncoder& enc;
+  };
+  // Scope object: nodes added while it is alive are collected and wired
+  // together when it is destroyed.
+  struct ConcurrentContext {
+    ConcurrentContext(CommandEncoder& enc);
+    ~ConcurrentContext();
+    CommandEncoder& enc;
+  };
+
+  explicit CommandEncoder(Device& d);
+  ~CommandEncoder();
+
+  CommandEncoder(const CommandEncoder&) = delete;
+  CommandEncoder& operator=(const CommandEncoder&) = delete;
+
+  CaptureContext capture_context() {
+    return CaptureContext{*this};
+  }
+  ConcurrentContext concurrent_context() {
+    return ConcurrentContext{*this};
+  }
+
+  // Register the inputs/outputs of the next node to be added.
+  void set_input_array(const array& arr);
+  void set_output_array(const array& arr);
+
+  // Typed convenience overload: collects the address of every argument into
+  // an array of void* and forwards to the void* overload. The addresses refer
+  // to the caller's arguments and are only used during this call.
+  template <typename F, typename... Params>
+  void
+  add_kernel_node(F* func, dim3 grid_dim, dim3 block_dim, Params&&... params) {
+    constexpr size_t num = sizeof...(Params);
+    void* ptrs[num];
+    size_t i = 0;
+    ([&](auto&& p) { ptrs[i++] = static_cast<void*>(&p); }(
+         std::forward<Params>(params)),
+     ...);
+    add_kernel_node((void*)func, grid_dim, block_dim, ptrs);
+  }
+
+  void add_kernel_node(
+      CUfunction func,
+      dim3 grid_dim,
+      dim3 block_dim,
+      void** params);
+
+  void
+  add_kernel_node(void* func, dim3 grid_dim, dim3 block_dim, void** params);
+
+  // Keep the data of `arr` referenced until the next commit hands it over to
+  // a completion handler.
+  void add_temporary(const array& arr) {
+    temporaries_.push_back(arr.data_shared_ptr());
+  }
+
+  void add_completed_handler(std::function<void()> task);
+  void maybe_commit();
+  void commit();
+
+  CudaStream& stream() {
+    return stream_;
+  }
+
+  // Wait until kernels and completion handlers are finished
+  void synchronize();
+
+ private:
+  struct GraphNode {
+    cudaGraphNode_t node;
+    // K = kernel
+    // E = empty
+    // G = subgraph
+    char node_type;
+    std::string id;
+  };
+
+  void insert_graph_dependencies(GraphNode node);
+  void insert_graph_dependencies(std::vector<GraphNode> nodes);
+
+  Device& device_;
+  CudaStream stream_;
+  cudaGraph_t graph_;
+  Worker worker_;
+  // Node counters for the graph being recorded. These are `int` rather than
+  // plain `char`: the signedness of `char` is implementation-defined, and
+  // where it is signed the counters would overflow past 127 when a larger
+  // per-graph node limit is configured.
+  int node_count_{0};
+  int graph_node_count_{0};
+  int empty_node_count_{0};
+  bool in_concurrent_{false};
+  // Parallel arrays describing the edges to add at commit time.
+  std::vector<cudaGraphNode_t> from_nodes_;
+  std::vector<cudaGraphNode_t> to_nodes_;
+  // Textual description of the recorded graph, used as the cache key.
+  std::string graph_key_;
+  std::vector<GraphNode> concurrent_nodes_;
+  std::vector<std::shared_ptr<array::Data>> temporaries_;
+  std::unordered_map<std::string, cudaGraphExec_t> graph_cache_;
+  // Buffer addresses registered for the next node.
+  std::vector<std::uintptr_t> active_deps_;
+  std::vector<std::uintptr_t> active_outputs_;
+  // Last node recorded for each buffer address.
+  std::unordered_map<std::uintptr_t, GraphNode> node_map_;
+};
+
+// Per-GPU state: the CUDA device ordinal, its compute capability, a cublasLt
+// handle, and the command encoders created for streams on this device.
+// Non-copyable.
+class Device {
+ public:
+  explicit Device(int device);
+  ~Device();
+
+  Device(const Device&) = delete;
+  Device& operator=(const Device&) = delete;
+
+  // Make this device the current cuda device, required by some cuda calls.
+  void make_current();
+
+  // Return the encoder for stream `s`, creating it on first use.
+  CommandEncoder& get_command_encoder(Stream s);
+
+  int cuda_device() const {
+    return device_;
+  }
+  int compute_capability_major() const {
+    return compute_capability_major_;
+  }
+  int compute_capability_minor() const {
+    return compute_capability_minor_;
+  }
+  cublasLtHandle_t lt_handle() const {
+    return lt_;
+  }
+
+ private:
+  int device_;
+  int compute_capability_major_;
+  int compute_capability_minor_;
+  cublasLtHandle_t lt_;
+  // Keyed by stream index.
+  std::unordered_map<int, CommandEncoder> encoders_;
+};
+
+Device& device(mlx::core::Device device);
+CommandEncoder& get_command_encoder(Stream s);
+
+// Return an execution policy that does not sync for result.
+// Note that not all thrust APIs support async policy, confirm before using.
+// The returned policy is bound to `stream`.
+inline auto thrust_policy(cudaStream_t stream) {
+  // TODO: Connect thrust's custom allocator with mlx's allocator.
+  return thrust::cuda::par_nosync.on(stream);
+}
+
+} // namespace mlx::core::cu
--- a/mlx/backend/cuda/device/arange.cuh
+++ b/mlx/backend/cuda/device/arange.cuh
@@ -0,0 +1,15 @@
+// Copyright © 2025 Apple Inc.
+
+namespace mlx::core::cu {
+
+// Functor producing the i-th value of an arithmetic sequence:
+// start + i * step.
+template <typename T>
+struct Arange {
+  const T start;
+  const T step;
+
+  __device__ T operator()(uint32_t i) const {
+    return start + i * step;
+  }
+};
+
+} // namespace mlx::core::cu
--- a/mlx/backend/cuda/device/atomic_ops.cuh
+++ b/mlx/backend/cuda/device/atomic_ops.cuh
@@ -0,0 +1,72 @@
+// Copyright © 2025 Apple Inc.
+
+#pragma once
+
+#include "mlx/backend/cuda/device/cucomplex_math.cuh"
+#include "mlx/backend/cuda/device/fp16_math.cuh"
+
+#include <cuda/atomic>
+
+namespace mlx::core::cu {
+
+// Atomically add `val` to `*out` (device scope) through cuda::atomic_ref.
+template <typename T>
+inline __device__ void atomic_add(T* out, T val) {
+  cuda::atomic_ref<T, cuda::thread_scope_device> ref(*out);
+  ref += val;
+}
+
+// Atomically multiply `*out` by `val` (device scope) with a compare-exchange
+// retry loop.
+template <typename T>
+inline __device__ void atomic_prod(T* out, T val) {
+  cuda::atomic_ref<T, cuda::thread_scope_device> ref(*out);
+  T old = ref.load();
+  // On failure compare_exchange_strong refreshes `old` with the current
+  // value, so the product is recomputed on every retry.
+  while (!ref.compare_exchange_strong(old, old * val)) {
+  }
+}
+
+// Atomically store max(*out, val) into `*out` (device scope).
+template <typename T>
+inline __device__ void atomic_max(T* out, T val) {
+  cuda::atomic_ref<T, cuda::thread_scope_device> ref(*out);
+  ref.fetch_max(val);
+}
+
+// Atomically store min(*out, val) into `*out` (device scope).
+template <typename T>
+inline __device__ void atomic_min(T* out, T val) {
+  cuda::atomic_ref<T, cuda::thread_scope_device> ref(*out);
+  ref.fetch_min(val);
+}
+
+// Somehow cuda::atomic_ref does not provide atomic add for following types.
+// This fallback implements the addition with a compare-exchange retry loop.
+template <typename T>
+inline __device__ void atomic_add_general(T* out, T val) {
+  cuda::atomic_ref<T, cuda::thread_scope_device> ref(*out);
+  T old = ref.load();
+  // On failure compare_exchange_strong refreshes `old` with the current
+  // value, so the sum is recomputed on every retry.
+  while (!ref.compare_exchange_strong(old, old + val)) {
+  }
+}
+
+// Half-precision overload: uses the atomicAdd intrinsic directly.
+inline __device__ void atomic_add(__half* out, __half val) {
+  atomicAdd(out, val);
+}
+
+// Complex overload: uses the compare-exchange fallback when compiling for
+// architectures below 900 and atomicAdd otherwise.
+inline __device__ void atomic_add(cuComplex* out, cuComplex val) {
+#if __CUDA_ARCH__ < 900
+  atomic_add_general(out, val);
+#else
+  atomicAdd(out, val);
+#endif
+}
+
+// bfloat16 overload. For architectures 800 and above atomicAdd is used. Below
+// that, the compare-exchange fallback is used when CCCL_VERSION is at least
+// 2008000; with an older CCCL the function only contains a failing assert.
+inline __device__ void atomic_add(__nv_bfloat16* out, __nv_bfloat16 val) {
+#if __CUDA_ARCH__ < 800
+#if CCCL_VERSION >= 2008000
+  atomic_add_general(out, val);
+#else
+  // The variable name doubles as the assertion message.
+  bool cccl_version_too_old_for_bfloat16_atomic_add = false;
+  assert(cccl_version_too_old_for_bfloat16_atomic_add);
+#endif
+#else
+  atomicAdd(out, val);
+#endif
+}
+
+} // namespace mlx::core::cu
--- a/mlx/backend/cuda/device/binary_ops.cuh
+++ b/mlx/backend/cuda/device/binary_ops.cuh
@@ -0,0 +1,306 @@
+// Copyright © 2025 Apple Inc.
+
+#include "mlx/backend/cuda/device/unary_ops.cuh"
+
+#include <cuda/std/array>
+
+namespace mlx::core::cu {
+
+// Element-wise addition: x + y.
+struct Add {
+  template <typename T>
+  __device__ T operator()(T x, T y) {
+    return x + y;
+  }
+};
+
+// Integer-valued division. Integral types use the built-in `/`; all other
+// types divide and then apply truncf to the quotient.
+// NOTE(review): both paths round toward zero rather than toward negative
+// infinity — confirm this matches the intended "floor" semantics.
+struct FloorDivide {
+  template <typename T>
+  __device__ T operator()(T x, T y) {
+    if constexpr (cuda::std::is_integral_v<T>) {
+      return x / y;
+    } else {
+      return truncf(x / y);
+    }
+  }
+};
+
+// Element-wise division: x / y.
+struct Divide {
+  template <typename T>
+  __device__ T operator()(T x, T y) {
+    return x / y;
+  }
+};
+
+// Remainder whose sign follows the divisor for signed integers and
+// floating-point types; unsigned integers and cuComplex use `%` as is.
+struct Remainder {
+  template <typename T>
+  __device__ T operator()(T x, T y) {
+    if constexpr (cuda::std::is_integral_v<T>) {
+      if constexpr (cuda::std::is_signed_v<T>) {
+        auto r = x % y;
+        // `%` takes the sign of the dividend; shift a non-zero result by y
+        // when its sign differs from the divisor's.
+        if (r != 0 && (r < 0 != y < 0)) {
+          r += y;
+        }
+        return r;
+      } else {
+        return x % y;
+      }
+    } else if constexpr (cuda::std::is_same_v<T, cuComplex>) {
+      return x % y;
+    } else {
+      T r = fmod(x, y);
+      // Same sign adjustment as in the signed integer case.
+      if (r != 0 && (r < 0 != y < 0)) {
+        r = r + y;
+      }
+      return r;
+    }
+  }
+};
+
+// Element-wise equality: x == y.
+struct Equal {
+  template <typename T>
+  __device__ bool operator()(T x, T y) {
+    return x == y;
+  }
+};
+
+// Equality that treats NaN as equal to NaN. For cuComplex the comparison is
+// made per component: each component must either compare equal or be NaN in
+// both operands.
+struct NaNEqual {
+  template <typename T>
+  __device__ bool operator()(T x, T y) {
+    if constexpr (std::is_same_v<T, cuComplex>) {
+      // Cases: plain equality, both components NaN, equal real parts with
+      // NaN imaginary parts, NaN real parts with equal imaginary parts.
+      return x == y ||
+          (isnan(cuCrealf(x)) && isnan(cuCrealf(y)) && isnan(cuCimagf(x)) &&
+           isnan(cuCimagf(y))) ||
+          (cuCrealf(x) == cuCrealf(y) && isnan(cuCimagf(x)) &&
+           isnan(cuCimagf(y))) ||
+          (isnan(cuCrealf(x)) && isnan(cuCrealf(y)) &&
+           cuCimagf(x) == cuCimagf(y));
+    } else {
+      return x == y || (isnan(x) && isnan(y));
+    }
+  }
+};
+
+// Element-wise comparison: x > y.
+struct Greater {
+  template <typename T>
+  __device__ bool operator()(T x, T y) {
+    return x > y;
+  }
+};
+
+// Element-wise comparison: x >= y.
+struct GreaterEqual {
+  template <typename T>
+  __device__ bool operator()(T x, T y) {
+    return x >= y;
+  }
+};
+
+// Element-wise comparison: x < y.
+struct Less {
+  template <typename T>
+  __device__ bool operator()(T x, T y) {
+    return x < y;
+  }
+};
+
+// Element-wise comparison: x <= y.
+struct LessEqual {
+  template <typename T>
+  __device__ bool operator()(T x, T y) {
+    return x <= y;
+  }
+};
+
+// Computes log(exp(x) + exp(y)), evaluated as max + log1p(exp(min - max)).
+// Any NaN input yields NaN (in both components for cuComplex).
+struct LogAddExp {
+  template <typename T>
+  __device__ T operator()(T x, T y) {
+    if constexpr (cuda::std::is_same_v<T, cuComplex>) {
+      if (isnan(cuCrealf(x)) || isnan(cuCimagf(x)) || isnan(cuCrealf(y)) ||
+          isnan(cuCimagf(y))) {
+        return {
+            cuda::std::numeric_limits<float>::quiet_NaN(),
+            cuda::std::numeric_limits<float>::quiet_NaN()};
+      }
+      // Order the operands by real part. Both selections use the same
+      // predicate so that `max` and `min` are always the two distinct
+      // operands; with independent `>` and `<` tests, equal real parts would
+      // pick `y` twice and silently drop `x`.
+      bool x_is_max = cuCrealf(x) > cuCrealf(y);
+      auto max = x_is_max ? x : y;
+      auto min = x_is_max ? y : x;
+      auto min_real = cuCrealf(min);
+      auto max_real = cuCrealf(max);
+      if (!isfinite(min_real) && (min_real == max_real)) {
+        // Both real parts are the same infinity; min - max would be NaN.
+        if (min_real < 0) {
+          return min;
+        } else {
+          return Log{}(Exp{}(min) + Exp{}(max));
+        }
+      } else {
+        return Log1p{}(Exp{}(min - max)) + max;
+      }
+    } else {
+      if (isnan(x) || isnan(y)) {
+        return cuda::std::numeric_limits<T>::quiet_NaN();
+      }
+      T maxval = max(x, y);
+      T minval = min(x, y);
+      // With an infinite operand the result is simply the larger value; this
+      // also avoids evaluating inf - inf.
+      return (minval == -cuda::std::numeric_limits<T>::infinity() ||
+              maxval == cuda::std::numeric_limits<T>::infinity())
+          ? maxval
+          : T(float(maxval) + log1p(expf(minval - maxval)));
+    }
+  }
+};
+
+// Element-wise maximum. Integral types use max(). For other types, `x` is
+// returned when it is NaN (either component for cuComplex); otherwise the
+// result is `x > y ? x : y`.
+struct Maximum {
+  template <typename T>
+  __device__ T operator()(T x, T y) {
+    if constexpr (cuda::std::is_integral_v<T>) {
+      return max(x, y);
+    } else if constexpr (cuda::std::is_same_v<T, cuComplex>) {
+      if (isnan(cuCrealf(x)) || isnan(cuCimagf(x))) {
+        return x;
+      }
+      return x > y ? x : y;
+    } else {
+      if (isnan(x)) {
+        return x;
+      }
+      return x > y ? x : y;
+    }
+  }
+};
+
+// Element-wise minimum. Integral types use min(). For other types, `x` is
+// returned when it is NaN (either component for cuComplex); otherwise the
+// result is `x < y ? x : y`.
+struct Minimum {
+  template <typename T>
+  __device__ T operator()(T x, T y) {
+    if constexpr (cuda::std::is_integral_v<T>) {
+      return min(x, y);
+    } else if constexpr (cuda::std::is_same_v<T, cuComplex>) {
+      if (isnan(cuCrealf(x)) || isnan(cuCimagf(x))) {
+        return x;
+      }
+      return x < y ? x : y;
+    } else {
+      if (isnan(x)) {
+        return x;
+      }
+      return x < y ? x : y;
+    }
+  }
+};
+
+// Element-wise multiplication: x * y.
+struct Multiply {
+  template <typename T>
+  __device__ T operator()(T x, T y) {
+    return x * y;
+  }
+};
+
+// Element-wise inequality. cuComplex values differ when either their real or
+// their imaginary parts differ; other types use `!=`.
+struct NotEqual {
+  template <typename T>
+  __device__ bool operator()(T x, T y) {
+    if constexpr (std::is_same_v<T, cuComplex>) {
+      return cuCrealf(x) != cuCrealf(y) || cuCimagf(x) != cuCimagf(y);
+    } else {
+      return x != y;
+    }
+  }
+};
+
+// Element-wise power: base raised to exp.
+//  - integral types: exponentiation by squaring;
+//  - cuComplex: computed in polar form from the logarithm of the modulus and
+//    the argument of `base`, with 0 raised to anything giving 0 (NaN when the
+//    exponent has a NaN component);
+//  - other types: powf.
+struct Power {
+  template <typename T>
+  __device__ T operator()(T base, T exp) {
+    if constexpr (cuda::std::is_integral_v<T>) {
+      // NOTE(review): for a signed T a negative `exp` would presumably never
+      // reach zero under an arithmetic right shift, so this loop would not
+      // terminate — confirm callers rule out negative integer exponents.
+      T res = 1;
+      while (exp) {
+        if (exp & 1) {
+          res *= base;
+        }
+        exp >>= 1;
+        base *= base;
+      }
+      return res;
+    } else if constexpr (cuda::std::is_same_v<T, cuComplex>) {
+      if (base.y == 0 && base.x == 0) {
+        if (isnan(exp.x) || isnan(exp.y)) {
+          auto nan = cuda::std::numeric_limits<float>::quiet_NaN();
+          return make_cuFloatComplex(nan, nan);
+        }
+        return make_cuFloatComplex(0.0, 0.0);
+      }
+      // base = r * e^(i*theta); base^exp = e^(exp * (ln r + i*theta)).
+      auto x_theta = atan2f(base.y, base.x);
+      auto x_ln_r = 0.5 * logf(base.x * base.x + base.y * base.y);
+      auto mag = expf(exp.x * x_ln_r - exp.y * x_theta);
+      auto phase = exp.y * x_ln_r + exp.x * x_theta;
+      return make_cuFloatComplex(mag * cosf(phase), mag * sinf(phase));
+    } else {
+      return powf(base, exp);
+    }
+  }
+};
+
+// Element-wise subtraction: x - y.
+struct Subtract {
+  template <typename T>
+  __device__ T operator()(T x, T y) {
+    return x - y;
+  }
+};
+
+// Element-wise logical AND: x && y, converted to T.
+struct LogicalAnd {
+  template <typename T>
+  __device__ T operator()(T x, T y) {
+    return x && y;
+  };
+};
+
+// Element-wise logical OR: x || y, converted to T.
+struct LogicalOr {
+  template <typename T>
+  __device__ T operator()(T x, T y) {
+    return x || y;
+  };
+};
+
+// Element-wise bitwise AND: x & y.
+struct BitwiseAnd {
+  template <typename T>
+  __device__ T operator()(T x, T y) {
+    return x & y;
+  };
+};
+
+// Element-wise bitwise OR: x | y.
+struct BitwiseOr {
+  template <typename T>
+  __device__ T operator()(T x, T y) {
+    return x | y;
+  };
+};
+
+// Element-wise bitwise XOR: x ^ y.
+struct BitwiseXor {
+  template <typename T>
+  __device__ T operator()(T x, T y) {
+    return x ^ y;
+  };
+};
+
+// Element-wise left shift: x << y.
+struct LeftShift {
+  template <typename T>
+  __device__ T operator()(T x, T y) {
+    return x << y;
+  };
+};
+
+// Element-wise right shift: x >> y.
+struct RightShift {
+  template <typename T>
+  __device__ T operator()(T x, T y) {
+    return x >> y;
+  };
+};
+
+// Element-wise two-argument arctangent via atan2f(y, x). Note the argument
+// order: the first operand is `y`.
+struct ArcTan2 {
+  template <typename T>
+  __device__ T operator()(T y, T x) {
+    return atan2f(y, x);
+  }
+};
+
+// Returns {FloorDivide(x, y), Remainder(x, y)} as a two-element array.
+struct DivMod {
+  template <typename T>
+  __device__ cuda::std::array<T, 2> operator()(T x, T y) {
+    return {FloorDivide{}(x, y), Remainder{}(x, y)};
+  };
+};
+
+} // namespace mlx::core::cu
--- a/mlx/backend/cuda/device/cast_op.cuh
+++ b/mlx/backend/cuda/device/cast_op.cuh
@@ -0,0 +1,138 @@
+// Copyright © 2025 Apple Inc.
+
+#pragma once
+
+#include <cuComplex.h>
+#include <cuda_bf16.h>
+#include <cuda_fp16.h>
+#include <thrust/iterator/transform_iterator.h>
+
+namespace mlx::core::cu {
+
+// Generic cast functor: a plain static_cast from SrcT to DstT. The trailing
+// defaulted template parameter is the hook used by the specializations that
+// provide custom conversions for some types.
+template <typename SrcT, typename DstT, typename = void>
+struct CastOp {
+  // Whether SrcT is convertible to DstT.
+  static constexpr bool is_castable = cuda::std::is_convertible_v<SrcT, DstT>;
+
+  __device__ DstT operator()(SrcT value) {
+    return static_cast<DstT>(value);
+  }
+};
+
+// Castings between complex and boolean.
+// TODO: Should make a custom complex type.
+//
+// complex -> bool: a complex number is truthy when it is non-zero, i.e. when
+// EITHER component is non-zero (so 1 + 0i and 0 + 1i are both true).
+template <>
+struct CastOp<cuComplex, bool> {
+  static constexpr bool is_castable = true;
+
+  __device__ bool operator()(cuComplex x) {
+    return x.x != 0 || x.y != 0;
+  }
+};
+
+// bool -> complex: true maps to 1 + 0i and false to 0 + 0i, i.e. the
+// imaginary part is always zero, like the real-to-complex CastOp in this
+// file.
+template <>
+struct CastOp<bool, cuComplex> {
+  static constexpr bool is_castable = true;
+
+  __device__ cuComplex operator()(bool x) {
+    return x ? make_cuFloatComplex(1, 0) : make_cuFloatComplex(0, 0);
+  }
+};
+
+// complex -> non-complex: only the real component is kept; the imaginary
+// component is dropped.
+template <typename DstT>
+struct CastOp<
+    cuComplex,
+    DstT,
+    cuda::std::enable_if_t<!cuda::std::is_same_v<cuComplex, DstT>>> {
+  // Castable whenever a float can be converted to DstT.
+  static constexpr bool is_castable = cuda::std::is_convertible_v<float, DstT>;
+
+  __device__ DstT operator()(cuComplex value) {
+    static_assert(!cuda::std::is_same_v<cuComplex, DstT>);
+    const float real_part = cuCrealf(value);
+    return static_cast<DstT>(real_part);
+  }
+};
+
+// non-complex -> complex: the value becomes the real component and the
+// imaginary component is set to zero.
+template <typename SrcT>
+struct CastOp<
+    SrcT,
+    cuComplex,
+    cuda::std::enable_if_t<!cuda::std::is_same_v<SrcT, cuComplex>>> {
+  // Castable whenever SrcT can be converted to float.
+  static constexpr bool is_castable = cuda::std::is_convertible_v<SrcT, float>;
+
+  __device__ cuComplex operator()(SrcT value) {
+    static_assert(!cuda::std::is_same_v<SrcT, cuComplex>);
+    const float real_part = static_cast<float>(value);
+    return cuComplex{real_part, 0};
+  }
+};
+
+// Identity cast: source and destination types are the same, so the value is
+// returned unchanged.
+template <typename SrcT, typename DstT>
+struct CastOp<
+    SrcT,
+    DstT,
+    cuda::std::enable_if_t<cuda::std::is_same_v<SrcT, DstT>>> {
+  static constexpr bool is_castable = true;
+
+  __device__ SrcT operator()(SrcT value) {
+    return value;
+  }
+};
+
+// In CUDA 11 the half types do not define conversions between some types,
+// provide fallbacks here.
+#if CUDART_VERSION < 12000
+// Fallback for casting TO a half type (__half or __nv_bfloat16) from a
+// non-complex source that is not implicitly convertible: the value is first
+// converted to float and the half type is constructed from that float.
+template <typename SrcT, typename DstT>
+struct CastOp<
+    SrcT,
+    DstT,
+    cuda::std::enable_if_t<
+        !cuda::std::is_convertible_v<SrcT, DstT> &&
+        !cuda::std::is_same_v<SrcT, cuComplex> &&
+        (cuda::std::is_same_v<DstT, __half> ||
+         cuda::std::is_same_v<DstT, __nv_bfloat16>)>> {
+  static constexpr bool is_castable = true;
+
+  __device__ DstT operator()(SrcT x) {
+    return DstT(static_cast<float>(x));
+  }
+};
+
+// Fallback for casting FROM a half type to a destination that is neither
+// complex nor a half type and is not implicitly convertible: the half value
+// is first converted to float and the destination is constructed from it.
+// Half-to-half destinations are excluded here so that they match only the
+// specialization above.
+template <typename SrcT, typename DstT>
+struct CastOp<
+    SrcT,
+    DstT,
+    cuda::std::enable_if_t<
+        !cuda::std::is_convertible_v<SrcT, DstT> &&
+        !cuda::std::is_same_v<DstT, cuComplex> &&
+        !cuda::std::is_same_v<DstT, __half> &&
+        !cuda::std::is_same_v<DstT, __nv_bfloat16> &&
+        (cuda::std::is_same_v<SrcT, __half> ||
+         cuda::std::is_same_v<SrcT, __nv_bfloat16>)>> {
+  static constexpr bool is_castable = true;
+
+  __device__ DstT operator()(SrcT x) {
+    return DstT(static_cast<float>(x));
+  }
+};
+#endif // CUDART_VERSION < 12000
+
+// Convenience wrapper around CastOp: only DstT has to be spelled out, SrcT is
+// deduced from the argument.
+template <typename DstT, typename SrcT>
+inline __host__ __device__ auto cast_to(SrcT x) {
+  CastOp<SrcT, DstT> op{};
+  return op(x);
+}
+
+// Return an iterator that casts each value to DstT using CastOp.
+//
+// When the iterator's value type already is DstT the iterator is returned
+// unchanged; otherwise it is wrapped in a thrust transform iterator that
+// applies CastOp<SrcT, DstT> on dereference.
+template <typename DstT, typename Iterator>
+inline __host__ __device__ auto make_cast_iterator(Iterator it) {
+  using SrcT = typename cuda::std::iterator_traits<Iterator>::value_type;
+  // Use the cuda::std trait like the rest of this header, so the code does
+  // not depend on the host <type_traits> being included.
+  if constexpr (cuda::std::is_same_v<SrcT, DstT>) {
+    return it;
+  } else {
+    return thrust::make_transform_iterator(it, CastOp<SrcT, DstT>{});
+  }
+}
+
+} // namespace mlx::core::cu
--- a/mlx/backend/cuda/device/cexpf.cuh
+++ b/mlx/backend/cuda/device/cexpf.cuh
@@ -0,0 +1,138 @@
+// Copyright © 2025 Apple Inc.
+// Copyright © 2008-2013 NVIDIA Corporation
+// Copyright © 2013 Filipe RNC Maia
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Forked from
+// https://github.com/NVIDIA/cccl/blob/main/thrust/thrust/detail/complex/cexpf.h
+
+// TODO: We should use thrust::exp but the thrust header in old CUDA versions
+// can not be used in JIT.
+
+#pragma once
+
+#include <cuComplex.h>
+#include <cuda/std/cstdint>
+
+namespace mlx::core::cu::detail {
+
+// Overlays a float with a 32-bit word so the helpers below can read and
+// write the raw bit pattern of a float.
+using ieee_float_shape_type = union {
+  float value;
+  uint32_t word;
+};
+
+// Stores the raw 32-bit pattern of `d` into `i`.
+inline __device__ void get_float_word(uint32_t& i, float d) {
+  ieee_float_shape_type bits;
+  bits.value = d;
+  i = bits.word;
+}
+
+// Signed overload: stores the raw 32-bit pattern of `d` into `i`.
+inline __device__ void get_float_word(int32_t& i, float d) {
+  ieee_float_shape_type bits;
+  bits.value = d;
+  i = bits.word;
+}
+
+// Sets `d` to the float whose raw 32-bit pattern is `i`.
+inline __device__ void set_float_word(float& d, uint32_t i) {
+  ieee_float_shape_type bits;
+  bits.word = i;
+  d = bits.value;
+}
+
+// Computes exp(x) as a scaled value plus a separate binary exponent.
+//
+// expf is evaluated at x - kln2 (kln2 = k * ln(2) with k = 235). The
+// exponent field of that result, adjusted by k and by the bias 0x7f + 127,
+// is written to *expt; the returned float keeps the mantissa bits of the
+// result but has its exponent field forced to 0x7f + 127.
+inline __device__ float frexp_expf(float x, int* expt) {
+  const uint32_t k = 235;
+  const float kln2 = 162.88958740F;
+
+  float exp_x;
+  uint32_t hx;
+
+  exp_x = expf(x - kln2);
+  get_float_word(hx, exp_x);
+  // Bits 23 and up hold the sign and biased exponent of exp_x.
+  *expt = (hx >> 23) - (0x7f + 127) + k;
+  // Keep the 23 mantissa bits and overwrite the exponent field.
+  set_float_word(exp_x, (hx & 0x7fffff) | ((0x7f + 127) << 23));
+  return exp_x;
+}
+
+// Computes exp(z) scaled by 2^expt, using frexp_expf for the exponential of
+// the real part.
+//
+// The total binary exponent (expt plus the one reported by frexp_expf) is
+// split into two halves, each turned into a power-of-two float by writing
+// the exponent field directly, and both factors are multiplied into the
+// cos/sin terms.
+inline __device__ cuComplex ldexp_cexpf(cuComplex z, int expt) {
+  float x, y, exp_x, scale1, scale2;
+  int ex_expt, half_expt;
+
+  x = cuCrealf(z);
+  y = cuCimagf(z);
+  exp_x = frexp_expf(x, &ex_expt);
+  expt += ex_expt;
+
+  // scale1 = 2^(expt / 2), scale2 = 2^(expt - expt / 2).
+  half_expt = expt / 2;
+  set_float_word(scale1, (0x7f + half_expt) << 23);
+  half_expt = expt - half_expt;
+  set_float_word(scale2, (0x7f + half_expt) << 23);
+
+  return cuComplex{
+      cosf(y) * exp_x * scale1 * scale2, sinf(y) * exp_x * scale1 * scale2};
+}
+
+// Complex exponential of a single-precision complex number.
+//
+// Special cases (zero imaginary part, zero real part, non-finite imaginary
+// part) are handled first by inspecting the raw bit patterns; the remaining
+// inputs use exp(x) * (cos(y) + i sin(y)), going through ldexp_cexpf when x
+// lies in the range where expf(x) alone would overflow.
+inline __device__ cuComplex cexpf(const cuComplex& z) {
+  float x, y, exp_x;
+  uint32_t hx, hy;
+
+  // Bit patterns bounding the range of x that is routed to ldexp_cexpf.
+  const uint32_t exp_ovfl = 0x42b17218, cexp_ovfl = 0x43400074;
+
+  x = cuCrealf(z);
+  y = cuCimagf(z);
+
+  get_float_word(hy, y);
+  // Drop the sign bit of y.
+  hy &= 0x7fffffff;
+
+  /* cexp(x + I 0) = exp(x) + I 0 */
+  if (hy == 0) {
+    return cuComplex{expf(x), y};
+  }
+  get_float_word(hx, x);
+  /* cexp(0 + I y) = cos(y) + I sin(y) */
+  if ((hx & 0x7fffffff) == 0) {
+    return cuComplex{cosf(y), sinf(y)};
+  }
+  // y is Inf or NaN (exponent field all ones).
+  if (hy >= 0x7f800000) {
+    if ((hx & 0x7fffffff) != 0x7f800000) {
+      /* cexp(finite|NaN +- I Inf|NaN) = NaN + I NaN */
+      return cuComplex{y - y, y - y};
+    } else if (hx & 0x80000000) {
+      /* cexp(-Inf +- I Inf|NaN) = 0 + I 0 */
+      return cuComplex{0.0, 0.0};
+    } else {
+      /* cexp(+Inf +- I Inf|NaN) = Inf + I NaN */
+      return cuComplex{x, y - y};
+    }
+  }
+
+  if (hx >= exp_ovfl && hx <= cexp_ovfl) {
+    /*
+     * x is between 88.7 and 192, so we must scale to avoid
+     * overflow in expf(x).
+     */
+    return ldexp_cexpf(z, 0);
+  } else {
+    /*
+     * Cases covered here:
+     *  -  x < exp_ovfl and exp(x) won't overflow (common case)
+     *  -  x > cexp_ovfl, so exp(x) * s overflows for all s > 0
+     *  -  x = +-Inf (generated by exp())
+     *  -  x = NaN (spurious inexact exception from y)
+     */
+    exp_x = expf(x);
+    return cuComplex{exp_x * cosf(y), exp_x * sinf(y)};
+  }
+}
+
+} // namespace mlx::core::cu::detail
--- a/mlx/backend/cuda/device/config.h
+++ b/mlx/backend/cuda/device/config.h
@@ -0,0 +1,12 @@
+// Copyright © 2025 Apple Inc.
+
+// This file is used by both CUDA kernel code and host-only C++ code.
+
+#pragma once
+
+// The maximum dimensions of shape/strides passed as kernel parameters.
+#define MAX_NDIM 10
+
+// All existing NVIDIA hardware has a fixed warp size of 32. Though a built-in
+// warpSize variable exists, using it would prevent compile-time optimizations.
+#define WARP_SIZE 32
--- a/mlx/backend/cuda/device/cucomplex_math.cuh
+++ b/mlx/backend/cuda/device/cucomplex_math.cuh
@@ -0,0 +1,240 @@
+// Copyright © 2025 Apple Inc.
+// Copyright © 2017-2024 The Simons Foundation, Inc.
+//
+// FINUFFT is licensed under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance with the
+// License.  You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Forked from
+// https://github.com/flatironinstitute/finufft/blob/main/include/cufinufft/contrib/helper_math.h
+
+#pragma once
+
+#include <cuComplex.h>
+
+// This header provides some helper functions for cuComplex types.
+// It mainly wraps existing CUDA implementations to provide operator overloads
+// e.g. cuAdd, cuSub, cuMul, cuDiv, cuCreal, cuCimag, cuCabs, cuCarg, cuConj are
+// all provided by CUDA
+
+// Complex + complex (double precision), forwarded to cuCadd.
+__forceinline__ __host__ __device__ cuDoubleComplex
+operator+(const cuDoubleComplex& lhs, const cuDoubleComplex& rhs) {
+  return cuCadd(lhs, rhs);
+}
+
+// Complex - complex (double precision), forwarded to cuCsub.
+__forceinline__ __host__ __device__ cuDoubleComplex
+operator-(const cuDoubleComplex& lhs, const cuDoubleComplex& rhs) {
+  return cuCsub(lhs, rhs);
+}
+
+// Complex * complex (double precision), forwarded to cuCmul.
+__forceinline__ __host__ __device__ cuDoubleComplex
+operator*(const cuDoubleComplex& lhs, const cuDoubleComplex& rhs) {
+  return cuCmul(lhs, rhs);
+}
+
+// Complex / complex (double precision), forwarded to cuCdiv.
+__forceinline__ __host__ __device__ cuDoubleComplex
+operator/(const cuDoubleComplex& lhs, const cuDoubleComplex& rhs) {
+  return cuCdiv(lhs, rhs);
+}
+
+// Component-wise floored modulo of two double-precision complex numbers:
+// the real and imaginary parts are each reduced independently as
+// a - floor(a / b) * b.
+__forceinline__ __host__ __device__ cuDoubleComplex
+operator%(const cuDoubleComplex& a, const cuDoubleComplex& b) {
+  // Use the double-precision floor(): floorf() would convert the quotient
+  // to float first and lose precision for double operands.
+  double r = cuCreal(a) - (floor(cuCreal(a) / cuCreal(b)) * cuCreal(b));
+  double i = cuCimag(a) - (floor(cuCimag(a) / cuCimag(b)) * cuCimag(b));
+  return make_cuDoubleComplex(r, i);
+}
+
+// Equality: both the real and the imaginary components must compare equal.
+__forceinline__ __host__ __device__ bool operator==(
+    const cuDoubleComplex& lhs,
+    const cuDoubleComplex& rhs) {
+  return cuCreal(lhs) == cuCreal(rhs) && cuCimag(lhs) == cuCimag(rhs);
+}
+
+// Inequality: negation of operator==.
+__forceinline__ __host__ __device__ bool operator!=(
+    const cuDoubleComplex& lhs,
+    const cuDoubleComplex& rhs) {
+  return !(lhs == rhs);
+}
+
+// Ordering is defined by magnitude: lhs > rhs when |lhs| > |rhs|.
+__forceinline__ __host__ __device__ bool operator>(
+    const cuDoubleComplex& lhs,
+    const cuDoubleComplex& rhs) {
+  double norm_lhs =
+      sqrt(cuCreal(lhs) * cuCreal(lhs) + cuCimag(lhs) * cuCimag(lhs));
+  double norm_rhs =
+      sqrt(cuCreal(rhs) * cuCreal(rhs) + cuCimag(rhs) * cuCimag(rhs));
+  return norm_lhs > norm_rhs;
+}
+
+// Greater by magnitude, or component-wise equal.
+__forceinline__ __host__ __device__ bool operator>=(
+    const cuDoubleComplex& lhs,
+    const cuDoubleComplex& rhs) {
+  return lhs > rhs || lhs == rhs;
+}
+
+// Less by magnitude, expressed through operator>.
+__forceinline__ __host__ __device__ bool operator<(
+    const cuDoubleComplex& lhs,
+    const cuDoubleComplex& rhs) {
+  return rhs > lhs;
+}
+
+// Less by magnitude, or component-wise equal.
+__forceinline__ __host__ __device__ bool operator<=(
+    const cuDoubleComplex& lhs,
+    const cuDoubleComplex& rhs) {
+  return rhs > lhs || lhs == rhs;
+}
+
+// Mixed complex/real arithmetic (double precision). The real operand is
+// treated as a complex number with zero imaginary part.
+
+// complex + real: only the real component changes.
+__forceinline__ __host__ __device__ cuDoubleComplex
+operator+(const cuDoubleComplex& z, double s) {
+  return make_cuDoubleComplex(cuCreal(z) + s, cuCimag(z));
+}
+
+// real + complex.
+__forceinline__ __host__ __device__ cuDoubleComplex
+operator+(double s, const cuDoubleComplex& z) {
+  return make_cuDoubleComplex(s + cuCreal(z), cuCimag(z));
+}
+
+// complex - real: only the real component changes.
+__forceinline__ __host__ __device__ cuDoubleComplex
+operator-(const cuDoubleComplex& z, double s) {
+  return make_cuDoubleComplex(cuCreal(z) - s, cuCimag(z));
+}
+
+// real - complex: the imaginary component is negated.
+__forceinline__ __host__ __device__ cuDoubleComplex
+operator-(double s, const cuDoubleComplex& z) {
+  return make_cuDoubleComplex(s - cuCreal(z), -cuCimag(z));
+}
+
+// complex * real: both components are scaled.
+__forceinline__ __host__ __device__ cuDoubleComplex
+operator*(const cuDoubleComplex& z, double s) {
+  return make_cuDoubleComplex(cuCreal(z) * s, cuCimag(z) * s);
+}
+
+// real * complex.
+__forceinline__ __host__ __device__ cuDoubleComplex
+operator*(double s, const cuDoubleComplex& z) {
+  return make_cuDoubleComplex(s * cuCreal(z), s * cuCimag(z));
+}
+
+// complex / real: both components are divided.
+__forceinline__ __host__ __device__ cuDoubleComplex
+operator/(const cuDoubleComplex& z, double s) {
+  return make_cuDoubleComplex(cuCreal(z) / s, cuCimag(z) / s);
+}
+
+// real / complex: s * conj(z) / |z|^2.
+__forceinline__ __host__ __device__ cuDoubleComplex
+operator/(double s, const cuDoubleComplex& z) {
+  double norm_sq = cuCreal(z) * cuCreal(z) + cuCimag(z) * cuCimag(z);
+  return make_cuDoubleComplex(
+      (s * cuCreal(z)) / norm_sq, (-s * cuCimag(z)) / norm_sq);
+}
+
+// Complex + complex (single precision), forwarded to cuCaddf.
+__forceinline__ __host__ __device__ cuFloatComplex
+operator+(const cuFloatComplex& lhs, const cuFloatComplex& rhs) {
+  return cuCaddf(lhs, rhs);
+}
+
+// Complex - complex (single precision), forwarded to cuCsubf.
+__forceinline__ __host__ __device__ cuFloatComplex
+operator-(const cuFloatComplex& lhs, const cuFloatComplex& rhs) {
+  return cuCsubf(lhs, rhs);
+}
+
+// Complex * complex (single precision), forwarded to cuCmulf.
+__forceinline__ __host__ __device__ cuFloatComplex
+operator*(const cuFloatComplex& lhs, const cuFloatComplex& rhs) {
+  return cuCmulf(lhs, rhs);
+}
+
+// Complex / complex (single precision), forwarded to cuCdivf.
+__forceinline__ __host__ __device__ cuFloatComplex
+operator/(const cuFloatComplex& lhs, const cuFloatComplex& rhs) {
+  return cuCdivf(lhs, rhs);
+}
+
+// Component-wise floored modulo of two single-precision complex numbers:
+// each component is reduced independently as a - floorf(a / b) * b.
+__forceinline__ __host__ __device__ cuFloatComplex
+operator%(const cuFloatComplex& lhs, const cuFloatComplex& rhs) {
+  float re =
+      cuCrealf(lhs) - (floorf(cuCrealf(lhs) / cuCrealf(rhs)) * cuCrealf(rhs));
+  float im =
+      cuCimagf(lhs) - (floorf(cuCimagf(lhs) / cuCimagf(rhs)) * cuCimagf(rhs));
+  return make_cuFloatComplex(re, im);
+}
+
+// Equality: both the real and the imaginary components must compare equal.
+__forceinline__ __host__ __device__ bool operator==(
+    const cuFloatComplex& lhs,
+    const cuFloatComplex& rhs) {
+  return cuCrealf(lhs) == cuCrealf(rhs) && cuCimagf(lhs) == cuCimagf(rhs);
+}
+
+// Inequality: negation of operator==.
+__forceinline__ __host__ __device__ bool operator!=(
+    const cuFloatComplex& lhs,
+    const cuFloatComplex& rhs) {
+  return !(lhs == rhs);
+}
+
+// Ordering is defined by magnitude: lhs > rhs when |lhs| > |rhs|.
+__forceinline__ __host__ __device__ bool operator>(
+    const cuFloatComplex& lhs,
+    const cuFloatComplex& rhs) {
+  float norm_lhs =
+      sqrt(cuCrealf(lhs) * cuCrealf(lhs) + cuCimagf(lhs) * cuCimagf(lhs));
+  float norm_rhs =
+      sqrt(cuCrealf(rhs) * cuCrealf(rhs) + cuCimagf(rhs) * cuCimagf(rhs));
+  return norm_lhs > norm_rhs;
+}
+
+// Greater by magnitude, or component-wise equal.
+__forceinline__ __host__ __device__ bool operator>=(
+    const cuFloatComplex& lhs,
+    const cuFloatComplex& rhs) {
+  return lhs > rhs || lhs == rhs;
+}
+
+// Less by magnitude, expressed through operator>.
+__forceinline__ __host__ __device__ bool operator<(
+    const cuFloatComplex& lhs,
+    const cuFloatComplex& rhs) {
+  return rhs > lhs;
+}
+
+// Less by magnitude, or component-wise equal.
+__forceinline__ __host__ __device__ bool operator<=(
+    const cuFloatComplex& lhs,
+    const cuFloatComplex& rhs) {
+  return rhs > lhs || lhs == rhs;
+}
+
+// Mixed complex/real arithmetic (single precision). The real operand is
+// treated as a complex number with zero imaginary part.
+
+// complex + real: only the real component changes.
+__forceinline__ __host__ __device__ cuFloatComplex
+operator+(const cuFloatComplex& z, float s) {
+  return make_cuFloatComplex(cuCrealf(z) + s, cuCimagf(z));
+}
+
+// real + complex.
+__forceinline__ __host__ __device__ cuFloatComplex
+operator+(float s, const cuFloatComplex& z) {
+  return make_cuFloatComplex(s + cuCrealf(z), cuCimagf(z));
+}
+
+// complex - real: only the real component changes.
+__forceinline__ __host__ __device__ cuFloatComplex
+operator-(const cuFloatComplex& z, float s) {
+  return make_cuFloatComplex(cuCrealf(z) - s, cuCimagf(z));
+}
+
+// real - complex: the imaginary component is negated.
+__forceinline__ __host__ __device__ cuFloatComplex
+operator-(float s, const cuFloatComplex& z) {
+  return make_cuFloatComplex(s - cuCrealf(z), -cuCimagf(z));
+}
+
+// complex * real: both components are scaled.
+__forceinline__ __host__ __device__ cuFloatComplex
+operator*(const cuFloatComplex& z, float s) {
+  return make_cuFloatComplex(cuCrealf(z) * s, cuCimagf(z) * s);
+}
+
+// real * complex.
+__forceinline__ __host__ __device__ cuFloatComplex
+operator*(float s, const cuFloatComplex& z) {
+  return make_cuFloatComplex(s * cuCrealf(z), s * cuCimagf(z));
+}
+
+// complex / real: both components are divided.
+__forceinline__ __host__ __device__ cuFloatComplex
+operator/(const cuFloatComplex& z, float s) {
+  return make_cuFloatComplex(cuCrealf(z) / s, cuCimagf(z) / s);
+}
+
+// real / complex: s * conj(z) / |z|^2.
+__forceinline__ __host__ __device__ cuFloatComplex
+operator/(float s, const cuFloatComplex& z) {
+  float norm_sq = cuCrealf(z) * cuCrealf(z) + cuCimagf(z) * cuCimagf(z);
+  return make_cuFloatComplex(
+      (s * cuCrealf(z)) / norm_sq, (-s * cuCimagf(z)) / norm_sq);
+}
--- a/mlx/backend/cuda/device/fp16_math.cuh
+++ b/mlx/backend/cuda/device/fp16_math.cuh
@@ -0,0 +1,194 @@
+// Copyright © 2025 Apple Inc.
+
+#pragma once
+
+#include <cuda_bf16.h>
+#include <cuda_fp16.h>
+#include <cuda/std/limits>
+#include <cuda/std/type_traits>
+
+namespace mlx::core::cu {
+
+///////////////////////////////////////////////////////////////////////////////
+// Unary ops for half types.
+///////////////////////////////////////////////////////////////////////////////
+
+// MLX_DEFINE_UNARY_OP(NAME, HALF_OP) defines a function template NAME(x) in
+// this namespace that calls HALF_OP for half-precision arguments and the
+// global ::NAME for every other type.
+//
+// In the first variant (CUDA runtime older than 12.0 and __CUDA_ARCH__ below
+// 800) only __half is routed to HALF_OP; the second variant also routes
+// __nv_bfloat16 to HALF_OP.
+#if CUDART_VERSION < 12000 && __CUDA_ARCH__ < 800
+#define MLX_DEFINE_UNARY_OP(NAME, HALF_OP)           \
+  template <typename T>                              \
+  __forceinline__ __device__ auto NAME(T x) {        \
+    if constexpr (cuda::std::is_same_v<T, __half>) { \
+      return HALF_OP(x);                             \
+    } else {                                         \
+      return ::NAME(x);                              \
+    }                                                \
+  }
+#else
+#define MLX_DEFINE_UNARY_OP(NAME, HALF_OP)                         \
+  template <typename T>                                            \
+  __forceinline__ __device__ auto NAME(T x) {                      \
+    if constexpr (cuda::std::is_same_v<T, __half>) {               \
+      return HALF_OP(x);                                           \
+    } else if constexpr (cuda::std::is_same_v<T, __nv_bfloat16>) { \
+      return HALF_OP(x);                                           \
+    } else {                                                       \
+      return ::NAME(x);                                            \
+    }                                                              \
+  }
+#endif
+
+// MLX_DEFINE_UNARY_OP_FALLBCK(NAME) defines a function template NAME(x) for
+// operations without a dedicated half-precision intrinsic: __half and
+// __nv_bfloat16 arguments are converted to float before calling ::NAME, and
+// the result is returned as whatever ::NAME returns (no conversion back).
+#define MLX_DEFINE_UNARY_OP_FALLBCK(NAME)                          \
+  template <typename T>                                            \
+  __forceinline__ __device__ auto NAME(T x) {                      \
+    if constexpr (cuda::std::is_same_v<T, __half>) {               \
+      return ::NAME(__half2float(x));                              \
+    } else if constexpr (cuda::std::is_same_v<T, __nv_bfloat16>) { \
+      return ::NAME(__bfloat162float(x));                          \
+    } else {                                                       \
+      return ::NAME(x);                                            \
+    }                                                              \
+  }
+
+// Unary functions that use a half-precision intrinsic for half types.
+MLX_DEFINE_UNARY_OP(abs, __habs)
+MLX_DEFINE_UNARY_OP(ceil, hceil)
+MLX_DEFINE_UNARY_OP(cos, hcos)
+MLX_DEFINE_UNARY_OP(exp, hexp)
+MLX_DEFINE_UNARY_OP(floor, hfloor)
+MLX_DEFINE_UNARY_OP(isnan, __hisnan)
+MLX_DEFINE_UNARY_OP(log, hlog)
+MLX_DEFINE_UNARY_OP(log2, hlog2)
+MLX_DEFINE_UNARY_OP(log10, hlog10)
+MLX_DEFINE_UNARY_OP(rint, hrint)
+MLX_DEFINE_UNARY_OP(rsqrt, hrsqrt)
+MLX_DEFINE_UNARY_OP(sin, hsin)
+MLX_DEFINE_UNARY_OP(sqrt, hsqrt)
+// Unary functions that are evaluated in float for half types.
+MLX_DEFINE_UNARY_OP_FALLBCK(acos)
+MLX_DEFINE_UNARY_OP_FALLBCK(acosh)
+MLX_DEFINE_UNARY_OP_FALLBCK(asin)
+MLX_DEFINE_UNARY_OP_FALLBCK(asinh)
+MLX_DEFINE_UNARY_OP_FALLBCK(atan)
+MLX_DEFINE_UNARY_OP_FALLBCK(atanh)
+MLX_DEFINE_UNARY_OP_FALLBCK(cosh)
+MLX_DEFINE_UNARY_OP_FALLBCK(log1p)
+MLX_DEFINE_UNARY_OP_FALLBCK(sinh)
+MLX_DEFINE_UNARY_OP_FALLBCK(tan)
+// tanh uses the htanh intrinsic only when the guard below holds, otherwise
+// the float fallback.
+// NOTE(review): 1280 does not look like a __CUDA_ARCH__ value of any
+// existing architecture; this may have been meant as a toolkit version
+// check. As written the fallback branch is presumably always taken -
+// confirm the intended condition.
+#if __CUDA_ARCH__ >= 1280
+MLX_DEFINE_UNARY_OP(tanh, htanh)
+#else
+MLX_DEFINE_UNARY_OP_FALLBCK(tanh)
+#endif
+
+// The helper macros are local to this header.
+#undef MLX_DEFINE_UNARY_OP
+#undef MLX_DEFINE_UNARY_OP_FALLBCK
+
+///////////////////////////////////////////////////////////////////////////////
+// Binary ops for half types.
+///////////////////////////////////////////////////////////////////////////////
+
+// MLX_DEFINE_BINARY_OP(NAME, HALF_OP) defines a function template NAME(x, y)
+// that calls HALF_OP for half-precision arguments and the global ::NAME for
+// every other type. As with the unary macro, the first variant (CUDA runtime
+// older than 12.0 and __CUDA_ARCH__ below 800) only routes __half to
+// HALF_OP, while the second also routes __nv_bfloat16.
+#if CUDART_VERSION < 12000 && __CUDA_ARCH__ < 800
+#define MLX_DEFINE_BINARY_OP(NAME, HALF_OP)          \
+  template <typename T>                              \
+  __forceinline__ __device__ auto NAME(T x, T y) {   \
+    if constexpr (cuda::std::is_same_v<T, __half>) { \
+      return HALF_OP(x, y);                          \
+    } else {                                         \
+      return ::NAME(x, y);                           \
+    }                                                \
+  }
+#else
+#define MLX_DEFINE_BINARY_OP(NAME, HALF_OP)                        \
+  template <typename T>                                            \
+  __forceinline__ __device__ auto NAME(T x, T y) {                 \
+    if constexpr (cuda::std::is_same_v<T, __half>) {               \
+      return HALF_OP(x, y);                                        \
+    } else if constexpr (cuda::std::is_same_v<T, __nv_bfloat16>) { \
+      return HALF_OP(x, y);                                        \
+    } else {                                                       \
+      return ::NAME(x, y);                                         \
+    }                                                              \
+  }
+#endif
+
+// max/min use the __hmax/__hmin intrinsics for half types.
+MLX_DEFINE_BINARY_OP(max, __hmax)
+MLX_DEFINE_BINARY_OP(min, __hmin)
+
+// The helper macro is local to this header.
+#undef MLX_DEFINE_BINARY_OP
+
+// Floating-point remainder that also accepts half types.
+//
+// __half (and, when the preprocessor guard below holds, __nv_bfloat16)
+// operands are converted to float, passed to ::fmod, and the result is
+// converted back to the half type. All other types call ::fmod directly.
+template <typename T>
+__forceinline__ __device__ T fmod(T x, T y) {
+  if constexpr (cuda::std::is_same_v<T, __half>) {
+    return __float2half(::fmod(__half2float(x), __half2float(y)));
+#if CUDART_VERSION >= 12000 || __CUDA_ARCH__ >= 800
+  } else if constexpr (cuda::std::is_same_v<T, __nv_bfloat16>) {
+    return __float2bfloat16(::fmod(__bfloat162float(x), __bfloat162float(y)));
+#endif
+  } else {
+    return ::fmod(x, y);
+  }
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// Additional C++ operator overloads between half types and native types.
+///////////////////////////////////////////////////////////////////////////////
+
+// True when T is an integral type other than U.
+template <typename T, typename U>
+constexpr bool is_integral_except =
+    !cuda::std::is_same_v<T, U> && cuda::std::is_integral_v<T>;
+
+// True when T is an arithmetic type other than U.
+template <typename T, typename U>
+constexpr bool is_arithmetic_except =
+    !cuda::std::is_same_v<T, U> && cuda::std::is_arithmetic_v<T>;
+
+// Defines arithmetic operator OP between HALF and any integral type T other
+// than HALF, in both operand orders. Both operands are converted to float,
+// combined as `x OP y`, and the result is converted back to HALF.
+#define MLX_DEFINE_HALF_OP(HALF, HALF2FLOAT, FLOAT2HALF, OP)          \
+  template <                                                          \
+      typename T,                                                     \
+      typename = cuda::std::enable_if_t<is_integral_except<T, HALF>>> \
+  __forceinline__ __device__ HALF operator OP(HALF x, T y) {          \
+    return FLOAT2HALF(HALF2FLOAT(x) OP static_cast<float>(y));        \
+  }                                                                   \
+  template <                                                          \
+      typename T,                                                     \
+      typename = cuda::std::enable_if_t<is_integral_except<T, HALF>>> \
+  __forceinline__ __device__ HALF operator OP(T x, HALF y) {          \
+    return FLOAT2HALF(static_cast<float>(x) OP HALF2FLOAT(y));        \
+  }
+
+// Defines comparison operator OP between HALF and any arithmetic type T
+// other than HALF, in both operand orders. Both operands are converted to
+// float and compared as `x OP y`, so the ordering operators (<, >, <=, >=)
+// keep the operand order written by the caller regardless of which side the
+// half value is on.
+#define MLX_DEFINE_HALF_CMP(HALF, HALF2FLOAT, OP)                       \
+  template <                                                            \
+      typename T,                                                       \
+      typename = cuda::std::enable_if_t<is_arithmetic_except<T, HALF>>> \
+  __forceinline__ __device__ bool operator OP(HALF x, T y) {            \
+    return HALF2FLOAT(x) OP static_cast<float>(y);                      \
+  }                                                                     \
+  template <                                                            \
+      typename T,                                                       \
+      typename = cuda::std::enable_if_t<is_arithmetic_except<T, HALF>>> \
+  __forceinline__ __device__ bool operator OP(T x, HALF y) {            \
+    return static_cast<float>(x) OP HALF2FLOAT(y);                      \
+  }
+
+// Arithmetic operators between half types and integral types.
+MLX_DEFINE_HALF_OP(__half, __half2float, __float2half, +)
+MLX_DEFINE_HALF_OP(__half, __half2float, __float2half, -)
+MLX_DEFINE_HALF_OP(__half, __half2float, __float2half, *)
+MLX_DEFINE_HALF_OP(__half, __half2float, __float2half, /)
+MLX_DEFINE_HALF_OP(__nv_bfloat16, __bfloat162float, __float2bfloat16, +)
+MLX_DEFINE_HALF_OP(__nv_bfloat16, __bfloat162float, __float2bfloat16, -)
+MLX_DEFINE_HALF_OP(__nv_bfloat16, __bfloat162float, __float2bfloat16, *)
+MLX_DEFINE_HALF_OP(__nv_bfloat16, __bfloat162float, __float2bfloat16, /)
+// Comparison operators between half types and arithmetic types.
+MLX_DEFINE_HALF_CMP(__half, __half2float, <)
+MLX_DEFINE_HALF_CMP(__half, __half2float, >)
+MLX_DEFINE_HALF_CMP(__half, __half2float, <=)
+MLX_DEFINE_HALF_CMP(__half, __half2float, >=)
+MLX_DEFINE_HALF_CMP(__half, __half2float, ==)
+MLX_DEFINE_HALF_CMP(__half, __half2float, !=)
+MLX_DEFINE_HALF_CMP(__nv_bfloat16, __bfloat162float, <)
+MLX_DEFINE_HALF_CMP(__nv_bfloat16, __bfloat162float, >)
+MLX_DEFINE_HALF_CMP(__nv_bfloat16, __bfloat162float, <=)
+MLX_DEFINE_HALF_CMP(__nv_bfloat16, __bfloat162float, >=)
+MLX_DEFINE_HALF_CMP(__nv_bfloat16, __bfloat162float, ==)
+MLX_DEFINE_HALF_CMP(__nv_bfloat16, __bfloat162float, !=)
+
+// The helper macros are local to this header.
+#undef MLX_DEFINE_HALF_OP
+#undef MLX_DEFINE_HALF_CMP
+
+} // namespace mlx::core::cu
--- a/mlx/backend/cuda/device/gather.cuh
+++ b/mlx/backend/cuda/device/gather.cuh
@@ -0,0 +1,53 @@
+// Copyright © 2025 Apple Inc.
+
+#include "mlx/backend/cuda/device/indexing.cuh"
+#include "mlx/backend/cuda/device/utils.cuh"
+
+#include <cooperative_groups.h>
+
+namespace mlx::core::cu {
+
+namespace cg = cooperative_groups;
+
+// Gather kernel: each thread writes one element of `out`.
+//
+// The flat output index is split into the position inside a gathered slice
+// (out_idx % slice_size) and the position of the index tuple
+// (out_idx / slice_size). The source offset is the in-slice offset plus, for
+// each of the NIDX index arrays, the looked-up index multiplied by the
+// source stride of the axis that index array addresses.
+template <typename T, typename IdxT, int NIDX, int IDX_NDIM, typename LocT>
+__global__ void gather(
+    const T* src,
+    T* out,
+    LocT size,
+    const __grid_constant__ Shape src_shape,
+    const __grid_constant__ Strides src_strides,
+    int32_t src_ndim,
+    const __grid_constant__ Shape slice_sizes,
+    uint32_t slice_size,
+    const __grid_constant__ cuda::std::array<int32_t, NIDX> axes,
+    const __grid_constant__ cuda::std::array<IdxT*, NIDX> indices,
+    const __grid_constant__ cuda::std::array<int32_t, NIDX * IDX_NDIM>
+        indices_shape,
+    const __grid_constant__ cuda::std::array<int64_t, NIDX * IDX_NDIM>
+        indices_strides) {
+  // One thread per output element; surplus threads exit early.
+  LocT out_idx = cg::this_grid().thread_rank();
+  if (out_idx >= size) {
+    return;
+  }
+
+  LocT src_elem = out_idx % slice_size;
+  LocT idx_elem = out_idx / slice_size;
+
+  // Offset of the element inside the slice (elem_to_loc comes from
+  // utils.cuh; presumably it maps a linear index to a strided offset).
+  LocT src_loc =
+      elem_to_loc(src_elem, slice_sizes.data(), src_strides.data(), src_ndim);
+
+#pragma unroll
+  for (int i = 0; i < NIDX; ++i) {
+    // Shape and strides of the i-th index array are stored back to back,
+    // IDX_NDIM entries per array.
+    LocT idx_loc = elem_to_loc_nd<IDX_NDIM>(
+        idx_elem,
+        indices_shape.data() + i * IDX_NDIM,
+        indices_strides.data() + i * IDX_NDIM);
+    int32_t axis = axes[i];
+    // Negative indices are wrapped relative to the size of the axis.
+    LocT idx_val = absolute_index(indices[i][idx_loc], src_shape[axis]);
+    src_loc += idx_val * src_strides[axis];
+  }
+
+  out[out_idx] = src[src_loc];
+}
+
+} // namespace mlx::core::cu
--- a/mlx/backend/cuda/device/gather_axis.cuh
+++ b/mlx/backend/cuda/device/gather_axis.cuh
@@ -0,0 +1,65 @@
+// Copyright © 2025 Apple Inc.
+
+#include "mlx/backend/cuda/device/indexing.cuh"
+#include "mlx/backend/cuda/device/utils.cuh"
+
+#include <cooperative_groups.h>
+
+namespace mlx::core::cu {
+
+namespace cg = cooperative_groups;
+
+// Gather-along-axis kernel: each thread copies one element from `src` to
+// `out`, where the position along the axis is read from `indices`.
+//
+// SrcC / IdxC select a direct offset computation for `src` / `indices`
+// instead of the generic strided lookup through elem_to_loc_nd.
+template <
+    typename T,
+    typename IdxT,
+    int NDIM,
+    bool SrcC,
+    bool IdxC,
+    typename LocT>
+__global__ void gather_axis(
+    const T* src,
+    const IdxT* indices,
+    T* out,
+    LocT idx_size_pre,
+    LocT idx_size_axis,
+    LocT idx_size_post,
+    const __grid_constant__ cuda::std::array<int32_t, NDIM> shape,
+    const __grid_constant__ cuda::std::array<int64_t, NDIM> src_strides,
+    const __grid_constant__ cuda::std::array<int64_t, NDIM> idx_strides,
+    int32_t axis,
+    int32_t axis_size,
+    int64_t src_stride_axis,
+    int64_t idx_stride_axis) {
+  // One thread per element of the index array; surplus threads exit early.
+  LocT index = cg::this_grid().thread_rank();
+  if (index >= idx_size_pre * idx_size_axis * idx_size_post) {
+    return;
+  }
+
+  // index = x * (idx_size_axis * idx_size_pre) + y * idx_size_pre + z, so
+  // x < idx_size_post, y < idx_size_axis and z < idx_size_pre.
+  auto [x, y, z] = index_to_dims(index, idx_size_axis, idx_size_pre);
+
+  LocT elem_idx = z * idx_size_post;
+
+  // Location of the index value to read.
+  LocT idx_loc = y * idx_stride_axis;
+  if constexpr (IdxC) {
+    idx_loc += elem_idx * idx_size_axis + x;
+  } else {
+    idx_loc +=
+        elem_to_loc_nd<NDIM>(elem_idx + x, shape.data(), idx_strides.data());
+  }
+
+  // Negative indices are wrapped relative to axis_size.
+  auto idx_val = absolute_index(indices[idx_loc], axis_size);
+
+  // Location of the source element: the gathered index replaces the
+  // position along the axis.
+  LocT src_loc = idx_val * src_stride_axis;
+  if constexpr (SrcC) {
+    src_loc += elem_idx * axis_size + x;
+  } else {
+    src_loc +=
+        elem_to_loc_nd<NDIM>(elem_idx + x, shape.data(), src_strides.data());
+  }
+
+  LocT out_idx = y * idx_size_post + elem_idx * idx_size_axis + x;
+
+  out[out_idx] = src[src_loc];
+}
+
+} // namespace mlx::core::cu
--- a/mlx/backend/cuda/device/indexing.cuh
+++ b/mlx/backend/cuda/device/indexing.cuh
@@ -0,0 +1,30 @@
+// Copyright © 2025 Apple Inc.
+
+#include <cuda/std/tuple>
+#include <cuda/std/type_traits>
+
+namespace mlx::core::cu {
+
+// Split a flat index into its (x, y, z) coordinates in a 3d grid, i.e. the
+// inverse of:
+// index = x * dim1 * dim2 + y * dim2 + z
+template <typename T>
+inline __host__ __device__ cuda::std::tuple<T, T, T>
+index_to_dims(T index, T dim1, T dim2) {
+  // Number of elements covered by one step of x.
+  const auto plane = dim1 * dim2;
+  T x = index / plane;
+  T y = (index % plane) / dim2;
+  T z = index % dim2;
+  return cuda::std::make_tuple(x, y, z);
+}
+
+// Resolve a possibly negative index against an axis of length `size`.
+// Unsigned indices are returned unchanged (as IdxT); signed indices have
+// `size` added when negative and are returned as int32_t.
+template <typename IdxT>
+inline __host__ __device__ auto absolute_index(IdxT idx, int32_t size) {
+  if constexpr (!cuda::std::is_unsigned_v<IdxT>) {
+    if (idx < 0) {
+      return static_cast<int32_t>(idx + size);
+    }
+    return static_cast<int32_t>(idx);
+  } else {
+    return idx;
+  }
+}
+
+} // namespace mlx::core::cu
--- a/mlx/backend/cuda/device/scatter.cuh
+++ b/mlx/backend/cuda/device/scatter.cuh
@@ -0,0 +1,68 @@
+// Copyright © 2025 Apple Inc.
+
+#include "mlx/backend/cuda/device/indexing.cuh"
+#include "mlx/backend/cuda/device/scatter_ops.cuh"
+#include "mlx/backend/cuda/device/utils.cuh"
+
+#include <cooperative_groups.h>
+
+namespace mlx::core::cu {
+
+namespace cg = cooperative_groups;
+
+// Scatter kernel: each thread applies Op to one element of `out` using one
+// element of `upd`.
+//
+// The flat update index is split into the position inside the update slice
+// (upd_idx % upd_post_idx_size) and the position of the index tuple
+// (upd_idx / upd_post_idx_size). The output offset is the in-slice offset
+// plus, for each of the NIDX index arrays, the looked-up index multiplied by
+// the output stride of the axis that index array addresses.
+template <
+    typename T,
+    typename IdxT,
+    typename Op,
+    int NIDX,
+    int IDX_NDIM,
+    typename LocT>
+__global__ void scatter(
+    const T* upd,
+    T* out,
+    LocT size,
+    const __grid_constant__ Shape upd_shape,
+    const __grid_constant__ Strides upd_strides,
+    int32_t upd_ndim,
+    LocT upd_post_idx_size,
+    const __grid_constant__ Shape out_shape,
+    const __grid_constant__ Strides out_strides,
+    int32_t out_ndim,
+    const __grid_constant__ cuda::std::array<int32_t, NIDX> axes,
+    const __grid_constant__ cuda::std::array<IdxT*, NIDX> indices,
+    const __grid_constant__ cuda::std::array<int32_t, NIDX * IDX_NDIM>
+        indices_shape,
+    const __grid_constant__ cuda::std::array<int64_t, NIDX * IDX_NDIM>
+        indices_strides) {
+  // One thread per update element; surplus threads exit early.
+  LocT upd_idx = cg::this_grid().thread_rank();
+  if (upd_idx >= size) {
+    return;
+  }
+
+  LocT out_elem = upd_idx % upd_post_idx_size;
+  LocT idx_elem = upd_idx / upd_post_idx_size;
+
+  // In-slice offset into `out`. The shape used is upd_shape with its first
+  // IDX_NDIM entries skipped.
+  LocT out_idx = elem_to_loc(
+      out_elem, upd_shape.data() + IDX_NDIM, out_strides.data(), out_ndim);
+
+#pragma unroll
+  for (int i = 0; i < NIDX; ++i) {
+    // Shape and strides of the i-th index array are stored back to back,
+    // IDX_NDIM entries per array.
+    LocT idx_loc = elem_to_loc_nd<IDX_NDIM>(
+        idx_elem,
+        indices_shape.data() + i * IDX_NDIM,
+        indices_strides.data() + i * IDX_NDIM);
+    int32_t axis = axes[i];
+    // Negative indices are wrapped relative to the size of the axis.
+    LocT idx_val = absolute_index(indices[i][idx_loc], out_shape[axis]);
+    out_idx += idx_val * out_strides[axis];
+  }
+
+  // Strided location of this thread's update value.
+  LocT upd_loc = elem_to_loc(
+      out_elem + idx_elem * upd_post_idx_size,
+      upd_shape.data(),
+      upd_strides.data(),
+      upd_ndim);
+
+  // Op receives the destination pointer and the update value.
+  Op{}(out + out_idx, upd[upd_loc]);
+}
+
+} // namespace mlx::core::cu
--- a/mlx/backend/cuda/device/scatter_axis.cuh
+++ b/mlx/backend/cuda/device/scatter_axis.cuh
@@ -0,0 +1,67 @@
+// Copyright © 2025 Apple Inc.
+
+#include "mlx/backend/cuda/device/indexing.cuh"
+#include "mlx/backend/cuda/device/scatter_ops.cuh"
+#include "mlx/backend/cuda/device/utils.cuh"
+
+#include <cooperative_groups.h>
+
+namespace mlx::core::cu {
+
+namespace cg = cooperative_groups;
+
+template <
+    typename T,
+    typename IdxT,
+    typename Op,
+    int NDIM,
+    bool UpdC, // Selects the contiguous addressing path for `upd`.
+    bool IdxC, // Selects the contiguous addressing path for `indices`.
+    typename LocT>
+__global__ void scatter_axis( // Each thread applies Op to one element of `upd` at an index-selected position along one axis of `out`.
+    const T* upd,
+    const IdxT* indices,
+    T* out,
+    LocT idx_size_pre,
+    LocT idx_size_axis,
+    LocT idx_size_post,
+    const __grid_constant__ cuda::std::array<int32_t, NDIM> shape,
+    const __grid_constant__ cuda::std::array<int64_t, NDIM> upd_strides,
+    const __grid_constant__ cuda::std::array<int64_t, NDIM> idx_strides,
+    int32_t axis,
+    int32_t axis_size, // Passed to absolute_index and used as a multiplier in the output offset.
+    int64_t upd_stride_axis,
+    int64_t idx_stride_axis) {
+  LocT index = cg::this_grid().thread_rank(); // Grid-wide linear thread rank.
+  if (index >= idx_size_pre * idx_size_axis * idx_size_post) { // Threads past the index volume exit.
+    return;
+  }
+
+  auto [x, y, z] = index_to_dims(index, idx_size_axis, idx_size_pre); // NOTE(review): the decomposition order is defined in indexing.cuh - confirm.
+
+  LocT elem_idx = z * idx_size_post;
+
+  LocT idx_loc = y * idx_stride_axis; // y is the coordinate along the axis in `indices`.
+  if constexpr (IdxC) {
+    idx_loc += elem_idx * idx_size_axis + x;
+  } else {
+    idx_loc +=
+        elem_to_loc_nd<NDIM>(elem_idx + x, shape.data(), idx_strides.data());
+  }
+
+  auto idx_val = absolute_index(indices[idx_loc], axis_size); // NOTE(review): presumably wraps negative indices by axis_size - confirm in indexing.cuh.
+
+  LocT upd_loc = y * upd_stride_axis; // Same addressing scheme as idx_loc, with the update strides.
+  if constexpr (UpdC) {
+    upd_loc += elem_idx * idx_size_axis + x;
+  } else {
+    upd_loc +=
+        elem_to_loc_nd<NDIM>(elem_idx + x, shape.data(), upd_strides.data());
+  }
+
+  LocT out_idx = idx_val * idx_size_post + elem_idx * axis_size + x; // idx_val takes the place of y as the axis coordinate.
+
+  Op{}(out + out_idx, upd[upd_loc]);
+}
+
+} // namespace mlx::core::cu
--- a/mlx/backend/cuda/device/scatter_ops.cuh
+++ b/mlx/backend/cuda/device/scatter_ops.cuh
@@ -0,0 +1,44 @@
+// Copyright © 2025 Apple Inc.
+
+#pragma once
+
+#include "mlx/backend/cuda/device/atomic_ops.cuh"
+
+namespace mlx::core::cu {
+
+struct ScatterAssign { // Scatter op that overwrites the destination.
+  template <typename T>
+  __device__ void operator()(T* out, T val) const {
+    *out = val; // Plain store, no atomic helper is used.
+  }
+};
+
+struct ScatterSum { // Scatter op that accumulates into the destination.
+  template <typename T>
+  __device__ void operator()(T* out, T val) const {
+    atomic_add(out, val); // Helper from atomic_ops.cuh.
+  }
+};
+
+struct ScatterProd { // Scatter op that multiplies into the destination.
+  template <typename T>
+  __device__ void operator()(T* out, T val) const {
+    atomic_prod(out, val); // Helper from atomic_ops.cuh.
+  }
+};
+
+struct ScatterMax { // Scatter op that combines with the destination through atomic_max.
+  template <typename T>
+  __device__ void operator()(T* out, T val) const {
+    atomic_max(out, val); // Helper from atomic_ops.cuh.
+  }
+};
+
+struct ScatterMin { // Scatter op that combines with the destination through atomic_min.
+  template <typename T>
+  __device__ void operator()(T* out, T val) const {
+    atomic_min(out, val); // Helper from atomic_ops.cuh.
+  }
+};
+
+} // namespace mlx::core::cu
--- a/mlx/backend/cuda/device/ternary_ops.cuh
+++ b/mlx/backend/cuda/device/ternary_ops.cuh
@@ -0,0 +1,13 @@
+// Copyright © 2025 Apple Inc.
+#pragma once
+
+namespace mlx::core::cu {
+
+struct Select { // Ternary functor: picks between two values based on a condition.
+  template <typename T>
+  __device__ T operator()(bool condition, T x, T y) {
+    return condition ? x : y; // x when condition is true, y otherwise.
+  }
+};
+
+} // namespace mlx::core::cu
--- a/mlx/backend/cuda/device/unary_ops.cuh
+++ b/mlx/backend/cuda/device/unary_ops.cuh
@@ -0,0 +1,426 @@
+// Copyright © 2025 Apple Inc.
+
+#pragma once
+
+#include "mlx/backend/cuda/device/cexpf.cuh"
+#include "mlx/backend/cuda/device/cucomplex_math.cuh"
+#include "mlx/backend/cuda/device/fp16_math.cuh"
+#include "mlx/backend/cuda/device/utils.cuh"
+
+#include <math_constants.h>
+
+namespace mlx::core::cu {
+
+struct Abs {
+  template <typename T>
+  __device__ T operator()(T x) {
+    if constexpr (cuda::std::is_same_v<T, cuComplex>) {
+      return {sqrt(cuCrealf(x) * cuCrealf(x) + cuCimagf(x) * cuCimagf(x)), 0}; // Magnitude in the real part, zero imaginary part.
+    } else if constexpr (cuda::std::is_unsigned_v<T>) {
+      return x; // Unsigned values are returned unchanged.
+    } else {
+      return abs(x);
+    }
+  }
+};
+
+struct ArcCos {
+  template <typename T>
+  __device__ T operator()(T x) {
+    return acos(x); // Generic path: forwards to the acos overload for T.
+  }
+
+  __device__ cuComplex operator()(cuComplex x); // Defined out of line near the end of this header, after Log and Sqrt which it uses.
+};
+
+struct ArcCosh { // Unary functor for the inverse hyperbolic cosine.
+  template <typename T>
+  __device__ T operator()(T x) {
+    return acosh(x); // Forwards to the acosh overload for T.
+  }
+};
+
+struct ArcSin {
+  template <typename T>
+  __device__ T operator()(T x) {
+    return asin(x); // Generic path: forwards to the asin overload for T.
+  }
+
+  __device__ cuComplex operator()(cuComplex x); // Defined out of line near the end of this header, after Log and Sqrt which it uses.
+};
+
+struct ArcSinh { // Unary functor for the inverse hyperbolic sine.
+  template <typename T>
+  __device__ T operator()(T x) {
+    return asinh(x); // Forwards to the asinh overload for T.
+  }
+};
+
+struct ArcTan {
+  template <typename T>
+  __device__ T operator()(T x) {
+    return atan(x); // Generic path: forwards to the atan overload for T.
+  }
+
+  __device__ cuComplex operator()(cuComplex x); // Defined out of line near the end of this header, after Log which it uses.
+};
+
+struct ArcTanh { // Unary functor for the inverse hyperbolic tangent.
+  template <typename T>
+  __device__ T operator()(T x) {
+    return atanh(x); // Forwards to the atanh overload for T.
+  }
+};
+
+struct BitwiseInvert { // Unary functor for bitwise complement.
+  template <typename T>
+  __device__ T operator()(T x) {
+    return ~x; // Result is converted back to T on return.
+  }
+};
+
+struct Ceil {
+  template <typename T>
+  __device__ T operator()(T x) {
+    if constexpr (!cuda::std::is_integral_v<T>) {
+      return ceil(x);
+    } else {
+      return x; // Integral inputs are returned unchanged.
+    }
+  }
+};
+
+struct Conjugate { // Complex conjugate; only defined for cuComplex.
+  __device__ cuComplex operator()(cuComplex x) {
+    return {cuCrealf(x), -cuCimagf(x)}; // Same real part, negated imaginary part.
+  }
+};
+
+struct Cos {
+  template <typename T>
+  __device__ T operator()(T x) {
+    if constexpr (!cuda::std::is_same_v<T, cuComplex>) {
+      return cos(x);
+    } else {
+      // cos(a + bi) = cos(a) cosh(b) - i sin(a) sinh(b)
+      auto re = cuCrealf(x), im = cuCimagf(x);
+      return {cos(re) * cosh(im), -sin(re) * sinh(im)};
+    }
+  }
+};
+
+struct Cosh {
+  template <typename T>
+  __device__ T operator()(T x) {
+    if constexpr (!cuda::std::is_same_v<T, cuComplex>) {
+      return cosh(x);
+    } else {
+      // cosh(a + bi) = cosh(a) cos(b) + i sinh(a) sin(b)
+      auto re = cuCrealf(x), im = cuCimagf(x);
+      return {cosh(re) * cos(im), sinh(re) * sin(im)};
+    }
+  }
+};
+
+struct Erf {
+  template <typename T>
+  __device__ T operator()(T x) {
+    if constexpr (cuda::std::is_same_v<T, __nv_bfloat16>) {
+      return erf(__bfloat162float(x)); // Evaluated in float, converted back on return.
+    } else if constexpr (cuda::std::is_same_v<T, __half>) {
+      return erf(__half2float(x)); // Evaluated in float, converted back on return.
+    } else {
+      return erf(x);
+    }
+  }
+};
+
+struct ErfInv {
+  template <typename T>
+  __device__ T operator()(T x) {
+    if constexpr (cuda::std::is_same_v<T, __nv_bfloat16>) {
+      return erfinv(__bfloat162float(x)); // Evaluated in float, converted back on return.
+    } else if constexpr (cuda::std::is_same_v<T, __half>) {
+      return erfinv(__half2float(x)); // Evaluated in float, converted back on return.
+    } else {
+      return erfinv(x);
+    }
+  }
+};
+
+struct Exp {
+  template <typename T>
+  __device__ T operator()(T x) {
+    if constexpr (!cuda::std::is_same_v<T, cuComplex>) {
+      return exp(x);
+    } else {
+      return detail::cexpf(x); // Complex exponential helper from cexpf.cuh.
+    }
+  }
+};
+
+struct Expm1 {
+  template <typename T>
+  __device__ T operator()(T x) {
+    if constexpr (cuda::std::is_same_v<T, __nv_bfloat16>) {
+      return expm1(__bfloat162float(x)); // Evaluated in float, converted back on return.
+    } else if constexpr (cuda::std::is_same_v<T, __half>) {
+      return expm1(__half2float(x)); // Evaluated in float, converted back on return.
+    } else {
+      return expm1(x);
+    }
+  }
+};
+
+struct Floor {
+  template <typename T>
+  __device__ T operator()(T x) {
+    if constexpr (!cuda::std::is_integral_v<T>) {
+      return floor(x);
+    } else {
+      return x; // Integral inputs are returned unchanged.
+    }
+  }
+};
+
+struct Imag { // Extracts the imaginary part; only defined for cuComplex.
+  __device__ float operator()(cuComplex x) {
+    return cuCimagf(x); // Returned as a plain float.
+  }
+};
+
+struct Log {
+  template <typename T>
+  __device__ T operator()(T x) {
+    if constexpr (!cuda::std::is_same_v<T, cuComplex>) {
+      return log(x);
+    } else {
+      auto log_magnitude = log(cuCrealf(Abs{}(x))); // Abs keeps |x| in the real part.
+      auto phase = atan2f(cuCimagf(x), cuCrealf(x));
+      return {log_magnitude, phase}; // log(z) = log|z| + i * arg(z)
+    }
+  }
+};
+
+struct Log2 {
+  template <typename T>
+  __device__ T operator()(T x) {
+    if constexpr (!cuda::std::is_same_v<T, cuComplex>) {
+      return log2(x);
+    } else {
+      auto ln_x = Log{}(x); // Natural log first, then rescale both components.
+      return {cuCrealf(ln_x) / CUDART_LN2_F, cuCimagf(ln_x) / CUDART_LN2_F};
+    }
+  }
+};
+
+struct Log10 {
+  template <typename T>
+  __device__ T operator()(T x) {
+    if constexpr (cuda::std::is_same_v<T, cuComplex>) {
+      // Natural log divided by CUDART_LNT_F, applied to both components.
+      auto y = Log{}(x);
+      return {cuCrealf(y) / CUDART_LNT_F, cuCimagf(y) / CUDART_LNT_F};
+    } else {
+      return log10(x);
+    }
+  }
+};
+
+struct Log1p { // log(1 + z), with a dedicated complex path.
+  template <typename T>
+  __device__ T operator()(T z) {
+    if constexpr (cuda::std::is_same_v<T, cuComplex>) {
+      float x = cuCrealf(z);
+      float y = cuCimagf(z);
+      float zabs = cuCrealf(Abs{}(z)); // |z|
+      float theta = atan2f(y, x + 1); // Argument of 1 + z, used as the imaginary part of every result.
+      if (zabs < 0.5f) {
+        float r = x * (2 + x) + y * y; // Equals |1 + z|^2 - 1.
+        if (r == 0) { // handle underflow
+          return {x, theta};
+        }
+        return {0.5f * log1pf(r), theta}; // log|1 + z| = 0.5 * log1p(|1 + z|^2 - 1)
+      } else {
+        float z0 = hypotf(x + 1, y); // |1 + z|
+        return {logf(z0), theta};
+      }
+    } else {
+      return log1p(z);
+    }
+  }
+};
+
+struct LogicalNot { // Boolean negation; only defined for bool.
+  __device__ bool operator()(bool x) {
+    return !x;
+  }
+};
+
+struct Negative {
+  template <typename T>
+  __device__ T operator()(T x) {
+    if constexpr (!cuda::std::is_same_v<T, cuComplex>) {
+      return -x;
+    } else {
+      return 0 - x; // The complex path is written as a binary subtraction.
+    }
+  }
+};
+
+struct Real { // Extracts the real part; only defined for cuComplex.
+  __device__ float operator()(cuComplex x) {
+    return cuCrealf(x); // Returned as a plain float.
+  }
+};
+
+struct Round {
+  template <typename T>
+  __device__ T operator()(T x) {
+    if constexpr (!cuda::std::is_same_v<T, cuComplex>) {
+      return rint(x);
+    } else {
+      return {rint(cuCrealf(x)), rint(cuCimagf(x))}; // Each component is rounded on its own.
+    }
+  }
+};
+
+struct Sigmoid { // Logistic function 1 / (1 + exp(-x)).
+  template <typename T>
+  __device__ T operator()(T x) {
+    T y = 1 / (1 + exp(-abs(x))); // exp only ever sees a non-positive argument here.
+    return (x < 0) ? 1 - y : y; // sigmoid(-a) = 1 - sigmoid(a)
+  }
+};
+
+struct Sign { // Sign of a value, with separate unsigned, complex and bfloat16 paths.
+  template <typename T>
+  __device__ T operator()(T x) {
+    if constexpr (cuda::std::is_unsigned_v<T>) {
+      return x != 0; // 0 for zero, 1 otherwise.
+    } else if constexpr (cuda::std::is_same_v<T, cuComplex>) {
+      if (cuCrealf(x) == 0 && cuCimagf(x) == 0) {
+        return x; // Zero is returned as is, so the division below never sees a zero magnitude.
+      } else {
+        return x / Abs()(x); // Abs returns {|x|, 0}, so this divides x by its magnitude.
+      }
+    } else if constexpr (cuda::std::is_same_v<T, __nv_bfloat16>) {
+      return static_cast<float>((x > T(0.f)) - (x < T(0.f))); // Goes through float to build the bfloat16 result.
+    } else {
+      return (x > T(0)) - (x < T(0)); // -1, 0 or +1.
+    }
+  }
+};
+
+struct Sin {
+  template <typename T>
+  __device__ T operator()(T x) {
+    if constexpr (!cuda::std::is_same_v<T, cuComplex>) {
+      return sin(x);
+    } else {
+      // sin(a + bi) = sin(a) cosh(b) + i cos(a) sinh(b)
+      auto re = cuCrealf(x), im = cuCimagf(x);
+      return {sin(re) * cosh(im), cos(re) * sinh(im)};
+    }
+  }
+};
+
+struct Sinh {
+  template <typename T>
+  __device__ T operator()(T x) {
+    if constexpr (!cuda::std::is_same_v<T, cuComplex>) {
+      return sinh(x);
+    } else {
+      // sinh(a + bi) = sinh(a) cos(b) + i cosh(a) sin(b)
+      auto re = cuCrealf(x), im = cuCimagf(x);
+      return {sinh(re) * cos(im), cosh(re) * sin(im)};
+    }
+  }
+};
+
+struct Square { // Unary functor that multiplies a value by itself.
+  template <typename T>
+  __device__ T operator()(T x) {
+    return x * x;
+  }
+};
+
+struct Sqrt { // Square root, with a dedicated cuComplex overload.
+  template <typename T>
+  __device__ T operator()(T x) {
+    return sqrt(x);
+  }
+
+  __device__ cuComplex operator()(cuComplex x) {
+    auto xr = cuCrealf(x);
+    auto xi = cuCimagf(x);
+    if (xr == 0.0f && xi == 0.0f) {
+      return {0.0f, 0.0f}; // Zero input short-circuits to zero.
+    }
+    auto r = cuCrealf(Abs{}(x)); // |x|
+    auto a = sqrt((r + xr) / 2.0f); // Real part of the root.
+    auto b_abs = sqrt((r - xr) / 2.0f); // Magnitude of the imaginary part.
+    auto b = copysign(b_abs, xi); // The imaginary part takes the sign of the input's imaginary part.
+    return {a, b};
+  }
+};
+
+struct Rsqrt { // Reciprocal square root, with a dedicated cuComplex overload.
+  template <typename T>
+  __device__ T operator()(T x) {
+    return rsqrt(x);
+  }
+  __device__ cuComplex operator()(cuComplex x) {
+    return 1.0f / Sqrt{}(x); // Reuses the complex Sqrt overload.
+  }
+};
+
+struct Tan { // Tangent, with a closed-form complex path.
+  template <typename T>
+  __device__ T operator()(T x) {
+    if constexpr (cuda::std::is_same_v<T, cuComplex>) {
+      float tan_a = tan(cuCrealf(x));
+      float tanh_b = tanh(cuCimagf(x));
+      float t1 = tan_a * tanh_b;
+      float denom = 1. + t1 * t1;
+      return {(tan_a - tanh_b * t1) / denom, (tanh_b + tan_a * t1) / denom}; // (tan a + i tanh b) / (1 - i tan a tanh b), expanded.
+    } else {
+      return tan(x);
+    }
+  }
+};
+
+struct Tanh { // Hyperbolic tangent, with a closed-form complex path.
+  template <typename T>
+  __device__ T operator()(T x) {
+    if constexpr (cuda::std::is_same_v<T, cuComplex>) {
+      float tanh_a = tanh(cuCrealf(x));
+      float tan_b = tan(cuCimagf(x));
+      float t1 = tanh_a * tan_b;
+      float denom = 1. + t1 * t1;
+      return {(tanh_a + tan_b * t1) / denom, (tan_b - tanh_a * t1) / denom}; // (tanh a + i tan b) / (1 + i tanh a tan b), expanded.
+    } else {
+      return tanh(x);
+    }
+  }
+};
+
+inline __device__ cuComplex ArcCos::operator()(cuComplex x) {
+  auto i = cuComplex{0.0f, 1.0f}; // acos(x) = -i * log(x + i * sqrt(1 - x^2))
+  auto y = Log{}(x + i * Sqrt{}(1.0f - x * x)); // Float literals, matching the ArcSin and ArcTan definitions.
+  return {cuCimagf(y), -cuCrealf(y)}; // -i * (a + bi) = b - ai
+}
+
+inline __device__ cuComplex ArcSin::operator()(cuComplex x) {
+  auto i = cuComplex{0.0f, 1.0f}; // asin(x) = -i * log(i * x + sqrt(1 - x^2))
+  auto y = Log{}(i * x + Sqrt{}(1.0f - x * x));
+  return {cuCimagf(y), -cuCrealf(y)}; // -i * (a + bi) = b - ai
+}
+
+inline __device__ cuComplex ArcTan::operator()(cuComplex x) {
+  auto i = cuComplex{0.0f, 1.0f}; // atan(x) = 1 / (2i) * log((1 + i * x) / (1 - i * x))
+  auto ix = i * x;
+  return (1.0f / cuComplex{0.0f, 2.0f}) * Log{}((1.0f + ix) / (1.0f - ix));
+}
+
+} // namespace mlx::core::cu
--- a/mlx/backend/cuda/device/utils.cuh
+++ b/mlx/backend/cuda/device/utils.cuh
@@ -0,0 +1,362 @@
+// Copyright © 2025 Apple Inc.
+
+// This file must not include any host-only code; utilities that work under both
+// host and device can be put here.
+//
+// See more about the requirements at:
+// https://docs.nvidia.com/cuda/nvrtc/#language
+
+#pragma once
+
+#include "mlx/backend/cuda/device/config.h"
+
+#include <cuComplex.h>
+#include <cuda_bf16.h>
+#include <cuda_fp16.h>
+#include <cuda/std/array>
+#include <cuda/std/limits>
+#include <cuda/std/tuple>
+
+namespace mlx::core::cu {
+
+///////////////////////////////////////////////////////////////////////////////
+// CUDA kernel utils
+///////////////////////////////////////////////////////////////////////////////
+
+// To pass shape/strides to kernels via constant memory, their size must be
+// known at compile time.
+using Shape = cuda::std::array<int32_t, MAX_NDIM>;
+using Strides = cuda::std::array<int64_t, MAX_NDIM>;
+
+// Vectorized load/store.
+template <typename T, int N>
+struct alignas(sizeof(T) * N) AlignedVector { // N values of T, aligned to the size of the whole group.
+  T val[N];
+};
+
+template <int N, typename T>
+inline __device__ AlignedVector<T, N> load_vector( // Reads N consecutive values of T as one AlignedVector.
+    const T* ptr,
+    uint32_t offset) { // Counted in vectors of N elements, not in single elements.
+  auto* from = reinterpret_cast<const AlignedVector<T, N>*>(ptr); // NOTE(review): assumes ptr satisfies AlignedVector's alignment - confirm at call sites.
+  return from[offset];
+}
+
+template <int N, typename T>
+inline __device__ void
+store_vector(T* ptr, uint32_t offset, const AlignedVector<T, N>& vec) { // Writes N values at once; offset counts vectors, not elements.
+  auto* to = reinterpret_cast<AlignedVector<T, N>*>(ptr); // NOTE(review): assumes ptr satisfies AlignedVector's alignment - confirm at call sites.
+  to[offset] = vec;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// Type limits utils
+///////////////////////////////////////////////////////////////////////////////
+
+template <typename T, typename = void>
+struct Limits { // Primary template: every bound comes straight from numeric_limits<T>.
+  static constexpr __host__ __device__ T max() {
+    return cuda::std::numeric_limits<T>::max();
+  }
+  static constexpr __host__ __device__ T min() {
+    return cuda::std::numeric_limits<T>::min();
+  }
+  static constexpr __host__ __device__ T finite_max() { // Same value as max() in the primary template.
+    return cuda::std::numeric_limits<T>::max();
+  }
+  static constexpr __host__ __device__ T finite_min() { // Same value as min() in the primary template.
+    return cuda::std::numeric_limits<T>::min();
+  }
+};
+
+template <typename T>
+struct Limits< // Specialization for float and double.
+    T,
+    cuda::std::enable_if_t<
+        cuda::std::is_same_v<T, float> || cuda::std::is_same_v<T, double>>> {
+  static constexpr __host__ __device__ T max() { // +infinity
+    return cuda::std::numeric_limits<T>::infinity();
+  }
+  static constexpr __host__ __device__ T min() { // -infinity
+    return -cuda::std::numeric_limits<T>::infinity();
+  }
+  static constexpr __host__ __device__ T finite_max() { // Largest finite value.
+    return cuda::std::numeric_limits<T>::max();
+  }
+  static constexpr __host__ __device__ T finite_min() { // lowest(), not the smallest positive value that numeric_limits::min() gives.
+    return cuda::std::numeric_limits<T>::lowest();
+  }
+};
+
+// CUDA 11 does not have host-side arithmetic operators for half types.
+template <typename T>
+struct Limits< // Specialization for __half and __nv_bfloat16.
+    T,
+    cuda::std::enable_if_t<
+        cuda::std::is_same_v<T, __half> ||
+        cuda::std::is_same_v<T, __nv_bfloat16>>> {
+  static constexpr __host__ __device__ T max() { // +infinity
+    return cuda::std::numeric_limits<T>::infinity();
+  }
+  static constexpr __host__ __device__ T min() { // -infinity
+#if CUDART_VERSION < 12000 && __CUDA_ARCH__ < 800
+    return -cuda::std::numeric_limits<float>::infinity(); // Negated as float, then converted to T.
+#else
+    return -cuda::std::numeric_limits<T>::infinity();
+#endif
+  }
+  static constexpr __host__ __device__ T finite_max() {
+    return cuda::std::numeric_limits<T>::max();
+  }
+  static constexpr __host__ __device__ T finite_min() {
+#if CUDART_VERSION < 12000 && __CUDA_ARCH__ < 800
+    return cuda::std::numeric_limits<float>::lowest(); // NOTE(review): float's lowest() is converted to T here - confirm the intended value.
+#else
+    return cuda::std::numeric_limits<T>::lowest();
+#endif
+  }
+};
+
+template <>
+struct Limits<bool> { // bool specialization; provides only max() and min().
+  static constexpr __host__ __device__ bool max() {
+    return true;
+  }
+  static constexpr __host__ __device__ bool min() {
+    return false;
+  }
+};
+
+template <>
+struct Limits<cuComplex> { // cuComplex specialization; provides only max() and min().
+  static constexpr __host__ __device__ cuComplex max() {
+    return {Limits<float>::max(), Limits<float>::max()}; // Both components use the float max().
+  }
+  static constexpr __host__ __device__ cuComplex min() {
+    return {Limits<float>::min(), Limits<float>::min()}; // Both components use the float min().
+  }
+};
+
+///////////////////////////////////////////////////////////////////////////////
+// Indexing utils
+///////////////////////////////////////////////////////////////////////////////
+
+template <typename IdxT = int64_t>
+inline __host__ __device__ IdxT
+elem_to_loc(IdxT elem, const int* shape, const int64_t* strides, int ndim) { // Maps a linear element index to a strided offset; the last dimension varies fastest.
+  IdxT loc = 0;
+  for (int i = ndim - 1; i >= 0 && elem > 0; --i) { // Stops early once elem is 0: the remaining terms would all be zero.
+    loc += (elem % shape[i]) * IdxT(strides[i]); // Coordinate along dimension i times its stride.
+    elem /= shape[i];
+  }
+  return loc;
+}
+
+// Variant of elem_to_loc for when ndim is known at compile time.
+template <int NDIM, typename IdxT = int64_t>
+inline __host__ __device__ IdxT
+elem_to_loc_nd(IdxT elem, const int* shape, const int64_t* strides) { // Reads exactly NDIM entries of shape and strides.
+  IdxT loc = 0;
+#pragma unroll
+  for (int i = NDIM - 1; i >= 0; --i) { // Always runs all NDIM iterations, with no early exit.
+    loc += (elem % shape[i]) * IdxT(strides[i]);
+    elem /= shape[i];
+  }
+  return loc;
+}
+
+template <int NDIM, typename IdxT = int64_t>
+inline __host__ __device__ cuda::std::tuple<IdxT, IdxT> elem_to_loc_nd( // Computes offsets for two stride sets over one shared shape.
+    IdxT elem,
+    const int* shape,
+    const int64_t* a_strides,
+    const int64_t* b_strides) {
+  IdxT a_loc = 0;
+  IdxT b_loc = 0;
+#pragma unroll
+  for (int i = NDIM - 1; i >= 0; --i) {
+    int dim_idx = elem % shape[i]; // Coordinate along dimension i, computed once for both offsets.
+    a_loc += dim_idx * IdxT(a_strides[i]);
+    b_loc += dim_idx * IdxT(b_strides[i]);
+    elem /= shape[i];
+  }
+  return cuda::std::make_tuple(a_loc, b_loc);
+}
+
+template <int NDIM, typename IdxT = int64_t>
+inline __host__ __device__ cuda::std::tuple<IdxT, IdxT, IdxT> elem_to_loc_nd( // Computes offsets for three stride sets over one shared shape.
+    IdxT elem,
+    const int* shape,
+    const int64_t* a_strides,
+    const int64_t* b_strides,
+    const int64_t* c_strides) {
+  IdxT a_loc = 0;
+  IdxT b_loc = 0;
+  IdxT c_loc = 0;
+#pragma unroll
+  for (int i = NDIM - 1; i >= 0; --i) {
+    int dim_idx = elem % shape[i]; // Coordinate along dimension i, computed once for all three offsets.
+    a_loc += dim_idx * IdxT(a_strides[i]);
+    b_loc += dim_idx * IdxT(b_strides[i]);
+    c_loc += dim_idx * IdxT(c_strides[i]);
+    elem /= shape[i];
+  }
+  return cuda::std::make_tuple(a_loc, b_loc, c_loc);
+}
+
+// Optimized version when ndim is larger than 4.
+template <typename IdxT = int64_t>
+inline __host__ __device__ IdxT
+elem_to_loc_4d(IdxT elem, const int* shape, const int64_t* strides, int ndim) { // Same arithmetic as elem_to_loc, without the early exit on elem == 0.
+  IdxT loc = 0;
+  for (int i = ndim - 1; i >= 0; --i) { // Runtime trip count; always visits every dimension.
+    loc += (elem % shape[i]) * IdxT(strides[i]);
+    elem /= shape[i];
+  }
+  return loc;
+}
+
+template <typename IdxT = int64_t>
+inline __host__ __device__ cuda::std::tuple<IdxT, IdxT> elem_to_loc_4d( // Two-stride form with a runtime ndim.
+    IdxT elem,
+    const int* shape,
+    const int64_t* a_strides,
+    const int64_t* b_strides,
+    int ndim) {
+  IdxT a_loc = 0;
+  IdxT b_loc = 0;
+  for (int i = ndim - 1; i >= 0; --i) {
+    int dim_idx = elem % shape[i]; // Coordinate along dimension i, computed once for both offsets.
+    a_loc += dim_idx * IdxT(a_strides[i]);
+    b_loc += dim_idx * IdxT(b_strides[i]);
+    elem /= shape[i];
+  }
+  return cuda::std::make_tuple(a_loc, b_loc);
+}
+
+template <typename IdxT = int64_t>
+inline __host__ __device__ cuda::std::tuple<IdxT, IdxT, IdxT> elem_to_loc_4d( // Three-stride form with a runtime ndim.
+    IdxT elem,
+    const int* shape,
+    const int64_t* a_strides,
+    const int64_t* b_strides,
+    const int64_t* c_strides,
+    int ndim) {
+  IdxT a_loc = 0;
+  IdxT b_loc = 0;
+  IdxT c_loc = 0;
+  for (int i = ndim - 1; i >= 0; --i) {
+    int dim_idx = elem % shape[i]; // Coordinate along dimension i, computed once for all three offsets.
+    a_loc += dim_idx * IdxT(a_strides[i]);
+    b_loc += dim_idx * IdxT(b_strides[i]);
+    c_loc += dim_idx * IdxT(c_strides[i]);
+    elem /= shape[i];
+  }
+  return cuda::std::make_tuple(a_loc, b_loc, c_loc);
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// Elem to loc in a loop utils
+///////////////////////////////////////////////////////////////////////////////
+
+template <int DIM, bool General = true, typename OffsetT = size_t>
+struct LoopedElemToLoc { // Tracks a strided offset incrementally while stepping through elements.
+  int dim; // Number of dimensions this level covers; it steps along dimension dim - 1.
+  LoopedElemToLoc<DIM - 1, General, OffsetT> inner_looper; // Covers the leading dim - 1 dimensions.
+  OffsetT offset{0};
+  int index{0}; // Current coordinate along dimension dim - 1.
+
+  __device__ LoopedElemToLoc(int dim) : dim(dim), inner_looper(dim - 1) {}
+
+  __device__ void next(const int* shape, const int64_t* strides) { // Advances by one element.
+    if (dim == 0) {
+      return;
+    }
+    index++;
+    offset += OffsetT(strides[dim - 1]);
+    if (index >= shape[dim - 1]) { // Wrapped: carry into the inner looper and restart from its offset.
+      index = 0;
+      inner_looper.next(shape, strides);
+      offset = inner_looper.offset;
+    }
+  }
+
+  __device__ void next(int n, const int* shape, const int64_t* strides) { // Advances by n elements.
+    if (dim == 0) {
+      return;
+    }
+    index += n;
+    offset += n * OffsetT(strides[dim - 1]);
+
+    if (index >= shape[dim - 1]) {
+      int extra = index - shape[dim - 1]; // Steps left over after the first wrap.
+      if (extra >= shape[dim - 1]) { // More than one full wrap: carry them all at once.
+        inner_looper.next(1 + extra / shape[dim - 1], shape, strides);
+        extra = extra % shape[dim - 1];
+      } else {
+        inner_looper.next(shape, strides);
+      }
+      index = 0;
+      offset = inner_looper.offset;
+      if (extra > 0) {
+        next(extra, shape, strides); // Apply the remaining steps within this dimension.
+      }
+    }
+  }
+
+  __device__ OffsetT location() {
+    return offset;
+  }
+};
+
+template <typename OffsetT>
+struct LoopedElemToLoc<1, true, OffsetT> { // Recursion end for the General = true case.
+  int dim;
+  OffsetT offset{0};
+  int index{0}; // Linear element index over all `dim` dimensions.
+
+  __device__ LoopedElemToLoc(int dim) : dim(dim) {}
+
+  __device__ void next(const int* shape, const int64_t* strides) { // Advances by one element.
+    index++;
+    if (dim > 1) {
+      offset = elem_to_loc<OffsetT>(index, shape, strides, dim); // Recomputed from scratch from the linear index.
+    } else {
+      offset += OffsetT(strides[0]);
+    }
+  }
+
+  __device__ void next(int n, const int* shape, const int64_t* strides) { // Advances by n elements.
+    index += n;
+    if (dim > 1) {
+      offset = elem_to_loc<OffsetT>(index, shape, strides, dim);
+    } else {
+      offset = index * OffsetT(strides[0]);
+    }
+  }
+
+  __device__ OffsetT location() {
+    return offset;
+  }
+};
+
+template <typename OffsetT>
+struct LoopedElemToLoc<1, false, OffsetT> { // Recursion end for the General = false case: only strides[0] is used.
+  OffsetT offset{0};
+
+  __device__ LoopedElemToLoc(int) {} // The dimension count is ignored.
+
+  __device__ void next(const int*, const int64_t* strides) { // Advances by one element; the shape is ignored.
+    offset += OffsetT(strides[0]);
+  }
+
+  __device__ void next(int n, const int*, const int64_t* strides) { // Advances by n elements; the shape is ignored.
+    offset += n * OffsetT(strides[0]);
+  }
+
+  __device__ OffsetT location() {
+    return offset;
+  }
+};
+
+} // namespace mlx::core::cu
--- a/mlx/backend/cuda/eval.cpp
+++ b/mlx/backend/cuda/eval.cpp
@@ -0,0 +1,66 @@
+// Copyright © 2025 Apple Inc.
+
+#include "mlx/backend/gpu/eval.h"
+#include "mlx/backend/cuda/allocator.h"
+#include "mlx/backend/cuda/device.h"
+#include "mlx/backend/gpu/available.h"
+#include "mlx/primitives.h"
+
+#include <nvtx3/nvtx3.hpp>
+
+namespace mlx::core::gpu {
+
+bool is_available() { // Unconditionally reports the GPU backend as available.
+  return true;
+}
+
+void new_stream(Stream s) {
+  // Force initialization of CUDA, so that the CUDA runtime is destroyed last.
+  cudaFree(nullptr);
+  // Ensure the static stream objects get created.
+  cu::get_command_encoder(s);
+  // The main thread is safe to free buffers.
+  cu::allocator().register_this_thread();
+}
+
+void eval(array& arr) { // Runs arr's primitive on the GPU and keeps the buffers it uses alive until completion.
+  nvtx3::scoped_range r("gpu::eval");
+  auto outputs = arr.outputs();
+  {
+    // If the array is a tracer hold a reference
+    // to its inputs so they don't get donated
+    std::vector<array> inputs;
+    if (arr.is_tracer()) {
+      inputs = arr.inputs();
+    }
+    arr.primitive().eval_gpu(arr.inputs(), outputs);
+  } // The extra input references are released here.
+
+  auto& encoder = cu::get_command_encoder(arr.primitive().stream());
+  // Keep used buffers alive until kernel finishes running.
+  std::unordered_set<std::shared_ptr<array::Data>> buffers; // A set, so each buffer is held only once.
+  for (auto& in : arr.inputs()) {
+    buffers.insert(in.data_shared_ptr());
+  }
+  for (auto& s : arr.siblings()) {
+    buffers.insert(s.data_shared_ptr());
+  }
+  // Remove the output if it was donated to by an input.
+  if (auto it = buffers.find(arr.data_shared_ptr()); it != buffers.end()) {
+    buffers.erase(it);
+  }
+  encoder.add_completed_handler([buffers = std::move(buffers)]() {}); // Empty handler: it only owns the buffers until it is destroyed.
+  encoder.maybe_commit();
+}
+
+void finalize(Stream s) { // Commits the work encoded so far on stream s.
+  nvtx3::scoped_range r("gpu::finalize");
+  cu::get_command_encoder(s).commit();
+}
+
+void synchronize(Stream s) { // Forwards to the command encoder's synchronize() for stream s.
+  nvtx3::scoped_range r("gpu::synchronize");
+  cu::get_command_encoder(s).synchronize();
+}
+
+} // namespace mlx::core::gpu
--- a/mlx/backend/cuda/event.cu
+++ b/mlx/backend/cuda/event.cu
@@ -0,0 +1,269 @@
+// Copyright © 2024 Apple Inc.
+
+#include "mlx/backend/cuda/allocator.h"
+#include "mlx/backend/cuda/device.h"
+#include "mlx/backend/cuda/event.h"
+#include "mlx/backend/cuda/utils.h"
+#include "mlx/event.h"
+#include "mlx/scheduler.h"
+
+#include <nvtx3/nvtx3.hpp>
+
+namespace mlx::core {
+
+namespace cu {
+
+///////////////////////////////////////////////////////////////////////////////
+// CudaEvent implementations
+///////////////////////////////////////////////////////////////////////////////
+
+// Cuda event managed with RAII.
+class CudaEventHandle {
+ public:
+  CudaEventHandle() { // Creates the event with timing disabled and blocking sync enabled.
+    CHECK_CUDA_ERROR(cudaEventCreateWithFlags(
+        &event_, cudaEventDisableTiming | cudaEventBlockingSync));
+  }
+
+  ~CudaEventHandle() {
+    CHECK_CUDA_ERROR(cudaEventDestroy(event_)); // NOTE(review): if CHECK_CUDA_ERROR throws, doing so from a destructor terminates the program - confirm.
+  }
+
+  CudaEventHandle(const CudaEventHandle&) = delete; // Not copyable: exactly one owner destroys the event.
+  CudaEventHandle& operator=(const CudaEventHandle&) = delete;
+
+  operator cudaEvent_t() const { // Lets the handle be passed directly to CUDA runtime calls.
+    return event_;
+  }
+
+ private:
+  cudaEvent_t event_;
+};
+
+CudaEvent::CudaEvent() : event_(std::make_shared<CudaEventHandle>()) {} // Copies of a CudaEvent share one native event.
+
+void CudaEvent::wait() { // Waits on the host for the event; throws if record() has not been called.
+  nvtx3::scoped_range r("cu::CudaEvent::wait");
+  if (!recorded_) {
+    throw std::runtime_error("Should not wait on a CudaEvent before record.");
+  }
+  CHECK_CUDA_ERROR(cudaEventSynchronize(*event_)); // Checked like the other CUDA calls in this file instead of dropping the status.
+}
+
+void CudaEvent::wait(cudaStream_t stream) { // Makes `stream` wait for the event; throws if record() has not been called.
+  if (!recorded_) {
+    throw std::runtime_error("Should not wait on a CudaEvent before record.");
+  }
+  CHECK_CUDA_ERROR(cudaStreamWaitEvent(stream, *event_)); // Checked like the other CUDA calls in this file instead of dropping the status.
+}
+
+void CudaEvent::wait(Stream s) { // Makes stream s wait for this event.
+  if (s.device == mlx::core::Device::cpu) {
+    scheduler::enqueue(s, [*this]() mutable { wait(); }); // CPU stream: a copy of this event waits inside an enqueued task.
+  } else {
+    auto& enc = cu::get_command_encoder(s);
+    enc.commit(); // Commit first, then insert the wait into the encoder's CUDA stream.
+    wait(enc.stream());
+  }
+}
+
+void CudaEvent::record(cudaStream_t stream) { // Records the event on `stream` and marks it as recorded.
+  CHECK_CUDA_ERROR(cudaEventRecord(*event_, stream)); // Checked like the other CUDA calls in this file instead of dropping the status.
+  recorded_ = true;
+}
+
+void CudaEvent::record(Stream s) { // Records the event on a GPU stream; CPU streams are rejected.
+  if (s.device == mlx::core::Device::cpu) {
+    throw std::runtime_error("CudaEvent can not wait on cpu stream."); // NOTE(review): the message says "wait" although this is record() - confirm the wording.
+  } else {
+    auto& enc = cu::get_command_encoder(s);
+    enc.commit(); // Commit first, then record on the encoder's CUDA stream.
+    record(enc.stream());
+  }
+}
+
+bool CudaEvent::completed() const { // True when cudaEventQuery reports cudaSuccess; does not look at recorded_.
+  return cudaEventQuery(*event_) == cudaSuccess;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// SharedEvent implementations
+///////////////////////////////////////////////////////////////////////////////
+
+namespace {
+
+__host__ __device__ void event_wait(SharedEvent::Atomic* ac, uint64_t value) { // Returns once the atomic holds at least `value`.
+  uint64_t current;
+  while ((current = ac->load()) < value) {
+    ac->wait(current); // Then the loop re-reads the atomic and compares again.
+  }
+}
+
+__host__ __device__ void event_signal(SharedEvent::Atomic* ac, uint64_t value) { // Stores the value, then notifies all waiters.
+  ac->store(value);
+  ac->notify_all();
+}
+
+__global__ void event_wait_kernel(SharedEvent::Atomic* ac, uint64_t value) { // Kernel wrapper around event_wait.
+  event_wait(ac, value);
+}
+
+__global__ void event_signal_kernel(SharedEvent::Atomic* ac, uint64_t value) { // Kernel wrapper around event_signal.
+  event_signal(ac, value);
+}
+
+} // namespace
+
+SharedEvent::SharedEvent() {
+  // Allocate cuda::atomic on managed memory.
+  Atomic* ac;
+  CHECK_CUDA_ERROR(cudaMallocManaged(&ac, sizeof(Atomic)));
+  new (ac) Atomic(0); // Placement-new: the counter starts at 0.
+  ac_ = std::shared_ptr<Atomic>(ac, [](Atomic* ptr) { // Custom deleter: destroy the atomic, then free its memory.
+    ptr->~Atomic();
+    allocator().cuda_free(ptr);
+  });
+}
+
+void SharedEvent::wait(uint64_t value) { // Waits on the calling thread until the atomic holds at least `value`.
+  nvtx3::scoped_range r("cu::SharedEvent::wait");
+  event_wait(ac_.get(), value);
+}
+
+void SharedEvent::wait(cudaStream_t stream, uint64_t value) { // Launches a single-thread kernel on `stream` that performs the wait.
+  event_wait_kernel<<<1, 1, 0, stream>>>(ac_.get(), value);
+}
+
+void SharedEvent::wait(Stream s, uint64_t value) { // Makes stream s wait until the atomic holds at least `value`.
+  nvtx3::scoped_range r("cu::SharedEvent::wait(s)");
+  if (s.device == mlx::core::Device::cpu) {
+    scheduler::enqueue(s, [*this, value]() mutable { wait(value); }); // CPU stream: a copy of this event waits inside an enqueued task.
+  } else {
+    auto& encoder = get_command_encoder(s);
+    encoder.commit();
+    wait(encoder.stream(), value);
+    encoder.add_completed_handler([ac = ac_]() {}); // Empty handler: it only holds a reference to the atomic.
+  }
+}
+
+void SharedEvent::signal(uint64_t value) { // Stores `value` from the calling thread and notifies waiters.
+  nvtx3::scoped_range r("cu::SharedEvent::signal");
+  event_signal(ac_.get(), value);
+}
+
+void SharedEvent::signal(cudaStream_t stream, uint64_t value) { // Launches a single-thread kernel on `stream` that performs the signal.
+  event_signal_kernel<<<1, 1, 0, stream>>>(ac_.get(), value);
+}
+
+void SharedEvent::signal(Stream s, uint64_t value) { // Signals `value` in the order of stream s.
+  nvtx3::scoped_range r("cu::SharedEvent::signal(s)");
+  if (s.device == mlx::core::Device::cpu) {
+    // Signal through a GPU stream so the atomic is updated in GPU - updating
+    // the atomic in CPU sometimes does not get GPU notified.
+    static CudaStream stream(device(mlx::core::Device::gpu)); // One function-local stream shared by all CPU-side signals.
+    scheduler::enqueue(s, [*this, value]() mutable { signal(stream, value); });
+  } else {
+    auto& encoder = get_command_encoder(s);
+    encoder.commit();
+    signal(encoder.stream(), value);
+    encoder.add_completed_handler([ac = ac_]() {}); // Empty handler: it only holds a reference to the atomic.
+  }
+}
+
+bool SharedEvent::is_signaled(uint64_t value) const { // True once the atomic holds at least `value`.
+  nvtx3::scoped_range r("cu::SharedEvent::is_signaled");
+  return ac_->load() >= value;
+}
+
+uint64_t SharedEvent::value() const { // Current value stored in the atomic.
+  nvtx3::scoped_range r("cu::SharedEvent::value");
+  return ac_->load();
+}
+
+} // namespace cu
+
+///////////////////////////////////////////////////////////////////////////////
+// Event implementations
+///////////////////////////////////////////////////////////////////////////////
+
+namespace {
+
+struct EventImpl { // Backing state of Event: at most one of `cuda` / `shared` is ever created.
+  // CudaEvent is preferred when possible because it is fast, however we have
+  // to fallback to SharedEvent in following cases:
+  // 1. the event is used to wait/signal a cpu stream;
+  // 2. signal value other than 1 has been specified.
+  std::unique_ptr<cu::CudaEvent> cuda;
+  std::unique_ptr<cu::SharedEvent> shared;
+
+  bool is_created() const { // True once either implementation exists.
+    return cuda || shared;
+  }
+
+  void ensure_created(Stream s, uint64_t signal_value) { // Picks the implementation on first use; later calls do nothing.
+    if (is_created()) {
+      return;
+    }
+    if (s.device == mlx::core::Device::cpu || signal_value > 1) {
+      nvtx3::mark("Using slow SharedEvent");
+      shared = std::make_unique<cu::SharedEvent>();
+    } else {
+      cuda = std::make_unique<cu::CudaEvent>();
+    }
+  }
+};
+
+} // namespace
+
+Event::Event(Stream s) : stream_(s) { // Starts with an empty EventImpl; the concrete event is created in signal().
+  event_ = std::shared_ptr<void>(
+      new EventImpl(), [](void* ptr) { delete static_cast<EventImpl*>(ptr); }); // Type-erased storage with a matching deleter.
+}
+
+void Event::wait() { // Waits on the calling thread, through whichever implementation was created.
+  auto* event = static_cast<EventImpl*>(event_.get());
+  assert(event->is_created()); // The implementation is only created by signal().
+  if (event->cuda) {
+    assert(value() == 1);
+    event->cuda->wait();
+  } else {
+    event->shared->wait(value());
+  }
+}
+
+void Event::wait(Stream s) { // Makes stream s wait, through whichever implementation was created.
+  auto* event = static_cast<EventImpl*>(event_.get());
+  assert(event->is_created()); // The implementation is only created by signal().
+  if (event->cuda) {
+    assert(value() == 1);
+    event->cuda->wait(s);
+  } else {
+    event->shared->wait(s, value());
+  }
+}
+
+void Event::signal(Stream s) { // Signals on stream s, creating the underlying event on first use.
+  auto* event = static_cast<EventImpl*>(event_.get());
+  event->ensure_created(s, value()); // The implementation is chosen from the stream device and the current value.
+  if (event->cuda) {
+    assert(value() == 1);
+    event->cuda->record(s);
+  } else {
+    event->shared->signal(s, value());
+  }
+}
+
+bool Event::is_signaled() const { // Non-blocking query of the event state.
+  auto* event = static_cast<EventImpl*>(event_.get());
+  if (!event->is_created()) {
+    return false; // Nothing has been signaled yet.
+  }
+  if (event->cuda) {
+    assert(value() == 1);
+    return event->cuda->recorded() && event->cuda->completed(); // recorded() is checked too, since completed() alone is true before any record().
+  } else {
+    return event->shared->is_signaled(value());
+  }
+}
+
+} // namespace mlx::core
--- a/mlx/backend/cuda/event.h
+++ b/mlx/backend/cuda/event.h
@@ -0,0 +1,66 @@
+// Copyright © 2025 Apple Inc.
+
+#pragma once
+
+#include "mlx/stream.h"
+
+#include <cuda_runtime.h>
+#include <cuda/atomic>
+
+#include <memory>
+
+namespace mlx::core::cu {
+
+class CudaEventHandle;
+
+// Wrapper of native cuda event. It can synchronize between GPU streams, or wait
+// on GPU stream in CPU stream, but can not wait on CPU stream.
+class CudaEvent {
+ public:
+  CudaEvent();
+
+  // Wait for the event with no target stream (presumably blocks the calling
+  // host thread -- confirm in event.cu).
+  void wait();
+  // Make the given stream wait for the event.
+  void wait(cudaStream_t stream);
+  void wait(Stream s);
+  // Record the event on the given stream.
+  void record(cudaStream_t stream);
+  void record(Stream s);
+
+  // Return whether the recorded kernels have completed. Note that this method
+  // returns true if record() has not been called.
+  bool completed() const;
+
+  // Whether record() has been called; use together with completed() to tell
+  // "finished" apart from "never recorded".
+  bool recorded() const {
+    return recorded_;
+  }
+
+ private:
+  // Starts out false; presumably set by record() -- confirm in event.cu.
+  bool recorded_{false};
+  // Shared handle to the native event, so copies of CudaEvent refer to the
+  // same underlying CudaEventHandle.
+  std::shared_ptr<CudaEventHandle> event_;
+};
+
+// Event that can synchronize between CPU and GPU. It is much slower than
+// CudaEvent so the latter should always be preferred when possible.
+class SharedEvent {
+ public:
+  // Counter type backing the event.
+  using Atomic = cuda::atomic<uint64_t>;
+
+  SharedEvent();
+
+  // Wait for |value| with no target stream (presumably on the calling host
+  // thread -- confirm in event.cu).
+  void wait(uint64_t value);
+  // Make the given stream wait for |value|.
+  void wait(cudaStream_t stream, uint64_t value);
+  void wait(Stream s, uint64_t value);
+  // Signal |value| with no target stream, or from the given stream.
+  void signal(uint64_t value);
+  void signal(cudaStream_t stream, uint64_t value);
+  void signal(Stream s, uint64_t value);
+  // Whether |value| has been signaled. NOTE(review): presumably compares the
+  // counter against |value|; the exact comparison lives in event.cu.
+  bool is_signaled(uint64_t value) const;
+  // Current value of the event's counter.
+  uint64_t value() const;
+
+  // Access to the shared atomic counter itself.
+  const std::shared_ptr<Atomic>& atomic() const {
+    return ac_;
+  }
+
+ private:
+  std::shared_ptr<Atomic> ac_;
+};
+
+} // namespace mlx::core::cu
--- a/mlx/backend/cuda/fence.cpp
+++ b/mlx/backend/cuda/fence.cpp
@@ -0,0 +1,29 @@
+// Copyright © 2025 Apple Inc.
+
+#include "mlx/fence.h"
+#include "mlx/backend/cuda/event.h"
+
+namespace mlx::core {
+
+// State behind a Fence: a counter paired with the SharedEvent that is signaled
+// with it.
+struct FenceImpl {
+  // Value passed to the most recent event.signal(); Fence starts it at 0 and
+  // Fence::update() increments it before each signal.
+  uint32_t count;
+  cu::SharedEvent event;
+};
+
+// Create a fence whose counter starts at 0. The stream argument is not used by
+// this backend.
+Fence::Fence(Stream s) {
+  auto destroy = [](void* impl) { delete static_cast<FenceImpl*>(impl); };
+  fence_ = std::shared_ptr<void>(new FenceImpl{0}, destroy);
+}
+
+// Wait for the fence's event to reach the current counter value. Neither the
+// stream nor the array argument is used: the stream-less SharedEvent::wait
+// overload is called.
+void Fence::wait(Stream s, const array&) {
+  auto& impl = *static_cast<FenceImpl*>(fence_.get());
+  impl.event.wait(impl.count);
+}
+
+// Advance the fence counter and signal the new value from stream |s|. The
+// array argument is not used.
+void Fence::update(Stream s, const array&) {
+  auto& impl = *static_cast<FenceImpl*>(fence_.get());
+  impl.event.signal(s, ++impl.count);
+}
+
+} // namespace mlx::core
--- a/mlx/backend/cuda/indexing.cpp
+++ b/mlx/backend/cuda/indexing.cpp
@@ -0,0 +1,428 @@
+// Copyright © 2025 Apple Inc.
+
+#include "mlx/backend/common/compiled.h"
+#include "mlx/backend/cuda/device.h"
+#include "mlx/backend/cuda/jit_module.h"
+#include "mlx/backend/cuda/kernel_utils.cuh"
+#include "mlx/backend/gpu/copy.h"
+#include "mlx/dtype_utils.h"
+#include "mlx/primitives.h"
+
+#include "cuda_jit_sources.h"
+
+#include <cuda.h>
+#include <fmt/format.h>
+#include <nvrtc.h>
+#include <nvtx3/nvtx3.hpp>
+
+#include <cassert>
+#include <numeric>
+
+namespace mlx::core {
+
+namespace {
+
+// Names of the scatter reduction ops, indexed by Scatter::reduce_type_. Each
+// name is used in the JIT module name and as the suffix of the
+// mlx::core::cu::Scatter{} op type in the kernel template arguments.
+// NOTE(review): the order presumably mirrors Scatter's reduce-type enum --
+// confirm against mlx/primitives.h.
+constexpr const char* g_scatter_ops[] = {"Max", "Min", "Sum", "Prod", "Assign"};
+
+// Pack the index arrays inputs[1 .. nidx] into three kernel arguments, appended
+// in this order: data pointers, shapes, strides. Shapes and strides are
+// flattened so that index array i occupies [i * idx_ndim, (i + 1) * idx_ndim).
+void append_indices_arg(
+    cu::KernelArgs& args,
+    const std::vector<array>& inputs,
+    int nidx,
+    int idx_ndim) {
+  std::vector<const void*> ptrs(nidx);
+  std::vector<int32_t> shapes(nidx * idx_ndim);
+  std::vector<int64_t> strides(nidx * idx_ndim);
+  for (int i = 0; i < nidx; ++i) {
+    // inputs[0] is not an index array, so indices start at inputs[1].
+    const auto& idx = inputs[i + 1];
+    const int offset = i * idx_ndim;
+    ptrs[i] = idx.data<void>();
+    std::copy_n(idx.shape().begin(), idx_ndim, shapes.data() + offset);
+    std::copy_n(idx.strides().begin(), idx_ndim, strides.data() + offset);
+  }
+  args.append(std::move(ptrs));
+  args.append(std::move(shapes));
+  args.append(std::move(strides));
+}
+
+} // namespace
+
+// Launch a JIT-compiled gather kernel: inputs[0] is the source array, every
+// following input is an index array, and the result is written to |out|.
+void Gather::eval_gpu(const std::vector<array>& inputs, array& out) {
+  nvtx3::scoped_range r("Gather::eval_gpu");
+  assert(inputs.size() > 0);
+  const auto& src = inputs[0];
+
+  out.set_data(allocator::malloc(out.nbytes()));
+  if (out.size() == 0) {
+    return;
+  }
+
+  // The dtype and ndim of the first index array are used for all of them
+  // (presumably all index arrays share both -- confirm against the op).
+  int nidx = inputs.size() - 1;
+  Dtype idx_dtype = nidx > 0 ? inputs[1].dtype() : int32;
+  int32_t idx_ndim = nidx > 0 ? inputs[1].ndim() : 0;
+
+  // Select the 64-bit index variant of the kernel when any size does not fit
+  // in int32.
+  bool large = (nidx > 0 && inputs[1].size() > INT32_MAX) ||
+      (src.size() > INT32_MAX) || (out.size() > INT32_MAX);
+
+  // Product of all slice sizes.
+  uint32_t slice_size = std::accumulate(
+      slice_sizes_.begin(), slice_sizes_.end(), 1, std::multiplies<uint32_t>());
+
+  // One module per (output dtype, index dtype, number of index arrays).
+  std::string module_name = fmt::format(
+      "gather_{}_{}_{}",
+      dtype_to_string(out.dtype()),
+      dtype_to_string(idx_dtype),
+      nidx);
+
+  auto& s = stream();
+  cu::JitModule& mod = cu::get_jit_module(s.device, module_name, [&]() {
+    // Instantiate every index ndim in [0, MAX_NDIM] with both index widths so
+    // the module can serve any later call with the same module name.
+    std::vector<std::string> kernel_names;
+    for (int ndim = 0; ndim <= MAX_NDIM; ++ndim) {
+      for (int large = 0; large <= 1; ++large) {
+        kernel_names.push_back(fmt::format(
+            "mlx::core::cu::gather<{}, {}, {}, {}, {}>",
+            dtype_to_cuda_type(out.dtype()),
+            dtype_to_cuda_type(idx_dtype),
+            nidx,
+            ndim,
+            large ? "int64_t" : "int32_t"));
+      }
+    }
+    return std::make_pair(jit_source_gather, std::move(kernel_names));
+  });
+
+  // NOTE(review): the argument order below presumably has to match the
+  // parameter list of the gather kernel in the JIT source -- do not reorder.
+  cu::KernelArgs args;
+  args.append(src);
+  args.append(out);
+  // The element count is passed with the same width as the kernel index type.
+  if (large) {
+    args.append<int64_t>(out.size());
+  } else {
+    args.append<int32_t>(out.size());
+  }
+  args.append_ndim(src.shape());
+  args.append_ndim(src.strides());
+  args.append<int32_t>(src.ndim());
+  args.append_ndim(slice_sizes_);
+  args.append(slice_size);
+  args.append(axes_);
+  append_indices_arg(args, inputs, nidx, idx_ndim);
+
+  // Must be formatted exactly like the names registered in the builder above;
+  // the fourth template argument is the index ndim.
+  std::string kernel_name = fmt::format(
+      "mlx::core::cu::gather<{}, {}, {}, {}, {}>",
+      dtype_to_cuda_type(out.dtype()),
+      dtype_to_cuda_type(idx_dtype),
+      nidx,
+      idx_ndim,
+      large ? "int64_t" : "int32_t");
+
+  auto& encoder = cu::get_command_encoder(s);
+  for (const auto& in : inputs) {
+    encoder.set_input_array(in);
+  }
+  encoder.set_output_array(out);
+
+  // The launch configuration is derived from |out|.
+  auto kernel = mod.get_kernel(kernel_name);
+  auto [num_blocks, block_dims] = get_launch_args(kernel, out, large);
+  encoder.add_kernel_node(kernel, num_blocks, block_dims, args.args());
+}
+
+// Launch a JIT-compiled scatter kernel. inputs[0] is first copied into |out|;
+// the updates (inputs.back()) are then applied to |out| at the positions given
+// by the index arrays, combined with the op selected by reduce_type_.
+void Scatter::eval_gpu(const std::vector<array>& inputs, array& out) {
+  // Label the profiler range with this primitive's own name.
+  nvtx3::scoped_range r("Scatter::eval_gpu");
+  assert(inputs.size() > 1);
+  auto& upd = inputs.back();
+
+  // Copy src into out.
+  CopyType copy_type;
+  if (inputs[0].data_size() == 1) {
+    copy_type = CopyType::Scalar;
+  } else if (inputs[0].flags().row_contiguous) {
+    copy_type = CopyType::Vector;
+  } else {
+    copy_type = CopyType::General;
+  }
+  copy_gpu(inputs[0], out, copy_type);
+
+  // Empty update.
+  if (upd.size() == 0) {
+    return;
+  }
+
+  // The dtype and ndim of the first index array are used for all of them
+  // (presumably all index arrays share both -- confirm against the op).
+  int nidx = axes_.size();
+  Dtype idx_dtype = nidx > 0 ? inputs[1].dtype() : int32;
+  int32_t idx_ndim = nidx > 0 ? inputs[1].ndim() : 0;
+
+  // Select the 64-bit index variant of the kernel when any size does not fit
+  // in int32.
+  bool large = (nidx > 0 && inputs[1].size() > INT32_MAX) ||
+      (upd.size() > INT32_MAX) || (out.size() > INT32_MAX);
+
+  // Product of the update dimensions that follow the leading idx_ndim ones.
+  int32_t upd_post_idx_size = std::accumulate(
+      upd.shape().begin() + idx_ndim,
+      upd.shape().end(),
+      1,
+      std::multiplies<int32_t>());
+
+  // One module per (output dtype, index dtype, reduce op, number of indices).
+  const char* op = g_scatter_ops[reduce_type_];
+  std::string module_name = fmt::format(
+      "scatter_{}_{}_{}_{}",
+      dtype_to_string(out.dtype()),
+      dtype_to_string(idx_dtype),
+      op,
+      nidx);
+
+  auto& s = stream();
+  cu::JitModule& mod = cu::get_jit_module(s.device, module_name, [&]() {
+    // Instantiate every index ndim in [0, MAX_NDIM] with both index widths so
+    // the module can serve any later call with the same module name.
+    std::vector<std::string> kernel_names;
+    for (int ndim = 0; ndim <= MAX_NDIM; ++ndim) {
+      for (int large = 0; large <= 1; ++large) {
+        kernel_names.push_back(fmt::format(
+            "mlx::core::cu::scatter<{}, {}, mlx::core::cu::Scatter{}, {}, {}, {}>",
+            dtype_to_cuda_type(out.dtype()),
+            dtype_to_cuda_type(idx_dtype),
+            op,
+            nidx,
+            ndim,
+            large ? "int64_t" : "int32_t"));
+      }
+    }
+    return std::make_pair(jit_source_scatter, std::move(kernel_names));
+  });
+
+  // NOTE(review): the argument order below presumably has to match the
+  // parameter list of the scatter kernel in the JIT source -- do not reorder.
+  cu::KernelArgs args;
+  args.append(upd);
+  args.append(out);
+  // Sizes are passed with the same width as the kernel index type.
+  if (large) {
+    args.append<int64_t>(upd.size());
+  } else {
+    args.append<int32_t>(upd.size());
+  }
+  args.append_ndim(upd.shape());
+  args.append_ndim(upd.strides());
+  args.append<int32_t>(upd.ndim());
+  if (large) {
+    args.append<int64_t>(upd_post_idx_size);
+  } else {
+    args.append<int32_t>(upd_post_idx_size);
+  }
+  args.append_ndim(out.shape());
+  args.append_ndim(out.strides());
+  args.append<int32_t>(out.ndim());
+  args.append(axes_);
+  append_indices_arg(args, inputs, nidx, idx_ndim);
+
+  // Must be formatted exactly like the names registered in the builder above;
+  // the fifth template argument is the index ndim.
+  std::string kernel_name = fmt::format(
+      "mlx::core::cu::scatter<{}, {}, mlx::core::cu::Scatter{}, {}, {}, {}>",
+      dtype_to_cuda_type(out.dtype()),
+      dtype_to_cuda_type(idx_dtype),
+      op,
+      nidx,
+      idx_ndim,
+      large ? "int64_t" : "int32_t");
+
+  auto& encoder = cu::get_command_encoder(s);
+  for (const auto& in : inputs) {
+    encoder.set_input_array(in);
+  }
+  encoder.set_output_array(out);
+  // The launch configuration is derived from the update array.
+  auto kernel = mod.get_kernel(kernel_name);
+  auto [num_blocks, block_dims] = get_launch_args(kernel, upd, large);
+  encoder.add_kernel_node(kernel, num_blocks, block_dims, args.args());
+}
+
+// Launch a JIT-compiled gather_axis kernel: reads inputs[0] (src) at the
+// positions given by inputs[1] (idx) along axis_ and writes to |out|.
+void GatherAxis::eval_gpu(const std::vector<array>& inputs, array& out) {
+  nvtx3::scoped_range r("GatherAxis::eval_gpu");
+  assert(inputs.size() > 1);
+  const auto& src = inputs[0];
+  const auto& idx = inputs[1];
+
+  out.set_data(allocator::malloc(out.nbytes()));
+  if (out.size() == 0) {
+    return;
+  }
+
+  // Select the 64-bit index variant when a size does not fit in int32.
+  bool large = idx.size() > INT32_MAX || src.size() > INT32_MAX;
+
+  // One module per (output dtype, index dtype).
+  std::string module_name = fmt::format(
+      "gather_axis_{}_{}",
+      dtype_to_string(out.dtype()),
+      dtype_to_string(idx.dtype()));
+
+  auto& s = stream();
+  cu::JitModule& mod = cu::get_jit_module(s.device, module_name, [&]() {
+    // Instantiate every combination of ndim, the two contiguity flags and the
+    // index width. |contiguous| is a 2-bit mask: bit 0 is the first bool
+    // template argument (src), bit 1 the second (idx).
+    std::vector<std::string> kernel_names;
+    for (int ndim = 0; ndim <= MAX_NDIM; ++ndim) {
+      for (int contiguous = 0; contiguous < 4; ++contiguous) {
+        for (int large = 0; large <= 1; ++large) {
+          kernel_names.push_back(fmt::format(
+              "mlx::core::cu::gather_axis<{}, {}, {}, {}, {}, {}>",
+              dtype_to_cuda_type(out.dtype()),
+              dtype_to_cuda_type(idx.dtype()),
+              ndim,
+              contiguous & 1 ? true : false,
+              contiguous & 2 ? true : false,
+              large ? "int64_t" : "int32_t"));
+        }
+      }
+    }
+    return std::make_pair(jit_source_gather_axis, std::move(kernel_names));
+  });
+
+  // Split the index shape around axis_: the product of the dimensions before
+  // it, the extent of the axis itself, and the product of those after it.
+  size_t idx_size_pre = 1;
+  size_t idx_size_post = 1;
+  for (int i = 0; i < axis_; ++i) {
+    idx_size_pre *= idx.shape(i);
+  }
+  for (int i = axis_ + 1; i < idx.ndim(); ++i) {
+    idx_size_post *= idx.shape(i);
+  }
+  size_t idx_size_axis = idx.shape(axis_);
+
+  // NOTE(review): the argument order below presumably has to match the
+  // parameter list of the gather_axis kernel in the JIT source.
+  cu::KernelArgs args;
+  args.append(src);
+  args.append(idx);
+  args.append(out);
+  // Sizes are passed with the same width as the kernel index type.
+  if (large) {
+    args.append<int64_t>(idx_size_pre);
+    args.append<int64_t>(idx_size_axis);
+    args.append<int64_t>(idx_size_post);
+  } else {
+    args.append<int32_t>(idx_size_pre);
+    args.append<int32_t>(idx_size_axis);
+    args.append<int32_t>(idx_size_post);
+  }
+  // Shape and strides with the gathered axis removed, followed by the axis
+  // itself and the values that describe it.
+  args.append(remove_index(idx.shape(), axis_));
+  args.append(remove_index(src.strides(), axis_));
+  args.append(remove_index(idx.strides(), axis_));
+  args.append<int32_t>(axis_);
+  args.append(src.shape(axis_));
+  args.append(src.strides(axis_));
+  args.append(idx.strides(axis_));
+
+  // Must be formatted exactly like the names registered in the builder above.
+  // The ndim argument is one less than src.ndim() because the axis is removed.
+  std::string kernel_name = fmt::format(
+      "mlx::core::cu::gather_axis<{}, {}, {}, {}, {}, {}>",
+      dtype_to_cuda_type(out.dtype()),
+      dtype_to_cuda_type(idx.dtype()),
+      src.ndim() - 1,
+      src.flags().row_contiguous,
+      idx.flags().row_contiguous,
+      large ? "int64_t" : "int32_t");
+
+  auto& encoder = cu::get_command_encoder(s);
+  for (const auto& in : inputs) {
+    encoder.set_input_array(in);
+  }
+  encoder.set_output_array(out);
+  // The launch configuration is derived from the index array.
+  auto kernel = mod.get_kernel(kernel_name);
+  auto [num_blocks, block_dims] = get_launch_args(kernel, idx, large);
+  encoder.add_kernel_node(kernel, num_blocks, block_dims, args.args());
+}
+
+// Launch a JIT-compiled scatter_axis kernel. inputs[0] (src) is first copied
+// into |out|; inputs[2] (upd) is then applied to |out| at the positions given
+// by inputs[1] (idx) along axis_, using either Sum or Assign.
+void ScatterAxis::eval_gpu(const std::vector<array>& inputs, array& out) {
+  nvtx3::scoped_range r("ScatterAxis::eval_gpu");
+  assert(inputs.size() > 2);
+  const auto& src = inputs[0];
+  const auto& idx = inputs[1];
+  const auto& upd = inputs[2];
+
+  // Copy src into out.
+  CopyType copy_type;
+  if (src.data_size() == 1) {
+    copy_type = CopyType::Scalar;
+  } else if (src.flags().row_contiguous) {
+    copy_type = CopyType::Vector;
+  } else {
+    copy_type = CopyType::General;
+  }
+  copy_gpu(src, out, copy_type);
+
+  // Empty update.
+  if (upd.size() == 0) {
+    return;
+  }
+
+  // Select the 64-bit index variant when a size does not fit in int32.
+  bool large = idx.size() > INT32_MAX || src.size() > INT32_MAX;
+
+  // Any reduce type other than Sum is treated as Assign.
+  const char* op = reduce_type_ == ScatterAxis::Sum ? "Sum" : "Assign";
+  // One module per (output dtype, index dtype, reduce op).
+  std::string module_name = fmt::format(
+      "scatter_axis_{}_{}_{}",
+      dtype_to_string(out.dtype()),
+      dtype_to_string(idx.dtype()),
+      op);
+
+  auto& s = stream();
+  cu::JitModule& mod = cu::get_jit_module(s.device, module_name, [&]() {
+    // Instantiate every combination of ndim, the two contiguity flags and the
+    // index width. |contiguous| is a 2-bit mask: bit 0 is the first bool
+    // template argument (upd), bit 1 the second (idx).
+    std::vector<std::string> kernel_names;
+    for (int ndim = 0; ndim <= MAX_NDIM; ++ndim) {
+      for (int contiguous = 0; contiguous < 4; ++contiguous) {
+        for (int large = 0; large <= 1; ++large) {
+          kernel_names.push_back(fmt::format(
+              "mlx::core::cu::scatter_axis<{}, {}, mlx::core::cu::Scatter{}, {}, {}, {}, {}>",
+              dtype_to_cuda_type(out.dtype()),
+              dtype_to_cuda_type(idx.dtype()),
+              op,
+              ndim,
+              contiguous & 1 ? true : false,
+              contiguous & 2 ? true : false,
+              large ? "int64_t" : "int32_t"));
+        }
+      }
+    }
+    return std::make_pair(jit_source_scatter_axis, std::move(kernel_names));
+  });
+
+  // Split the index shape around axis_: the product of the dimensions before
+  // it, the extent of the axis itself, and the product of those after it.
+  size_t idx_size_pre = 1;
+  size_t idx_size_post = 1;
+  for (int i = 0; i < axis_; ++i) {
+    idx_size_pre *= idx.shape(i);
+  }
+  for (int i = axis_ + 1; i < idx.ndim(); ++i) {
+    idx_size_post *= idx.shape(i);
+  }
+  size_t idx_size_axis = idx.shape(axis_);
+
+  // NOTE(review): the argument order below presumably has to match the
+  // parameter list of the scatter_axis kernel in the JIT source.
+  cu::KernelArgs args;
+  args.append(upd);
+  args.append(idx);
+  args.append(out);
+  // Sizes are passed with the same width as the kernel index type.
+  if (large) {
+    args.append<int64_t>(idx_size_pre);
+    args.append<int64_t>(idx_size_axis);
+    args.append<int64_t>(idx_size_post);
+  } else {
+    args.append<int32_t>(idx_size_pre);
+    args.append<int32_t>(idx_size_axis);
+    args.append<int32_t>(idx_size_post);
+  }
+  // Shape and strides with the scattered axis removed, followed by the axis
+  // itself and the values that describe it.
+  args.append(remove_index(idx.shape(), axis_));
+  args.append(remove_index(upd.strides(), axis_));
+  args.append(remove_index(idx.strides(), axis_));
+  args.append<int32_t>(axis_);
+  args.append(out.shape(axis_));
+  args.append(upd.strides(axis_));
+  args.append(idx.strides(axis_));
+
+  // Must be formatted exactly like the names registered in the builder above.
+  // The ndim argument is one less than idx.ndim() because the axis is removed.
+  std::string kernel_name = fmt::format(
+      "mlx::core::cu::scatter_axis<{}, {}, mlx::core::cu::Scatter{}, {}, {}, {}, {}>",
+      dtype_to_cuda_type(out.dtype()),
+      dtype_to_cuda_type(idx.dtype()),
+      op,
+      idx.ndim() - 1,
+      upd.flags().row_contiguous,
+      idx.flags().row_contiguous,
+      large ? "int64_t" : "int32_t");
+
+  auto& encoder = cu::get_command_encoder(s);
+  for (const auto& in : inputs) {
+    encoder.set_input_array(in);
+  }
+  encoder.set_output_array(out);
+  // The launch configuration is derived from the index array.
+  auto kernel = mod.get_kernel(kernel_name);
+  auto [num_blocks, block_dims] = get_launch_args(kernel, idx, large);
+  encoder.add_kernel_node(kernel, num_blocks, block_dims, args.args());
+}
+
+} // namespace mlx::core
--- a/mlx/backend/cuda/iterators/general_iterator.cuh
+++ b/mlx/backend/cuda/iterators/general_iterator.cuh
@@ -0,0 +1,121 @@
+// Copyright © 2025 Apple Inc.
+
+#pragma once
+
+#include <thrust/iterator/iterator_adaptor.h>
+#include <cuda/std/utility>
+
+#include "mlx/backend/cuda/kernel_utils.cuh"
+
+namespace mlx::core::cu {
+
+// Iterating non-contiguous array.
+//
+// The iterator keeps a logical element index; the wrapped base iterator is
+// never moved. Moving the iterator only changes the index, and the memory
+// offset is computed from the index, shape and strides at dereference time.
+template <typename Iterator, typename IdxT = int64_t>
+class general_iterator
+    : public thrust::
+          iterator_adaptor<general_iterator<Iterator, IdxT>, Iterator> {
+ public:
+  using super_t =
+      thrust::iterator_adaptor<general_iterator<Iterator, IdxT>, Iterator>;
+
+  using reference = typename super_t::reference;
+  using difference_type = typename super_t::difference_type;
+
+  // |it| is the base iterator, |index| the starting logical element index and
+  // |ndim| the number of dimensions passed to elem_to_loc together with
+  // |shape| and |strides|.
+  __host__ __device__ general_iterator(
+      Iterator it,
+      IdxT index,
+      int ndim,
+      Shape shape,
+      Strides strides)
+      : super_t(it),
+        index_(index),
+        ndim_(ndim),
+        shape_(cuda::std::move(shape)),
+        strides_(cuda::std::move(strides)) {}
+
+  // Current logical element index.
+  __host__ __device__ IdxT index() const {
+    return index_;
+  }
+
+  __host__ __device__ const Shape& shape() const {
+    return shape_;
+  }
+
+  __host__ __device__ const Strides& strides() const {
+    return strides_;
+  }
+
+ private:
+  // Gives thrust access to the private hooks below.
+  friend class thrust::iterator_core_access;
+
+  // Two iterators are equal when they wrap the same base iterator and are at
+  // the same index; shape and strides are not compared.
+  __host__ __device__ bool equal(const general_iterator& other) const {
+    return this->base() == other.base() && this->index() == other.index();
+  }
+
+  __host__ __device__ void advance(difference_type n) {
+    this->index_ += n;
+  }
+
+  __host__ __device__ void increment() {
+    this->index_ += 1;
+  }
+
+  __host__ __device__ void decrement() {
+    this->index_ -= 1;
+  }
+
+  // Distance in logical elements; only meaningful between iterators that share
+  // the same base iterator.
+  __host__ __device__ difference_type
+  distance_to(const general_iterator& other) const {
+    _CCCL_ASSERT(
+        this->base() == other.base(),
+        "Underlying iterator must point to same base iterator");
+    return other.index() - this->index();
+  }
+
+  // The dereference is device-only to avoid accidental running in host.
+  __device__ typename super_t::reference dereference() const {
+    // Translate the logical index into an offset from the base iterator.
+    IdxT offset = elem_to_loc(index_, shape_.data(), strides_.data(), ndim_);
+    return *(this->base() + offset);
+  }
+
+  IdxT index_;
+  int ndim_;
+  Shape shape_;
+  Strides strides_;
+};
+
+// Construct a general_iterator<Iterator, IdxT> from its components, forwarding
+// every argument to the constructor.
+template <typename IdxT, typename Iterator>
+__host__ __device__ auto make_general_iterator(
+    Iterator it,
+    IdxT index,
+    int ndim,
+    Shape shape,
+    Strides strides) {
+  using iterator_t = general_iterator<Iterator, IdxT>;
+  return iterator_t(
+      it, index, ndim, cuda::std::move(shape), cuda::std::move(strides));
+}
+
+// Host-side convenience overload: build an iterator at index 0 from
+// std::vector shape/strides, converting them with const_param().
+template <typename IdxT, typename Iterator>
+auto make_general_iterator(
+    Iterator it,
+    const std::vector<int32_t>& shape,
+    const std::vector<int64_t>& strides) {
+  const auto ndim = shape.size();
+  return make_general_iterator<IdxT>(
+      it, 0, ndim, const_param(shape), const_param(strides));
+}
+
+// Build a (begin, end) pair of general iterators over the same base iterator:
+// the first is positioned at index 0 and the second at index |size|.
+template <typename IdxT, typename Iterator>
+auto make_general_iterators(
+    Iterator it,
+    IdxT size,
+    const std::vector<int32_t>& shape,
+    const std::vector<int64_t>& strides) {
+  const auto ndim = shape.size();
+  // Convert once and share the converted shape/strides between both ends.
+  const auto shape_param = const_param(shape);
+  const auto strides_param = const_param(strides);
+  auto first =
+      make_general_iterator<IdxT>(it, 0, ndim, shape_param, strides_param);
+  auto last =
+      make_general_iterator<IdxT>(it, size, ndim, shape_param, strides_param);
+  return std::make_pair(first, last);
+}
+
+} // namespace mlx::core::cu
--- a/mlx/backend/cuda/iterators/strided_iterator.cuh
+++ b/mlx/backend/cuda/iterators/strided_iterator.cuh
@@ -0,0 +1,60 @@
+// Copyright © 2025 Apple Inc.
+
+#pragma once
+
+#include <thrust/iterator/iterator_adaptor.h>
+#include <thrust/iterator/iterator_facade.h>
+
+namespace mlx::core::cu {
+
+// RandomAccessIterator for strided access to array entries.
+//
+// Each step of this iterator moves the wrapped base iterator by |stride|
+// positions.
+template <typename Iterator, typename Stride = int64_t>
+class strided_iterator
+    : public thrust::
+          iterator_adaptor<strided_iterator<Iterator, Stride>, Iterator> {
+ public:
+  using super_t =
+      thrust::iterator_adaptor<strided_iterator<Iterator, Stride>, Iterator>;
+
+  using reference = typename super_t::reference;
+  using difference_type = typename super_t::difference_type;
+
+  // |it| is the starting position and |stride| the number of base-iterator
+  // positions covered by one step.
+  __host__ __device__ strided_iterator(Iterator it, Stride stride)
+      : super_t(it), stride_(stride) {}
+
+  __host__ __device__ Stride stride() const {
+    return stride_;
+  }
+
+ private:
+  // Gives thrust access to the private hooks below.
+  friend class thrust::iterator_core_access;
+
+  // Equality compares only the base iterators; the strides are not compared.
+  __host__ __device__ bool equal(const strided_iterator& other) const {
+    return this->base() == other.base();
+  }
+
+  __host__ __device__ void advance(difference_type n) {
+    this->base_reference() += n * stride_;
+  }
+
+  __host__ __device__ void increment() {
+    this->base_reference() += stride_;
+  }
+
+  __host__ __device__ void decrement() {
+    this->base_reference() -= stride_;
+  }
+
+  // Number of strided steps from this iterator to |other|. The base-iterator
+  // distance must be a whole multiple of the stride.
+  __host__ __device__ difference_type
+  distance_to(const strided_iterator& other) const {
+    const difference_type dist = other.base() - this->base();
+    _CCCL_ASSERT(
+        dist % stride() == 0,
+        "Underlying iterator difference must be divisible by the stride");
+    return dist / stride();
+  }
+
+  Stride stride_;
+};
+
+} // namespace mlx::core::cu
--- a/mlx/backend/cuda/jit_module.cpp
+++ b/mlx/backend/cuda/jit_module.cpp
@@ -0,0 +1,307 @@
+// Copyright © 2025 Apple Inc.
+
+#include "mlx/backend/cuda/jit_module.h"
+#include "mlx/backend/cuda/device.h"
+#include "mlx/version.h"
+
+#include "cuda_jit_sources.h"
+
+#include <cstdlib>
+#include <filesystem>
+#include <fstream>
+#include <unordered_map>
+
+#include <fmt/format.h>
+#include <nvrtc.h>
+
+namespace mlx::core::cu {
+
+namespace {
+
+// Evaluate an NVRTC call and throw std::runtime_error unless it returns
+// NVRTC_SUCCESS. The stringified call is included in the error message.
+#define CHECK_NVRTC_ERROR(cmd) check_nvrtc_error(#cmd, (cmd))
+
+void check_nvrtc_error(const char* name, nvrtcResult err) {
+  if (err == NVRTC_SUCCESS) {
+    return;
+  }
+  throw std::runtime_error(
+      fmt::format("{} failed: {}", name, nvrtcGetErrorString(err)));
+}
+
+// Return the location of the CUDA toolkit. CUDA_HOME takes precedence over
+// CUDA_PATH; on Linux, /usr/local/cuda is used as a last resort when it
+// exists. The result is computed on first use and then reused. Throws
+// std::runtime_error when no location can be determined.
+const std::string& cuda_home() {
+  static std::string home = []() -> std::string {
+    constexpr const char* env_vars[] = {"CUDA_HOME", "CUDA_PATH"};
+    for (const char* var : env_vars) {
+      if (const char* value = std::getenv(var)) {
+        return value;
+      }
+    }
+#if defined(__linux__)
+    const char* fallback = "/usr/local/cuda";
+    if (std::filesystem::exists(fallback)) {
+      return fallback;
+    }
+#endif
+    throw std::runtime_error(
+        "Environment variable CUDA_HOME or CUDA_PATH is not set.");
+  }();
+  return home;
+}
+
+// Get the cache directory for storing compiled results.
+//
+// MLX_PTX_CACHE_DIR is used when set; otherwise the directory is
+// <temp dir>/mlx/<version>/ptx. The directory is created if needed. An empty
+// path is returned when it cannot be created, which disables caching. The
+// result is computed on first use and then reused.
+const std::filesystem::path& ptx_cache_dir() {
+  static std::filesystem::path cache = []() -> std::filesystem::path {
+    std::filesystem::path cache;
+    if (auto c = std::getenv("MLX_PTX_CACHE_DIR"); c) {
+      cache = c;
+    } else {
+      cache =
+          std::filesystem::temp_directory_path() / "mlx" / version() / "ptx";
+    }
+    // create_directories() returns false without reporting an error when the
+    // directory already exists, including when another process created it
+    // concurrently. Test the error code rather than the return value so that
+    // losing such a race does not disable the cache.
+    std::error_code error;
+    std::filesystem::create_directories(cache, error);
+    if (error) {
+      return std::filesystem::path();
+    }
+    return cache;
+  }();
+  return cache;
+}
+
+// Try to read the cached |ptx| and |ptx_kernels| from |cache_dir|.
+//
+// Expects "<module_name>.ptx" (the compiled binary) and "<module_name>.txt"
+// (one "name<TAB>mangled name" pair per line). Returns true only when the
+// binary was read completely and at least one kernel entry was found. On
+// failure |ptx| is left empty and |ptx_kernels| is left as it was, so the
+// caller can fall back to compiling.
+bool read_cached_ptx(
+    const std::filesystem::path& cache_dir,
+    const std::string& module_name,
+    std::vector<char>* ptx,
+    std::vector<std::pair<std::string, std::string>>* ptx_kernels) {
+  if (cache_dir.empty()) {
+    return false;
+  }
+
+  auto ptx_path = cache_dir / (module_name + ".ptx");
+  std::error_code error;
+  auto ptx_size = std::filesystem::file_size(ptx_path, error);
+  // An empty file cannot be loaded as a module; treat it as a cache miss.
+  if (error || ptx_size == 0) {
+    return false;
+  }
+  std::ifstream ptx_file(ptx_path, std::ios::binary);
+  if (!ptx_file.good()) {
+    return false;
+  }
+  // Without the kernel name table the cached binary is unusable.
+  std::ifstream txt_file(cache_dir / (module_name + ".txt"), std::ios::binary);
+  if (!txt_file.good()) {
+    return false;
+  }
+
+  ptx->resize(ptx_size);
+  if (!ptx_file.read(ptx->data(), ptx_size)) {
+    // Short read, e.g. the file was truncated or is still being written.
+    ptx->clear();
+    return false;
+  }
+
+  const auto num_kernels_before = ptx_kernels->size();
+  std::string line;
+  while (std::getline(txt_file, line)) {
+    auto tab = line.find('\t');
+    if (tab != std::string::npos) {
+      ptx_kernels->emplace_back(line.substr(0, tab), line.substr(tab + 1));
+    }
+  }
+  // A cache entry that lists no kernels would produce a module from which no
+  // kernel can be looked up; recompile instead.
+  if (ptx_kernels->size() == num_kernels_before) {
+    ptx->clear();
+    return false;
+  }
+  return true;
+}
+
+// Write the |ptx| and |ptx_kernels| to |cache_dir| with |module_name|.
+//
+// The binary goes to "<module_name>.ptx" and the kernel table to
+// "<module_name>.txt" as one "name<TAB>mangled name" pair per line. Does
+// nothing when |cache_dir| is empty; stream errors are not reported.
+void write_cached_ptx(
+    const std::filesystem::path& cache_dir,
+    const std::string& module_name,
+    const std::vector<char>& ptx,
+    const std::vector<std::pair<std::string, std::string>>& ptx_kernels) {
+  if (cache_dir.empty()) {
+    return;
+  }
+
+  std::ofstream ptx_out(cache_dir / (module_name + ".ptx"), std::ios::binary);
+  if (!ptx.empty()) {
+    ptx_out.write(ptx.data(), ptx.size());
+  }
+
+  std::ofstream txt_out(cache_dir / (module_name + ".txt"), std::ios::binary);
+  for (const auto& kernel : ptx_kernels) {
+    txt_out << kernel.first << "\t" << kernel.second << std::endl;
+  }
+}
+
+// Return if |device|'s version is not newer than |major|.|minor| version.
+// The major versions are compared first; the minor versions only matter when
+// the major versions are equal.
+inline bool version_lower_equal(Device& device, int major, int minor) {
+  const auto device_major = device.compute_capability_major();
+  if (device_major != major) {
+    return device_major < major;
+  }
+  return device.compute_capability_minor() <= minor;
+}
+
+// Return whether NVRTC supports compiling to |device|'s SASS code.
+// Each NVRTC release is mapped to the newest compute capability it is assumed
+// to handle; the device must not be newer than that.
+bool compiler_supports_device_sass(Device& device) {
+  int nvrtc_major, nvrtc_minor;
+  CHECK_NVRTC_ERROR(nvrtcVersion(&nvrtc_major, &nvrtc_minor));
+  switch (nvrtc_major) {
+    case 9:
+      return version_lower_equal(device, 7, 2);
+    case 10:
+      return version_lower_equal(device, 7, 5);
+    case 11:
+      if (nvrtc_minor == 0) {
+        return version_lower_equal(device, 8, 0);
+      }
+      if (nvrtc_minor < 8) {
+        return version_lower_equal(device, 8, 6);
+      }
+      return true;
+    default:
+      // Releases older than 9 are never accepted; 12 and newer always are.
+      return nvrtc_major > 11;
+  }
+}
+
+// Headers made available to JIT-compiled sources. g_include_names holds the
+// names under which the sources may #include them, and g_headers holds the
+// matching embedded header contents. Both arrays are passed together to
+// nvrtcCreateProgram, so they must have the same length and the same order:
+// entry i of one array describes entry i of the other.
+#define INCLUDE_PREFIX "mlx/backend/cuda/device/"
+
+constexpr const char* g_include_names[] = {
+    INCLUDE_PREFIX "atomic_ops.cuh",
+    INCLUDE_PREFIX "binary_ops.cuh",
+    INCLUDE_PREFIX "cast_op.cuh",
+    INCLUDE_PREFIX "cexpf.cuh",
+    INCLUDE_PREFIX "config.h",
+    INCLUDE_PREFIX "cucomplex_math.cuh",
+    INCLUDE_PREFIX "fp16_math.cuh",
+    INCLUDE_PREFIX "indexing.cuh",
+    INCLUDE_PREFIX "scatter_ops.cuh",
+    INCLUDE_PREFIX "unary_ops.cuh",
+    INCLUDE_PREFIX "ternary_ops.cuh",
+    INCLUDE_PREFIX "utils.cuh",
+};
+
+#undef INCLUDE_PREFIX
+
+// Embedded contents of the headers above, in the same order.
+constexpr const char* g_headers[] = {
+    jit_source_atomic_ops,
+    jit_source_binary_ops,
+    jit_source_cast_op,
+    jit_source_cexpf,
+    jit_source_config,
+    jit_source_cucomplex_math,
+    jit_source_fp16_math,
+    jit_source_indexing,
+    jit_source_scatter_ops,
+    jit_source_unary_ops,
+    jit_source_ternary_ops,
+    jit_source_utils,
+};
+
+} // namespace
+
+// Build the module called |module_name| for |device|. The compiled binary is
+// taken from the on-disk cache when present; otherwise |builder| supplies the
+// source and the kernel names, the source is compiled with NVRTC and the
+// result is written back to the cache. The binary is then loaded and every
+// kernel is registered under its unmangled name.
+// Throws std::runtime_error if compiling or loading fails.
+JitModule::JitModule(
+    Device& device,
+    const std::string& module_name,
+    const KernelBuilder& builder) {
+  // Check cache.
+  // NOTE(review): the cache entry is looked up by module name only; confirm
+  // that a binary cached for one GPU architecture cannot be picked up on
+  // another.
+  std::vector<char> ptx;
+  std::vector<std::pair<std::string, std::string>> ptx_kernels;
+  if (!read_cached_ptx(ptx_cache_dir(), module_name, &ptx, &ptx_kernels)) {
+    // Create program.
+    auto [source_code, kernel_names] = builder();
+    nvrtcProgram prog;
+    CHECK_NVRTC_ERROR(nvrtcCreateProgram(
+        &prog,
+        source_code.c_str(),
+        (module_name + ".cu").c_str(),
+        std::size(g_headers),
+        g_headers,
+        g_include_names));
+    // Destroy the program when leaving this scope, including on exceptions.
+    // NOTE(review): the deleter can throw via CHECK_NVRTC_ERROR; an exception
+    // escaping a unique_ptr deleter terminates the program.
+    std::unique_ptr<nvrtcProgram, void (*)(nvrtcProgram*)> prog_freer(
+        &prog,
+        [](nvrtcProgram* p) { CHECK_NVRTC_ERROR(nvrtcDestroyProgram(p)); });
+    // Register the kernel instantiations before compiling so that their
+    // lowered (mangled) names can be queried afterwards.
+    for (const auto& name : kernel_names) {
+      CHECK_NVRTC_ERROR(nvrtcAddNameExpression(prog, name.c_str()));
+    }
+
+    // Compile program.
+    // Target the real architecture (sm_XY) when NVRTC supports this device,
+    // otherwise the virtual one (compute_XY).
+    bool use_sass = compiler_supports_device_sass(device);
+    std::string compute = fmt::format(
+        "--gpu-architecture={}_{}{}",
+        use_sass ? "sm" : "compute",
+        device.compute_capability_major(),
+        device.compute_capability_minor());
+    std::string include = fmt::format("--include-path={}/include", cuda_home());
+    const char* args[] = {compute.c_str(), include.c_str()};
+    nvrtcResult compile_result =
+        nvrtcCompileProgram(prog, std::size(args), args);
+    if (compile_result != NVRTC_SUCCESS) {
+      // Include the compiler log in the exception message.
+      size_t log_size;
+      CHECK_NVRTC_ERROR(nvrtcGetProgramLogSize(prog, &log_size));
+      std::vector<char> log(log_size + 1, 0);
+      CHECK_NVRTC_ERROR(nvrtcGetProgramLog(prog, log.data()));
+      throw std::runtime_error(
+          fmt::format("Failed to compile kernel: {}.", log.data()));
+    }
+
+    // Get mangled names of kernel names.
+    for (const auto& name : kernel_names) {
+      const char* mangled;
+      CHECK_NVRTC_ERROR(nvrtcGetLoweredName(prog, name.c_str(), &mangled));
+      ptx_kernels.emplace_back(name, mangled);
+    }
+
+    // Get ptx data.
+    // Despite the variable names, |ptx| holds a CUBIN when use_sass is true.
+    size_t ptx_size;
+    if (use_sass) {
+      CHECK_NVRTC_ERROR(nvrtcGetCUBINSize(prog, &ptx_size));
+    } else {
+      CHECK_NVRTC_ERROR(nvrtcGetPTXSize(prog, &ptx_size));
+    }
+    ptx.resize(ptx_size, 0);
+    if (use_sass) {
+      CHECK_NVRTC_ERROR(nvrtcGetCUBIN(prog, ptx.data()));
+    } else {
+      CHECK_NVRTC_ERROR(nvrtcGetPTX(prog, ptx.data()));
+    }
+    write_cached_ptx(ptx_cache_dir(), module_name, ptx, ptx_kernels);
+  }
+
+  // Load module.
+  // The driver writes its error log into jit_log; one byte is held back from
+  // the reported size so the zero-initialized buffer stays NUL-terminated.
+  char jit_log[4089] = {};
+  CUjit_option options[] = {
+      CU_JIT_ERROR_LOG_BUFFER, CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES};
+  void* values[] = {jit_log, reinterpret_cast<void*>(std::size(jit_log) - 1)};
+  CUresult jit_result = cuModuleLoadDataEx(
+      &module_, ptx.data(), std::size(options), options, values);
+  if (jit_result != CUDA_SUCCESS) {
+    throw std::runtime_error(fmt::format(
+        "Failed to load compiled {} kernel: {}.", module_name, jit_log));
+  }
+
+  // Load kernels.
+  // Functions are looked up by mangled name and stored by unmangled name.
+  for (const auto& [name, mangled] : ptx_kernels) {
+    CUfunction kernel;
+    CHECK_CUDA_ERROR(cuModuleGetFunction(&kernel, module_, mangled.c_str()));
+    kernels_[name] = kernel;
+  }
+}
+
+// Unload the module that was loaded by the constructor.
+// NOTE(review): CHECK_CUDA_ERROR presumably throws on failure; confirm that
+// throwing from this destructor is acceptable.
+JitModule::~JitModule() {
+  CHECK_CUDA_ERROR(cuModuleUnload(module_));
+}
+
+// Look up a kernel by its unmangled name. Throws std::runtime_error when the
+// module contains no kernel with that name.
+CUfunction JitModule::get_kernel(const std::string& kernel_name) {
+  if (auto found = kernels_.find(kernel_name); found != kernels_.end()) {
+    return found->second;
+  }
+  throw std::runtime_error(
+      fmt::format("There is no kernel named {}.", kernel_name));
+}
+
+// Return the module registered under |name|, building it on first use.
+// |builder| and |device| are only consulted when the module does not exist
+// yet. NOTE(review): modules are keyed by name only, not by device, and the
+// map is not synchronized -- confirm both are intended.
+JitModule& get_jit_module(
+    const mlx::core::Device& device,
+    const std::string& name,
+    const KernelBuilder& builder) {
+  static std::unordered_map<std::string, JitModule> modules;
+  if (auto found = modules.find(name); found != modules.end()) {
+    return found->second;
+  }
+  return modules.try_emplace(name, cu::device(device), name, builder)
+      .first->second;
+}
+
+} // namespace mlx::core::cu
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Angelos Katharopoulos	a9c720e8cd	Improve the ring backend initialization	2025-07-11 15:31:28 -07:00
Cheng	8347575ba1	[CUDA] Implement Scan kernel (#2347 ) * Contiguous scan * Strided scan * Enable tests * Fix failing logaddexp test * Use cexpf in Metal	2025-07-10 16:54:12 -07:00
Angelos Katharopoulos	b6eec20260	Fix edge check in qmm_n QuantizedLoader (#2355 )	2025-07-10 16:28:50 -07:00
Angelos Katharopoulos	0eb035b4b1	Fix type promotion in Adam with bias correction (#2350 )	2025-07-10 11:14:42 -07:00
Cheng	afb9817599	[CUDA] Put version in ptx cache dir path (#2352 )	2025-07-10 07:24:21 -07:00
Cheng	8fb3e7a26c	[CUDA] Set current device before cudaGraphLaunch (#2351 )	2025-07-10 07:24:02 -07:00
jhavukainen	8c7bc30ce4	Align mlx::core::min op nan propagation with NumPy (#2346 )	2025-07-10 06:20:43 -07:00
Cheng	85873cb162	[CUDA] Do vectorized store/load in contiguous elementwise ops (#2342 ) * Do vectorized store/load in unary ops * Do vectorized store/load in binary_two ops * Do vectorized store/load in copy ops * Do vectorized store/load in ternary ops * Use int32_t for IdxT * binary => binary_two in binary_two.cu * Fix tests on large arrays * Use uint as index type * Contig uses uint as index and non-contig uses int	2025-07-09 18:48:43 -07:00
Awni Hannun	e14ee12491	add zero for argsort vjp (#2345 )	2025-07-09 14:37:14 -07:00
jhavukainen	8b9a3f3cea	Align mlx::core::max op nan propagation with NumPy (#2339 ) * Make max op NaN propagation rules align with numpy * Adding benchmarks and testing for max op nanpropagation * Pre-commit formatting * Fix max complex64 nan propagation and add test * Improve the cpp unittest * Only check nans on non-integral types in simd_reduce_impl. * Cleanup using namespace alias * Add cpu Max nanpropagation. Fix a small fib in cpu max dispatch data types for int8/int16. * Make the max nanpropagation test more meaningful for integer types * Remove tuple unpacking syntax to comply with earlier python versions. Add cuda skip to nanpropagation tests, fix cuda implementation in a separate PR.	2025-07-09 11:26:27 -07:00
Awni Hannun	fb4e8b896b	patch bump (#2343 )	2025-07-08 14:26:07 -07:00
Cheng	2ca533b279	Fix compilation with CUDA 11 (#2331 )	2025-07-07 20:00:43 -07:00
Angelos Katharopoulos	4a9b29a875	MoE backward improvements (#2335 )	2025-07-07 17:59:53 -07:00
Awni Hannun	a4fcc893cd	auto build linux release (#2341 )	2025-07-07 09:29:23 -07:00
Cheng	9d10239af7	[CUDA] Do vectorized store/load in binary ops (#2330 )	2025-07-07 08:44:14 -07:00
Cheng	19facd4b20	Build with all cpu cores by default (#2336 )	2025-07-07 06:06:45 -07:00
Angelos Katharopoulos	f5299f72cd	Fix layernorm race condition (#2340 )	2025-07-07 06:06:01 -07:00
Cheng	0e0d9ac522	[CUDA] Add MLX_CUDA_GRAPH_CACHE_SIZE env for setting graph cache size (#2329 )	2025-07-05 08:33:29 -07:00
Awni Hannun	8917022deb	fix graphs for older cuda (#2328 )	2025-07-02 19:37:58 -07:00
Awni Hannun	ec0d5db67b	[CUDA] Switch to CUDA graphs (#2317 ) * cuda graph prototype fix signal bug + start to add dependencies capture more capture more ops remaining ops fix reduce and rope deps add concurrent context try update, but not working cosistent topology order use node api use node api directly to reduce overhead fix bug use kernels in unary cache graph format fix synchronization format * comment	2025-07-02 15:59:13 -07:00
Cheng	e76e9b87f0	Fix compilation error from integral_constant (#2326 )	2025-07-02 06:04:38 -07:00
Awni Hannun	cfb6a244ea	allow parameters to be deleted (#2325 )	2025-07-01 21:27:23 -07:00
Awni Hannun	58f3860306	patch bump (#2324 )	2025-07-01 12:12:16 -07:00
Awni Hannun	dd4f53db63	use fp32 for testing, add more complex ops (#2322 )	2025-07-01 07:30:00 -07:00
Angelos Katharopoulos	3d5e17e507	MLX_SWITCH macros to templates (#2320 )	2025-07-01 01:33:44 -07:00
Awni Hannun	33bf1a244b	Fix module update in strict mode (#2321 ) * fix module update in strict mode * allow GELU to be pickled	2025-06-29 11:12:29 -07:00
Angelos Katharopoulos	772f471ff2	[CUDA] Fix reductions (#2314 )	2025-06-27 12:59:20 -07:00
Angelos Katharopoulos	2c11d10f8d	Split broadcast so it is always fused in compile (#2318 )	2025-06-26 22:08:18 -07:00
Angelos Katharopoulos	656ed7f780	Fix get 2d grid dims (#2316 )	2025-06-25 13:03:09 -07:00
Awni Hannun	81bb9a2a9e	Compile float64 functions on CPU (#2311 )	2025-06-24 10:18:52 -07:00
Angelos Katharopoulos	5adf185f86	Fix `update_modules()` when providing a subset (#2308 )	2025-06-20 17:19:46 -07:00
Awni Hannun	c9a9180584	Cuda perf tuning (#2307 ) * perf tuning * fix adding inputs arrays in matmul / srot * format * fix	2025-06-20 14:50:57 -07:00
Awni Hannun	76831ed83d	Build CUDA release in Circle (#2306 ) * cuda release * add license	2025-06-19 15:26:36 -07:00
Angelos Katharopoulos	b3d7b85376	Make ptx cache settable by environment variable (#2304 )	2025-06-17 23:55:56 -07:00
Awni Hannun	cad5c0241c	[CUDA] synch properly waits for all tasks to finish and clear (#2303 ) * cuda synch properly waits for all tasks to finish and clear * fix copy	2025-06-17 12:03:25 -07:00
Awni Hannun	b8022c578a	divmod, partition, sort fixes (#2302 )	2025-06-16 18:49:32 -07:00
Awni Hannun	bc53f8293f	Cuda bug fixes 2 (#2298 ) * more bug fixes * more bug fixes * format	2025-06-16 13:14:46 -07:00
Awni Hannun	c552ff2451	[CUDA] Fix back-end bugs and enable corresponding tests (#2296 ) * Fix some cuda back-end bugs and enable corresponding tests * more fixes * enable more tests * format	2025-06-16 08:45:40 -07:00
Awni Hannun	4fda5fbdf9	add python testing for cuda with ability to skip list of tests (#2295 )	2025-06-15 10:56:48 -07:00
Angelos Katharopoulos	580776559b	RoPE for CUDA (#2293 ) * First working CUDA rope * Fix random	2025-06-15 06:08:07 -07:00
Awni Hannun	a14aaa7c9d	Fix cuda arg reduce (#2291 )	2025-06-14 17:54:00 -07:00
Awni Hannun	a6d780154f	fix cuda gemm for bf16 (#2288 )	2025-06-13 22:10:46 -07:00
Awni Hannun	6871e2eeb7	fix cuda jit (#2287 )	2025-06-13 19:21:46 -07:00
Awni Hannun	8402a2acf4	Fix complex power and print (#2286 ) * fix complex power and print * fix complex matmul shape	2025-06-13 11:13:00 -07:00
Jagrit Digani	fddb6933e1	Collection of refactors (#2274 ) * Refactor gemv into a function * Refactor splitk step 1 * Refactor split k axpby * Rearrange steel_gemm_regular * Redirect steel_gemm_regular * Add axpby routing to steel_matmul_regular * Refactor AddMM step 1 * Redirect steel_gemm * Update addmm * Comments and format * Some cleanup * Add architecture gen to device * Update no copy condition in normalization to account for axis size 1	2025-06-13 10:44:56 -07:00
Cheng	c8b4787e4e	CUDA backend: indexing ops (#2277 )	2025-06-12 21:44:19 -07:00
Awni Hannun	2188199ff8	[CUDA] ternary with select op (#2283 ) * cuda ternary with select op * comment + fix * fix	2025-06-12 20:24:43 -07:00
Awni Hannun	aa07429bad	Fix cuda build (#2284 )	2025-06-12 17:48:05 -07:00
Awni Hannun	918761a25a	[CUDA] RMSNorm and VJP (#2280 ) * rms norm start * nit	2025-06-12 17:09:49 -07:00
Cheng	a4fc671d3e	CUDA backend: compile (#2276 ) * CUDA backend: compile * Rename kernels/ to device/	2025-06-12 17:08:39 -07:00
Awni Hannun	f5f65ef48c	Make sliceUpdate general (#2282 ) * Make sliceUpdate general * fix	2025-06-12 16:48:54 -07:00
Cheng	c2dd81a8aa	Fix warnings from latest CUDA toolkit (#2275 )	2025-06-12 06:03:01 -07:00
Cheng	d7e680ffe4	CUDA backend: layernorm (#2271 )	2025-06-11 15:48:32 -07:00
Cheng	c371baf53a	CUDA backend: softmax (#2272 )	2025-06-11 13:55:22 -07:00
Cheng	ccf78f566c	CUDA backend: argreduce (#2270 )	2025-06-11 13:26:17 -07:00
Cheng	c9fa68664a	CUDA backend: reduce (#2269 )	2025-06-11 11:22:25 -07:00
Awni Hannun	c35f4d089a	start cuda circle config (#2256 ) * rebase * fix metal kernel linking issue on cuda * start cuda circle config	2025-06-10 21:19:47 -07:00
Angelos Katharopoulos	8590c0941e	Add load_safe to the general conv loaders (#2258 )	2025-06-10 20:58:16 -07:00
Cheng	095163b8d1	Fix building cpp benchmarks on Linux (#2268 )	2025-06-10 17:10:24 -07:00
Cheng	99c33d011d	rebase + nit (#2260 ) Co-authored-by: Awni Hannun <awni@apple.com>	2025-06-10 10:51:51 -07:00
Awni Hannun	62fecf3e13	fix conv export (#2265 )	2025-06-10 09:34:01 -07:00
Cheng	7c4eb5d03e	CUDA backend: random (#2261 )	2025-06-10 08:59:56 -07:00
Cheng	bae9a6b404	CUDA backend: sort (#2262 ) Co-authored-by: Awni Hannun <awni@apple.com>	2025-06-10 08:59:47 -07:00
Christopher Fleetwood	004c1d8ef2	Report number of missing parameters (#2264 ) * chore: inform * chore: format --------- Co-authored-by: FL33TW00D <FL33TW00D@users.noreply.github.com>	2025-06-10 06:37:50 -07:00
Cheng	7ebb2e0193	CUDA backend: binary ops (#2259 )	2025-06-10 06:37:40 -07:00
Awni Hannun	9ce77798b1	fix export to work with gather/scatter axis (#2263 )	2025-06-09 20:37:27 -07:00
Cheng	f8bad60609	CUDA backend: unary ops (#2158 )	2025-06-09 06:45:08 -07:00
Emmanuel Ferdman	5866b3857b	Refactor the lu test (#2250 ) Signed-off-by: Emmanuel Ferdman <emmanuelferdman@gmail.com>	2025-06-07 06:12:08 -07:00
Awni Hannun	1ca616844b	Fix unintuitive metal kernel caching (#2242 ) * Fix unintuitive metal kernel caching * alternative solution	2025-06-06 20:08:15 -07:00
Angelos Katharopoulos	2e8cf0b450	Change layernorms to two pass algorithm (#2246 )	2025-06-06 13:34:56 -07:00
Cheng	24f89173d1	CUDA backend: matmul (#2241 )	2025-06-06 12:24:04 -07:00
Awni Hannun	c6a20b427a	Improve metal elementwise kernels (#2247 ) * improve metal elementwise kernels * compile and copy * fix jit	2025-06-06 11:37:40 -07:00
Awni Hannun	a5ac9244c4	fix linux linking error (#2248 )	2025-06-06 10:41:51 -07:00
Awni Hannun	c763fe1be0	default strict mode for module update and update_modules (#2239 )	2025-06-05 15:27:02 -07:00
Cheng	52dc8c8cd5	Add profiler annotations in common primitives for CUDA backend (#2244 )	2025-06-04 19:55:12 -07:00
Angelos Katharopoulos	aede70e81d	Perf regression fix (#2243 )	2025-06-03 17:55:12 -07:00
Cheng	85a8beb5e4	Avoid atomic updates across CPU/GPU in CUDA event (#2231 )	2025-06-03 16:49:06 -07:00
Cheng	0bb89e9e5f	Share more common code in Compiled (#2240 ) * Share more common code in Compiled * Remove build_lib_name	2025-06-03 16:48:50 -07:00
Cheng	5685ceb3c7	Avoid invoking allocator::malloc when creating CUDA event (#2232 )	2025-06-03 16:48:40 -07:00
Suryash Malviya	0408ba0a76	Optimizing Complex Matrix Multiplication using Karatsuba’s Algorithm (#2220 ) * Implementing Complex Matmul using Karatsuba Algorithm * Implemented Karatsuba's Algorithm for complex matmul and pre-commit them * fix --------- Co-authored-by: Awni Hannun <awni@apple.com>	2025-06-02 15:58:46 -07:00
Awni Hannun	cbad6c3093	version (#2237 )	2025-06-02 15:58:33 -07:00
Cheng	1b021f6984	Fast primitives decide when to use the fallback (#2216 )	2025-06-02 13:26:37 -07:00
Cheng	95b7551d65	Do not check event.is_signaled() in eval_impl (#2230 )	2025-06-02 13:23:34 -07:00
Cheng	db5a7c6192	Add memory cache to CUDA backend (#2221 ) * Move BufferCache out of allocator * Add memory cache to cuda backend allocator * Simplify BufferCache assuming buf can not be null	2025-05-30 12:12:54 -07:00
Awni Hannun	6ef2f67e7f	5bit quants (#2226 ) * 5bit quants * 5bit quants	2025-05-30 12:12:10 -07:00
Cheng	f76ee1ffd2	Move some dims utils to common (#2223 )	2025-05-29 06:48:30 -07:00
Cheng	54a71f270a	Remove unused defines (#2217 )	2025-05-23 06:14:58 -07:00
Awni Hannun	55b4062dd8	copyright in docs (#2214 )	2025-05-21 17:13:04 -07:00
Cheng	79071bfba4	Fix out-of-bounds default value in logsumexp/softmax (#2213 )	2025-05-21 07:25:16 -07:00
Cheng	7774b87cbd	Remove redundant simd_sum in logsumexp (#2210 )	2025-05-21 07:25:03 -07:00
Cheng	35c87741cf	Build for compute capability 70 instead of 75 (#2209 )	2025-05-20 19:42:48 -07:00
Jack Wind	4cbe605214	Feat: Allow per-target Metal debug flags (#2201 ) * feat: allow per-target Metal debug flags * formatting fix	2025-05-20 10:22:26 -07:00
Clement Liaw	ab8883dd55	include mlx::core::version() symbols in the mlx static library (#2207 )	2025-05-20 07:39:11 -07:00
Awni Hannun	eebe73001a	fix large arg reduce (#2206 )	2025-05-19 13:10:44 -07:00
Angelos Katharopoulos	0359bf02c9	Nearest upsample (#2202 )	2025-05-19 11:23:38 -07:00
Cheng	237f9e58a8	Fix BEFORE keyword in target_include_directories (#2204 )	2025-05-19 06:10:44 -07:00
Awni Hannun	8576e6fe36	fix conv2d bug + faster conv 1d (#2195 ) * fix conv2d bug + faster conv 1d * revert sort + flaky test	2025-05-18 06:05:11 -07:00
Angelos Katharopoulos	0654543dcc	Add complex eigh (#2191 )	2025-05-18 00:18:43 -07:00
Awni Hannun	48ef3e74e2	reduce vjp for all and any (#2193 )	2025-05-16 08:38:49 -07:00
Cheng	7d4b378952	Include cuda_bf16.h for bfloat16 overloads (#2192 ) * Include cuda_bf16.h for bfloat16 overloads * Add NO_GPU_MULTI(Eig) in cuda backend	2025-05-16 06:44:42 -07:00
Jack Wind	7ff5c41e06	Add set_threadgroup_memory_length to CommandEncoder (#2183 )	2025-05-16 00:28:03 -07:00
Awni Hannun	602f43e3d1	fix conv grad (#2187 )	2025-05-15 19:20:36 -07:00
Awni Hannun	a2cadb8218	real and imag properties (#2189 )	2025-05-15 18:17:50 -07:00
Awni Hannun	c1eb9d05d9	non-symmetric eig and eigh (#2188 )	2025-05-15 13:01:44 -07:00
Angelos Katharopoulos	cf6c939e86	Fix some complex vjps (#2178 )	2025-05-14 23:37:12 -07:00
Angelos Katharopoulos	130df35e1b	Add random normal distribution for complex numbers (#2182 )	2025-05-13 22:43:45 -07:00
Cheng	0751263dec	Fix typo in row_reduce_small (#2179 )	2025-05-13 20:19:54 -07:00
Cheng	eca2f3eb97	Add remove_index utility (#2173 )	2025-05-13 17:09:56 -07:00
Angelos Katharopoulos	3aa9cf3f9e	Fix put_along_axis for empty arrays (#2181 )	2025-05-13 14:27:53 -07:00
Awni Hannun	8f3d208dce	Close a couple edge case bugs: hadamard and addmm on empty inputs (#2177 ) * handle hadamard and addmm on empty inputs * fix	2025-05-12 10:48:57 -07:00
Ivan Fioravanti	caaa3f1f8c	Small typos in mx.metal deprecations (#2176 )	2025-05-11 06:03:47 -07:00
Awni Hannun	659a51919f	patch bump (#2162 )	2025-05-09 14:35:14 -07:00
Awni Hannun	6661387066	Fix fft for integer overflow (#2161 )	2025-05-09 14:25:12 -07:00
ATurker	a7fae8a176	fix: conv_general differences between gpu, cpu (#2070 ) * fix general_conv padding * fix bugs * add test --------- Co-authored-by: Awni Hannun <awni@apple.com>	2025-05-09 10:26:52 -07:00
Cheng	0cae0bdac8	CUDA backend: backbone (#2075 )	2025-05-06 21:26:46 -07:00
Awni Hannun	5a1a5d5ed1	fix input coherent kernel launch (#2153 )	2025-05-05 17:30:50 -07:00
Cheng	1683975acf	Move common gpu primitives to backend/gpu (#2145 )	2025-05-05 13:45:29 -07:00
Awni Hannun	af705590ac	fix batched vector sdpa (#2152 )	2025-05-05 13:13:03 -07:00
Awni Hannun	825124af8f	fix bw for elementwise ops (#2151 ) * fix bw for elementwise ops * add compile * fix * fix * fix * fix	2025-05-05 06:15:04 -07:00
Awni Hannun	9c5e7da507	fix compile merging (#2150 )	2025-05-02 15:08:50 -07:00
Angelos Katharopoulos	481349495b	GPU Hadamard for large N (#1879 )	2025-05-01 17:19:17 -07:00
Awni Hannun	9daa6b003f	fix shapeless export (#2148 )	2025-05-01 15:02:02 -07:00
Angelos Katharopoulos	a3a632d567	Fix the launcher when ran locally (#2147 )	2025-05-01 12:56:09 -07:00
Awni Hannun	e496c5a4b4	fix integer overflow in qmm (#2143 )	2025-04-30 09:28:56 -07:00
Cheng	ea890d8710	Remove metal-only tests (#2139 )	2025-04-30 09:08:39 -07:00
Awni Hannun	aa5d84f102	Allow quant layer to be unfrozen (#2142 )	2025-04-30 09:08:29 -07:00
Awni Hannun	f1606486d2	Generalize gpu backend (#2138 ) * generalize gpu backend * fix no_gpu build * fix no_gpu build * generalize gpu backend	2025-04-30 09:08:17 -07:00
Cheng	87720a8908	Fix building with uv (#2141 )	2025-04-30 06:04:07 -07:00
Aashiq Dheeraj	bb6565ef14	add fftshift and ifftshift fft helpers (#2135 ) * add fftshift and ifftshift fft helpers * address comments * axes have to be iterable * fix fp error in roll + add test --------- Co-authored-by: Aashiq Dheeraj <aashiq@aashiq-mbp-m4.local>	2025-04-29 22:13:45 -07:00
Awni Hannun	7bb063bcb3	Enable vjp for quantized scale and bias (#2129 ) * Enable vjp for quantized scale and bias * higher tol	2025-04-29 13:03:09 -07:00
Alex Chi Z.	b36dd472bb	return library if it is successfully loaded (#2131 )	2025-04-29 07:30:36 -07:00
hdeng-apple	167b759a38	Fix typos (#2136 )	2025-04-29 07:26:05 -07:00
charan-003	99b9868859	Clarify dimension notation in conv1d, conv2d, and conv3d docstrings (#2123 ) * Clarify dimension notation in conv1d, conv2d, and conv3d docstrings * Updating transposed convs in conv1d, conv2d, and conv3d --------- Co-authored-by: Sai Charan Arvapally <saicharan@Sais-MacBook-Pro.local>	2025-04-25 12:18:30 -07:00
1ndig0	6b2d5448f2	Fix the error message in `mx.right_shift` and `mx.left_shift` (#2121 ) * update right_shift and lef_shift * simplify --------- Co-authored-by: Awni Hannun <awni@apple.com>	2025-04-25 09:14:28 -07:00
Awni Hannun	eaf709b83e	patch (#2119 )	2025-04-24 16:11:07 -07:00
Angelos Katharopoulos	f0e70afff0	Fix swift pm load (#2117 )	2025-04-24 10:58:29 -07:00
hdeng-apple	86984cad68	Remove static initializers (#2059 ) * Remove static initializers in device.cpp, load.cpp, pocketfft.h * Remove static initializer InTracing::trace_stack * Remove static initializer of CompilerCache cache * Revert changes in pocketfft.h * Remove duplicate private section of thread_pool()	2025-04-24 06:14:49 -07:00
Awni Hannun	fbc89e3ced	fix pinv (#2110 )	2025-04-23 13:08:28 -07:00
hdeng-apple	38c1e720c2	Search mlx.metallib in macOS framework "Resources" dir (#2061 ) --------- Co-authored-by: Angelos Katharopoulos <a_katharopoulos@apple.com>	2025-04-23 09:53:13 -07:00
Param Thakkar	600e87e03c	Added output_padding parameters in conv_transpose (#2092 )	2025-04-23 09:26:33 -07:00
Hyunsung Lee	3836445241	Add broadcast_shapes in python API (#2091 )	2025-04-22 18:57:39 -07:00
Yury Popov	1d2c9d6a07	Complex scan (#2094 )	2025-04-22 18:56:28 -07:00
Awni Hannun	e8ac6bd2f5	irfft throws instead of segfaults on scalars (#2109 )	2025-04-22 10:25:55 -07:00
Awni Hannun	fdadc4f22c	Add more complex unary ops (#2101 )	2025-04-21 13:04:54 -07:00
Awni Hannun	79b527f45f	conv vmap (#2102 )	2025-04-21 13:04:39 -07:00
Awni Hannun	dc4eada7f0	Use unordered map for kwargs in export/import (#2087 ) * use unordered map for kwargs in export/import * comment	2025-04-21 07:17:22 -07:00
Cheng	70ebc3b598	Return const ref in array::data_shared_ptr (#2100 )	2025-04-21 07:17:09 -07:00
Cheng	b13f2aed16	Introduce macros for dispatching dynamic dtypes as static types (#2073 )	2025-04-19 06:16:30 -07:00
Param Thakkar	5f04c0f818	Fixed shift operations issue (#2080 ) * Fixed shift operations issue * Added tests and fixes * Fixed loop syntax error * Added tests for bool * Fixed typo	2025-04-18 14:28:33 -07:00
Awni Hannun	55935ccae7	fix py gc edge case (#2079 )	2025-04-18 12:46:53 -07:00
Awni Hannun	b529515eb1	minor bump (#2081 )	2025-04-17 14:57:11 -07:00
Angelos Katharopoulos	3cde719eb7	Route to gather qmm only for many tokens per expert (#2082 )	2025-04-17 14:53:08 -07:00
Angelos Katharopoulos	5de6d94a90	Gather qmm batched kernel and refactoring of quantized (#2078 )	2025-04-17 13:53:11 -07:00
Angelos Katharopoulos	99eefd2ec0	Gather mm new kernel and small refactoring (#2040 )	2025-04-14 16:37:36 -07:00