Fix update_modules() when providing a subset (#2308 )

Cuda perf tuning (#2307 )
* perf tuning * fix adding inputs arrays in matmul / srot * format * fix
2025-06-24 09:21:16 +08:00 · 2025-06-20 17:19:46 -07:00 · 2025-06-20 14:50:57 -07:00 · 2025-06-19 15:26:36 -07:00 · 2025-06-17 23:55:56 -07:00 · 2025-06-17 12:03:25 -07:00
331 changed files with 22477 additions and 5329 deletions
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@ -16,6 +16,9 @@ parameters:
  linux_release:
    type: boolean
    default: false
+  cuda_release:
+    type: boolean
+    default: false

 jobs:
  build_documentation:
@ -104,7 +107,7 @@ jobs:
          command: |
            echo "stubs"
            pip install typing_extensions
-            python setup.py generate_stubs 
+            python setup.py generate_stubs
      - run:
          name: Run Python tests
          command: |
@ -162,7 +165,7 @@ jobs:
          command: |
            source env/bin/activate
            pip install typing_extensions
-            python setup.py generate_stubs 
+            python setup.py generate_stubs
      - run:
          name: Run Python tests
          command: |
@ -212,6 +215,29 @@ jobs:
              METAL_DEBUG_ERROR_MODE=0 \
              python -m xmlrunner discover -v python/tests -o test-results/gpu_jit

+  cuda_build_and_test:  # builds with -DMLX_BUILD_CUDA=ON, then runs python/tests on cpu and gpu
+    machine:
+      image: linux-cuda-12:default
+      resource_class: gpu.nvidia.small.gen2
+    steps:
+      - checkout
+      - run:
+          name: Install Python package
+          command: |
+            sudo apt-get update
+            sudo apt-get install libblas-dev liblapack-dev liblapacke-dev
+            python -m venv env
+            source env/bin/activate
+            CMAKE_BUILD_PARALLEL_LEVEL=`nproc` \
+              CMAKE_ARGS="-DMLX_BUILD_CUDA=ON -DCMAKE_CUDA_COMPILER=`which nvcc`" \
+              pip install -e ".[dev]"
+      - run:
+          name: Run Python tests
+          command: |
+            source env/bin/activate
+            LOW_MEMORY=1 DEVICE=cpu python -m unittest discover python/tests -v
+            LOW_MEMORY=1 DEVICE=gpu python -m unittest discover python/tests -v
+
  build_release:
    parameters:
      python_version:
@ -251,7 +277,7 @@ jobs:
          name: Install Python package
          command: |
            source env/bin/activate
-            DEV_RELEASE=1 \
+            env -u MACOSX_DEPLOYMENT_TARGET DEV_RELEASE=1 \
              CMAKE_BUILD_PARALLEL_LEVEL=`sysctl -n hw.ncpu` \
              pip install . -v
      - run:
@ -259,7 +285,7 @@ jobs:
          command: |
            source env/bin/activate
            pip install typing_extensions
-            python setup.py generate_stubs 
+            python setup.py generate_stubs
      - run:
          name: Build Python package
          command: |
@ -318,7 +344,7 @@ jobs:
              CMAKE_BUILD_PARALLEL_LEVEL=`nproc` \
              pip install . -v
            pip install typing_extensions
-            python setup.py generate_stubs 
+            python setup.py generate_stubs
            << parameters.extra_env >> \
              CMAKE_BUILD_PARALLEL_LEVEL=`nproc` \
              python -m build --wheel
@ -332,6 +358,48 @@ jobs:
      - store_artifacts:
          path: wheelhouse/

+  build_cuda_release:
+    parameters:
+      python_version:
+        type: string
+        default: "3.9"
+      extra_env:
+        type: string
+        default: "DEV_RELEASE=1"
+    machine:
+      image: linux-cuda-12:default
+      resource_class: gpu.nvidia.small.gen2
+    steps:
+      - checkout
+      - run:
+          name: Build wheel
+          command: |
+            sudo apt-get update
+            sudo apt-get install libblas-dev liblapack-dev liblapacke-dev
+            python -m venv env
+            source env/bin/activate
+            pip install auditwheel
+            pip install patchelf
+            pip install build
+            pip install twine
+            << parameters.extra_env >> \
+              CMAKE_BUILD_PARALLEL_LEVEL=`nproc` \
+              CMAKE_ARGS="-DMLX_BUILD_CUDA=ON -DCMAKE_CUDA_COMPILER=`which nvcc`" \
+              pip install ".[dev]" -v
+            python setup.py generate_stubs
+            << parameters.extra_env >> \
+              CMAKE_BUILD_PARALLEL_LEVEL=`nproc` \
+              CMAKE_ARGS="-DMLX_BUILD_CUDA=ON -DCMAKE_CUDA_COMPILER=`which nvcc`" \
+              python -m build --wheel
+            bash python/scripts/repair_cuda.sh
+      - run:
+          name: Upload package
+          command: |
+            source env/bin/activate
+            twine upload wheelhouse/*.whl
+      - store_artifacts:
+          path: wheelhouse/
+
 workflows:
  build_and_test:
    when:
@ -348,6 +416,7 @@ workflows:
            parameters:
              macosx_deployment_target: ["13.5", "14.0"]
      - linux_build_and_test
+      - cuda_build_and_test 
      - build_documentation 

  build_pypi_release:
@ -368,6 +437,68 @@ workflows:
              python_version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
              macosx_deployment_target: ["13.5", "14.0", "15.0"]
              build_env: ["PYPI_RELEASE=1"]
+              xcode_version: ["16.2.0", "15.0.0"]
+            exclude:
+              - macosx_deployment_target: "13.5"
+                xcode_version: "16.2.0"
+                python_version: "3.9"
+                build_env: "PYPI_RELEASE=1"
+              - macosx_deployment_target: "13.5"
+                xcode_version: "16.2.0"
+                python_version: "3.10"
+                build_env: "PYPI_RELEASE=1"
+              - macosx_deployment_target: "13.5"
+                xcode_version: "16.2.0"
+                python_version: "3.11"
+                build_env: "PYPI_RELEASE=1"
+              - macosx_deployment_target: "13.5"
+                xcode_version: "16.2.0"
+                python_version: "3.12"
+                build_env: "PYPI_RELEASE=1"
+              - macosx_deployment_target: "13.5"
+                xcode_version: "16.2.0"
+                python_version: "3.13"
+                build_env: "PYPI_RELEASE=1"
+              - macosx_deployment_target: "14.0"
+                xcode_version: "15.0.0"
+                python_version: "3.9"
+                build_env: "PYPI_RELEASE=1"
+              - macosx_deployment_target: "14.0"
+                xcode_version: "15.0.0"
+                python_version: "3.10"
+                build_env: "PYPI_RELEASE=1"
+              - macosx_deployment_target: "14.0"
+                xcode_version: "15.0.0"
+                python_version: "3.11"
+                build_env: "PYPI_RELEASE=1"
+              - macosx_deployment_target: "14.0"
+                xcode_version: "15.0.0"
+                python_version: "3.12"
+                build_env: "PYPI_RELEASE=1"
+              - macosx_deployment_target: "14.0"
+                xcode_version: "15.0.0"
+                python_version: "3.13"
+                build_env: "PYPI_RELEASE=1"
+              - macosx_deployment_target: "15.0"
+                xcode_version: "15.0.0"
+                python_version: "3.9"
+                build_env: "PYPI_RELEASE=1"
+              - macosx_deployment_target: "15.0"
+                xcode_version: "15.0.0"
+                python_version: "3.10"
+                build_env: "PYPI_RELEASE=1"
+              - macosx_deployment_target: "15.0"
+                xcode_version: "15.0.0"
+                python_version: "3.11"
+                build_env: "PYPI_RELEASE=1"
+              - macosx_deployment_target: "15.0"
+                xcode_version: "15.0.0"
+                python_version: "3.12"
+                build_env: "PYPI_RELEASE=1"
+              - macosx_deployment_target: "15.0"
+                xcode_version: "15.0.0"
+                python_version: "3.13"
+                build_env: "PYPI_RELEASE=1"
      - build_documentation:
          filters:
            tags:
@ -393,6 +524,8 @@ workflows:
              macosx_deployment_target: ["13.5", "14.0"]
      - linux_build_and_test:
          requires: [ hold ]
+      - cuda_build_and_test:
+          requires: [ hold ]
  nightly_build:
    when:
      and:
@ -404,6 +537,53 @@ workflows:
            parameters:
              python_version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
              macosx_deployment_target: ["13.5", "14.0", "15.0"]
+              xcode_version: ["16.2.0", "15.0.0"]
+            exclude:
+              - macosx_deployment_target: "13.5"
+                xcode_version: "16.2.0"
+                python_version: "3.9"
+              - macosx_deployment_target: "13.5"
+                xcode_version: "16.2.0"
+                python_version: "3.10"
+              - macosx_deployment_target: "13.5"
+                xcode_version: "16.2.0"
+                python_version: "3.11"
+              - macosx_deployment_target: "13.5"
+                xcode_version: "16.2.0"
+                python_version: "3.12"
+              - macosx_deployment_target: "13.5"
+                xcode_version: "16.2.0"
+                python_version: "3.13"
+              - macosx_deployment_target: "14.0"
+                xcode_version: "15.0.0"
+                python_version: "3.9"
+              - macosx_deployment_target: "14.0"
+                xcode_version: "15.0.0"
+                python_version: "3.10"
+              - macosx_deployment_target: "14.0"
+                xcode_version: "15.0.0"
+                python_version: "3.11"
+              - macosx_deployment_target: "14.0"
+                xcode_version: "15.0.0"
+                python_version: "3.12"
+              - macosx_deployment_target: "14.0"
+                xcode_version: "15.0.0"
+                python_version: "3.13"
+              - macosx_deployment_target: "15.0"
+                xcode_version: "15.0.0"
+                python_version: "3.9"
+              - macosx_deployment_target: "15.0"
+                xcode_version: "15.0.0"
+                python_version: "3.10"
+              - macosx_deployment_target: "15.0"
+                xcode_version: "15.0.0"
+                python_version: "3.11"
+              - macosx_deployment_target: "15.0"
+                xcode_version: "15.0.0"
+                python_version: "3.12"
+              - macosx_deployment_target: "15.0"
+                xcode_version: "15.0.0"
+                python_version: "3.13"
  weekly_build:
    when:
      and:
@ -416,6 +596,68 @@ workflows:
              python_version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
              macosx_deployment_target: ["13.5", "14.0", "15.0"]
              build_env: ["DEV_RELEASE=1"]
+              xcode_version: ["16.2.0", "15.0.0"]
+            exclude:
+              - macosx_deployment_target: "13.5"
+                xcode_version: "16.2.0"
+                python_version: "3.9"
+                build_env: "DEV_RELEASE=1"
+              - macosx_deployment_target: "13.5"
+                xcode_version: "16.2.0"
+                python_version: "3.10"
+                build_env: "DEV_RELEASE=1"
+              - macosx_deployment_target: "13.5"
+                xcode_version: "16.2.0"
+                python_version: "3.11"
+                build_env: "DEV_RELEASE=1"
+              - macosx_deployment_target: "13.5"
+                xcode_version: "16.2.0"
+                python_version: "3.12"
+                build_env: "DEV_RELEASE=1"
+              - macosx_deployment_target: "13.5"
+                xcode_version: "16.2.0"
+                python_version: "3.13"
+                build_env: "DEV_RELEASE=1"
+              - macosx_deployment_target: "14.0"
+                xcode_version: "15.0.0"
+                python_version: "3.9"
+                build_env: "DEV_RELEASE=1"
+              - macosx_deployment_target: "14.0"
+                xcode_version: "15.0.0"
+                python_version: "3.10"
+                build_env: "DEV_RELEASE=1"
+              - macosx_deployment_target: "14.0"
+                xcode_version: "15.0.0"
+                python_version: "3.11"
+                build_env: "DEV_RELEASE=1"
+              - macosx_deployment_target: "14.0"
+                xcode_version: "15.0.0"
+                python_version: "3.12"
+                build_env: "DEV_RELEASE=1"
+              - macosx_deployment_target: "14.0"
+                xcode_version: "15.0.0"
+                python_version: "3.13"
+                build_env: "DEV_RELEASE=1"
+              - macosx_deployment_target: "15.0"
+                xcode_version: "15.0.0"
+                python_version: "3.9"
+                build_env: "DEV_RELEASE=1"
+              - macosx_deployment_target: "15.0"
+                xcode_version: "15.0.0"
+                python_version: "3.10"
+                build_env: "DEV_RELEASE=1"
+              - macosx_deployment_target: "15.0"
+                xcode_version: "15.0.0"
+                python_version: "3.11"
+                build_env: "DEV_RELEASE=1"
+              - macosx_deployment_target: "15.0"
+                xcode_version: "15.0.0"
+                python_version: "3.12"
+                build_env: "DEV_RELEASE=1"
+              - macosx_deployment_target: "15.0"
+                xcode_version: "15.0.0"
+                python_version: "3.13"
+                build_env: "DEV_RELEASE=1"
  linux_test_release:
    when:
      and:
@ -427,3 +669,14 @@ workflows:
            parameters:
              python_version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
              extra_env: ["PYPI_RELEASE=1"]
+  cuda_test_release:
+    when:
+      and:
+        - equal: [ main, << pipeline.git.branch >> ]
+        - << pipeline.parameters.cuda_release >>
+    jobs:
+      - build_cuda_release:
+          matrix:
+            parameters:
+              python_version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
+              extra_env: ["PYPI_RELEASE=1"]
--- a/.gitignore
+++ b/.gitignore
@ -36,6 +36,7 @@ share/python-wheels/
 .installed.cfg
 *.egg
 MANIFEST
+uv.lock

 # vim
 *.swp
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -34,6 +34,7 @@ option(MLX_BUILD_BENCHMARKS "Build benchmarks for mlx" OFF)
 option(MLX_BUILD_PYTHON_BINDINGS "Build python bindings for mlx" OFF)
 option(MLX_BUILD_METAL "Build metal backend" ON)
 option(MLX_BUILD_CPU "Build cpu backend" ON)
+option(MLX_BUILD_CUDA "Build cuda backend" OFF)
 option(MLX_METAL_DEBUG "Enhance metal debug workflow" OFF)
 option(MLX_ENABLE_X64_MAC "Enable building for x64 macOS" OFF)
 option(MLX_BUILD_GGUF "Include support for GGUF format" ON)
@ -83,6 +84,10 @@ if(MLX_BUILD_METAL)
  set(QUARTZ_LIB "-framework QuartzCore")
 endif()

+if(MLX_BUILD_CUDA)
+  enable_language(CUDA)
+endif()
+
 if(MLX_BUILD_METAL AND NOT METAL_LIB)
  message(STATUS "Metal not found. Unable to build GPU")
  set(MLX_BUILD_METAL OFF)
@ -226,6 +231,9 @@ target_include_directories(
  mlx PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_LIST_DIR}>
             $<INSTALL_INTERFACE:include>)

+# Do not add mlx_EXPORTS define for shared library.
+set_target_properties(mlx PROPERTIES DEFINE_SYMBOL "")
+
 FetchContent_Declare(
  fmt
  GIT_REPOSITORY https://github.com/fmtlib/fmt.git
--- a/MANIFEST.in
+++ b/MANIFEST.in
@ -1,4 +1,6 @@
 include CMakeLists.txt
+include mlx.pc.in
 recursive-include mlx/ *
+include cmake/*
 include python/src/*
 include python/mlx/py.typed # support type hinting as in PEP-561
--- a/benchmarks/cpp/irregular_strides.cpp
+++ b/benchmarks/cpp/irregular_strides.cpp
@ -1,5 +1,6 @@
 // Copyright © 2023 Apple Inc.

+#include <cstring>
 #include <iostream>
 #include <sstream>

--- a/benchmarks/python/conv_unaligned_bench.py
+++ b/benchmarks/python/conv_unaligned_bench.py
@ -0,0 +1,107 @@
+import math
+import time
+
+import mlx.core as mx
+import numpy as np
+import torch
+
+N_warmup = 10
+N_iter_bench = 100
+N_iter_func = 5
+
+
+def bench(f, a, b):
+    for i in range(N_warmup):
+        f(a, b)
+    torch.mps.synchronize()
+
+    s = time.perf_counter_ns()
+    for i in range(N_iter_bench):
+        f(a, b)
+    e = time.perf_counter_ns()
+    return (e - s) * 1e-9
+
+
+def make_mx_conv_2D(strides=(1, 1), padding=(0, 0), groups=1):
+    def mx_conv_2D(a, b):
+        ys = []
+        for i in range(N_iter_func):
+            y = mx.conv2d(a, b, stride=strides, padding=padding, groups=groups)
+            ys.append(y)
+        mx.eval(ys)
+        return ys
+
+    return mx_conv_2D
+
+
+def make_pt_conv_2D(strides=(1, 1), padding=(0, 0), groups=1):
+    @torch.no_grad()
+    def pt_conv_2D(a, b):
+        ys = []
+        for i in range(N_iter_func):
+            y = torch.conv2d(a, b, stride=strides, padding=padding, groups=groups)
+            ys.append(y)
+        torch.mps.synchronize()
+        return ys
+
+    return pt_conv_2D
+
+
+def bench_shape(N, H, W, C, kH, kW, O, strides, padding, groups, np_dtype):
+    scale = 1.0 / math.sqrt(kH * kW * C)  # fan-in: kernel height x width x channels
+    a_np = np.random.uniform(0, 0.5, (N, H, W, C)).astype(np_dtype)
+    b_np = np.random.uniform(-scale, scale, (O, kH, kW, int(C / groups))).astype(
+        np_dtype
+    )
+
+    a_mx = mx.array(a_np)
+    b_mx = mx.array(b_np)
+
+    a_pt = torch.from_numpy(a_np.transpose((0, 3, 1, 2))).to("mps")
+    b_pt = torch.from_numpy(b_np.transpose((0, 3, 1, 2))).to("mps")
+
+    torch.mps.synchronize()
+
+    f_mx = make_mx_conv_2D(strides, padding, groups)
+    f_pt = make_pt_conv_2D(strides, padding, groups)
+
+    time_torch = bench(f_pt, a_pt, b_pt)
+    time_mlx = bench(f_mx, a_mx, b_mx)
+
+    out_mx = mx.conv2d(a_mx, b_mx, stride=strides, padding=padding, groups=groups)
+    out_pt = torch.conv2d(
+        a_pt.to("cpu"), b_pt.to("cpu"), stride=strides, padding=padding, groups=groups
+    )
+    out_pt = torch.permute(out_pt, (0, 2, 3, 1))
+    out_pt = out_pt.numpy(force=True)
+
+    atol = 2e-5 if np_dtype == np.float32 else 1e-4
+
+    if not np.allclose(out_pt, out_mx, atol=atol):
+        print(
+            f"Failed at {(N, H, W, C)}, {(O, kH, kW, C)} [strides = {strides}, padding = {padding}, groups = {groups}] with max(|a - b|) = {np.max(np.abs(out_pt - out_mx))}"
+        )
+
+    return time_mlx, time_torch
+
+
+if __name__ == "__main__":
+    dtype = "float32"
+    shapes = (
+        (4, 32, 32, 21, 3, 3, 128),
+        (4, 32, 32, 21, 3, 3, 37),
+        (4, 32, 32, 370, 3, 3, 370),
+        (4, 32, 32, 370, 7, 7, 128),
+        (2, 320, 640, 21, 7, 7, 21),
+    )
+    for N, H, W, C, kh, kw, O in shapes:
+        time_mlx, time_torch = bench_shape(
+            N, H, W, C, kh, kw, O, (1, 1), (0, 0), 1, dtype
+        )
+        diff = time_torch / time_mlx - 1.0
+
+        print(
+            f"({N}, {H:3d}, {W:3d}, {C:3d}), ({O:3d}, {kh:2d}, {kw:2d}, {C:3d}), {dtype}, {100. * diff:+5.2f}%"
+        )
+        if time_mlx >= 2.0 * time_torch:
+            print("ATTENTION ^^^^^^^")
--- a/benchmarks/python/gather_mm_bench.py
+++ b/benchmarks/python/gather_mm_bench.py
@ -0,0 +1,74 @@
+# Copyright © 2025 Apple Inc.
+
+import mlx.core as mx
+from time_utils import time_fn
+
+N = 1024
+D = 1024
+M = 1024
+E = 32
+I = 4
+
+
+def gather_sort(x, indices):
+    N, M = indices.shape
+    indices = indices.flatten()
+    order = mx.argsort(indices)
+    inv_order = mx.argsort(order)
+    return x.flatten(0, -3)[order // M], indices[order], inv_order
+
+
+def scatter_unsort(x, inv_order, shape=None):
+    x = x[inv_order]
+    if shape is not None:
+        x = mx.unflatten(x, 0, shape)
+    return x
+
+
+def gather_mm_simulate(x, w, indices):
+    x, idx, inv_order = gather_sort(x, indices)
+    for i in range(2):
+        y = mx.concatenate([x[i] @ w[j].T for i, j in enumerate(idx.tolist())], axis=0)
+        x = y[:, None]
+    x = scatter_unsort(x, inv_order, indices.shape)
+    return x
+
+
+def time_gather_mm():
+    x = mx.random.normal((N, 1, 1, D)) / 1024**0.5
+    w1 = mx.random.normal((E, M, D)) / 1024**0.5
+    w2 = mx.random.normal((E, D, M)) / 1024**0.5
+    indices = (mx.random.uniform(shape=(N, I)) * E).astype(mx.uint32)
+    sorted_indices = mx.sort(indices.flatten()).reshape(N, I)
+    mx.eval(x, w1, w2, indices, sorted_indices)
+
+    def gather_mm(x, w1, w2, indices, sort):
+        idx = indices
+        inv_order = None
+        if sort:
+            x, idx, inv_order = gather_sort(x, indices)
+        x = mx.gather_mm(x, w1.swapaxes(-1, -2), rhs_indices=idx, sorted_indices=sort)
+        x = mx.gather_mm(x, w2.swapaxes(-1, -2), rhs_indices=idx, sorted_indices=sort)
+        if sort:
+            x = scatter_unsort(x, inv_order, indices.shape)
+        return x
+
+    time_fn(gather_mm, x, w1, w2, indices, False)
+    time_fn(gather_mm, x, w1, w2, sorted_indices, False)
+    time_fn(gather_mm, x, w1, w2, indices, True)
+
+    x = mx.random.normal((N * I, D)) / 1024**0.5
+    w1 = mx.random.normal((M, D)) / 1024**0.5
+    w2 = mx.random.normal((D, M)) / 1024**0.5
+    mx.eval(x, w1, w2)
+
+    def equivalent_matmul(x, w1, w2):
+        x = x @ w1.T
+        x = x @ w2.T
+        return x
+
+    time_fn(equivalent_matmul, x, w1, w2)
+
+
+if __name__ == "__main__":
+    time_gather_mm()
--- a/benchmarks/python/gather_qmm_bench.py
+++ b/benchmarks/python/gather_qmm_bench.py
@ -0,0 +1,84 @@
+# Copyright © 2025 Apple Inc.
+
+import mlx.core as mx
+from time_utils import time_fn
+
+N = 1024
+D = 1024
+M = 1024
+E = 32
+I = 4
+
+
+def gather_sort(x, indices):
+    N, M = indices.shape
+    indices = indices.flatten()
+    order = mx.argsort(indices)
+    inv_order = mx.argsort(order)
+    return x.flatten(0, -3)[order // M], indices[order], inv_order
+
+
+def scatter_unsort(x, inv_order, shape=None):
+    x = x[inv_order]
+    if shape is not None:
+        x = mx.unflatten(x, 0, shape)
+    return x
+
+
+def gather_mm_simulate(x, w, indices):
+    x, idx, inv_order = gather_sort(x, indices)
+    for i in range(2):
+        y = mx.concatenate(
+            [
+                mx.quantized_matmul(x[i], w[0][j], w[1][j], w[2][j], transpose=True)
+                for i, j in enumerate(idx.tolist())
+            ],
+            axis=0,
+        )
+        x = y[:, None]
+    x = scatter_unsort(x, inv_order, indices.shape)
+    return x
+
+
+def time_gather_qmm():
+    x = mx.random.normal((N, 1, 1, D)) / 1024**0.5
+    w1 = mx.random.normal((E, M, D)) / 1024**0.5
+    w2 = mx.random.normal((E, D, M)) / 1024**0.5
+    w1 = mx.quantize(w1)
+    w2 = mx.quantize(w2)
+    indices = (mx.random.uniform(shape=(N, I)) * E).astype(mx.uint32)
+    sorted_indices = mx.sort(indices.flatten()).reshape(N, I)
+    mx.eval(x, w1, w2, indices, sorted_indices)
+
+    def gather_mm(x, w1, w2, indices, sort):
+        idx = indices
+        inv_order = None
+        if sort:
+            x, idx, inv_order = gather_sort(x, indices)
+        x = mx.gather_qmm(x, *w1, transpose=True, rhs_indices=idx, sorted_indices=sort)
+        x = mx.gather_qmm(x, *w2, transpose=True, rhs_indices=idx, sorted_indices=sort)
+        if sort:
+            x = scatter_unsort(x, inv_order, indices.shape)
+        return x
+
+    time_fn(gather_mm, x, w1, w2, indices, False)
+    time_fn(gather_mm, x, w1, w2, sorted_indices, False)
+    time_fn(gather_mm, x, w1, w2, indices, True)
+
+    x = mx.random.normal((N * I, D)) / 1024**0.5
+    w1 = mx.random.normal((M, D)) / 1024**0.5
+    w2 = mx.random.normal((D, M)) / 1024**0.5
+    w1 = mx.quantize(w1)
+    w2 = mx.quantize(w2)
+    mx.eval(x, w1, w2)
+
+    def equivalent_matmul(x, w1, w2):
+        x = mx.quantized_matmul(x, *w1, transpose=True)
+        x = mx.quantized_matmul(x, *w2, transpose=True)
+        return x
+
+    time_fn(equivalent_matmul, x, w1, w2)
+
+
+if __name__ == "__main__":
+    time_gather_qmm()
--- a/benchmarks/python/layer_norm_bench.py
+++ b/benchmarks/python/layer_norm_bench.py
@ -1,5 +1,7 @@
 # Copyright © 2023-2024 Apple Inc.

+from functools import partial
+
 import mlx.core as mx
 import mlx.nn as nn
 from time_utils import time_fn
@ -18,51 +20,63 @@ def layer_norm(x, w, b, eps):
    return y


-def time_layer_norm():
+def time_layer_norm(N, dt):
+    L = 1024
    f1 = lambda x, w, b, y: (layer_norm(x, w, b, 1e-5) * y).sum()
    f2 = lambda x, w, b, y: (mx.fast.layer_norm(x, w, b, 1e-5) * y).sum()
    g1 = mx.grad(f1, argnums=(0, 1, 2))
    g2 = mx.grad(f2, argnums=(0, 1, 2))

-    x = mx.random.uniform(shape=(8, 1024, 4096)).astype(mx.float16)
-    w = mx.random.uniform(shape=(4096,)).astype(mx.float16)
-    b = mx.random.uniform(shape=(4096,)).astype(mx.float16)
-    y = mx.random.uniform(shape=(8, 1024, 4096)).astype(mx.float16)
+    x = mx.random.uniform(shape=(8, L, N)).astype(dt)
+    w = mx.random.uniform(shape=(N,)).astype(dt)
+    b = mx.random.uniform(shape=(N,)).astype(dt)
+    y = mx.random.uniform(shape=(8, L, N)).astype(dt)
    mx.eval(x, w, b, y)

-    def layer_norm_loop(g, x, w, b):
+    def layer_norm_loop(f, x, w, b):
+        for _ in range(32):
+            x = f(x, w, b)
+        return x
+
+    time_fn(layer_norm_loop, partial(layer_norm, eps=1e-5), x, w, b)
+    time_fn(layer_norm_loop, partial(mx.fast.layer_norm, eps=1e-5), x, w, b)
+
+    def layer_norm_grad_loop(g, x, w, b):
        gx, gw, gb = x, w, b
        for _ in range(32):
            gx, gw, gb = g(gx, gw, gb, y)
        return gx, gw, gb

-    time_fn(layer_norm_loop, g1, x, w, b)
-    time_fn(layer_norm_loop, g2, x, w, b)
-    time_fn(layer_norm_loop, mx.compile(g1), x, w, b)
-    time_fn(layer_norm_loop, mx.compile(g2), x, w, b)
+    time_fn(layer_norm_grad_loop, g1, x, w, b)
+    time_fn(layer_norm_grad_loop, g2, x, w, b)
+    time_fn(layer_norm_grad_loop, mx.compile(g1), x, w, b)
+    time_fn(layer_norm_grad_loop, mx.compile(g2), x, w, b)

    f1 = lambda x, y: (layer_norm(x, None, None, 1e-5) * y).sum()
    f2 = lambda x, y: (mx.fast.layer_norm(x, None, None, 1e-5) * y).sum()
    g1 = mx.grad(f1, argnums=(0,))
    g2 = mx.grad(f2, argnums=(0,))

-    x = mx.random.uniform(shape=(8, 1024, 4096)).astype(mx.float16)
-    w = mx.random.uniform(shape=(4096,)).astype(mx.float16)
-    b = mx.random.uniform(shape=(4096,)).astype(mx.float16)
-    y = mx.random.uniform(shape=(8, 1024, 4096)).astype(mx.float16)
+    x = mx.random.uniform(shape=(8, L, N)).astype(dt)
+    w = mx.random.uniform(shape=(N,)).astype(dt)
+    b = mx.random.uniform(shape=(N,)).astype(dt)
+    y = mx.random.uniform(shape=(8, L, N)).astype(dt)
    mx.eval(x, w, b, y)

-    def layer_norm_loop(g, x):
+    def layer_norm_grad_x_loop(g, x):
        gx = x
        for _ in range(32):
            gx = g(gx, y)
        return gx

-    time_fn(layer_norm_loop, g1, x)
-    time_fn(layer_norm_loop, g2, x)
-    time_fn(layer_norm_loop, mx.compile(g1), x)
-    time_fn(layer_norm_loop, mx.compile(g2), x)
+    time_fn(layer_norm_grad_x_loop, g1, x)
+    time_fn(layer_norm_grad_x_loop, g2, x)
+    time_fn(layer_norm_grad_x_loop, mx.compile(g1), x)
+    time_fn(layer_norm_grad_x_loop, mx.compile(g2), x)


 if __name__ == "__main__":
-    time_layer_norm()
+    for dt in [mx.float32, mx.float16, mx.bfloat16]:
+        for n in [1024, 2048, 4096, 8192, 8192 + 1024]:
+            print(dt, n)
+            time_layer_norm(n, dt)
--- a/cmake/extension.cmake
+++ b/cmake/extension.cmake
@ -11,13 +11,14 @@ include(CMakeParseArguments)
 # Args: TARGET: Custom target to be added for the metal library TITLE: Name of
 # the .metallib OUTPUT_DIRECTORY: Where to place ${TITLE}.metallib SOURCES: List
 # of source files INCLUDE_DIRS: List of include dirs DEPS: List of dependency
-# files (like headers)
+# files (like headers) DEBUG: Boolean, if true, enables debug compile options
+# for this specific library. If not provided, uses global MLX_METAL_DEBUG.
 #
 # clang format on

 macro(mlx_build_metallib)
  # Parse args
-  set(oneValueArgs TARGET TITLE OUTPUT_DIRECTORY)
+  set(oneValueArgs TARGET TITLE OUTPUT_DIRECTORY DEBUG)
  set(multiValueArgs SOURCES INCLUDE_DIRS DEPS)
  cmake_parse_arguments(MTLLIB "" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})

@ -26,6 +27,10 @@ macro(mlx_build_metallib)

  # Collect compile options
  set(MTLLIB_COMPILE_OPTIONS -Wall -Wextra -fno-fast-math -Wno-c++17-extensions)
+  if(MLX_METAL_DEBUG OR MTLLIB_DEBUG)
+    set(MTLLIB_COMPILE_OPTIONS ${MTLLIB_COMPILE_OPTIONS} -gline-tables-only
+                               -frecord-sources)
+  endif()

  # Prepare metallib build command
  add_custom_command(
--- a/docs/src/conf.py
+++ b/docs/src/conf.py
@ -10,7 +10,7 @@ import mlx.core as mx
 # -- Project information -----------------------------------------------------

 project = "MLX"
-copyright = "2023, MLX Contributors"
+copyright = "2023, Apple"
 author = "MLX Contributors"
 version = ".".join(mx.__version__.split(".")[:3])
 release = version
--- a/docs/src/dev/custom_metal_kernels.rst
+++ b/docs/src/dev/custom_metal_kernels.rst
@ -8,23 +8,26 @@ MLX supports writing custom Metal kernels through the Python and C++ APIs.
 Simple Example
 --------------

+.. currentmodule:: mlx.core
+
 Let's write a custom kernel that computes ``exp`` elementwise:

 .. code-block:: python

-  def exp_elementwise(a: mx.array):
-      source = """
-          uint elem = thread_position_in_grid.x;
-          T tmp = inp[elem];
-          out[elem] = metal::exp(tmp);
-      """
+  source = """
+      uint elem = thread_position_in_grid.x;
+      T tmp = inp[elem];
+      out[elem] = metal::exp(tmp);
+  """

-      kernel = mx.fast.metal_kernel(
-          name="myexp",
-          input_names=["inp"],
-          output_names=["out"],
-          source=source,
-      )
+  kernel = mx.fast.metal_kernel(
+      name="myexp",
+      input_names=["inp"],
+      output_names=["out"],
+      source=source,
+  )
+
+  def exp_elementwise(a: mx.array):
      outputs = kernel(
          inputs=[a],
          template=[("T", mx.float32)],
@ -39,8 +42,13 @@ Let's write a custom kernel that computes ``exp`` elementwise:
  b = exp_elementwise(a)
  assert mx.allclose(b, mx.exp(a))

+Every time you make a kernel, a new Metal library is created and possibly
+JIT compiled. To reduce the overhead from that, build the kernel once with
+:func:`fast.metal_kernel` and then use it many times.
+
 .. note::
-    We are only required to pass the body of the Metal kernel in ``source``.
+   Only pass the body of the Metal kernel in ``source``. The function
+   signature is generated automatically.

 The full function signature will be generated using:

@ -78,44 +86,51 @@ Putting this all together, the generated function signature for ``myexp`` is as

  template [[host_name("custom_kernel_myexp_float")]] [[kernel]] decltype(custom_kernel_myexp_float<float>) custom_kernel_myexp_float<float>;

-Note: ``grid`` and ``threadgroup`` are parameters to the Metal `dispatchThreads <https://developer.apple.com/documentation/metal/mtlcomputecommandencoder/2866532-dispatchthreads>`_ function.
-This means we will launch ``mx.prod(grid)`` threads, subdivided into ``threadgroup`` size threadgroups.
-For optimal performance, each thread group dimension should be less than or equal to the corresponding grid dimension.
+Note: ``grid`` and ``threadgroup`` are parameters to the Metal `dispatchThreads
+<https://developer.apple.com/documentation/metal/mtlcomputecommandencoder/2866532-dispatchthreads>`_
+function. This means we will launch ``mx.prod(grid)`` threads, subdivided into
+``threadgroup`` size threadgroups.  For optimal performance, each thread group
+dimension should be less than or equal to the corresponding grid dimension.

-Passing ``verbose=True`` to ``mx.fast.metal_kernel.__call__`` will print the generated code for debugging purposes.
+Passing ``verbose=True`` to :func:`fast.metal_kernel.__call__` will print the
+generated code for debugging purposes.

 Using Shape/Strides
 -------------------

-``mx.fast.metal_kernel`` supports an argument ``ensure_row_contiguous`` which is ``True`` by default.
-This will copy the ``mx.array`` inputs if needed before the kernel is launched to ensure that the memory layout is row contiguous.
-Generally this makes writing the kernel easier, since we don't have to worry about gaps or the ordering of the dims
-when indexing.
+:func:`fast.metal_kernel` supports an argument ``ensure_row_contiguous`` which
+is ``True`` by default. This will copy the array inputs if needed
+before the kernel is launched to ensure that the memory layout is row
+contiguous.  Generally this makes writing the kernel easier, since we don't
+have to worry about gaps or the ordering of the dims when indexing.

-If we want to avoid this copy, ``metal_kernel`` automatically passes ``a_shape``, ``a_strides`` and ``a_ndim`` for each
-input array ``a`` if any are present in ``source``.
-We can then use MLX's built in indexing utils to fetch the right elements for each thread.
+If we want to avoid this copy, :func:`fast.metal_kernel` automatically passes
+``a_shape``, ``a_strides`` and ``a_ndim`` for each input array ``a`` if any are
+present in ``source``. We can then use MLX's built in indexing utils to fetch
+the right elements for each thread.

-Let's convert ``myexp`` above to support arbitrarily strided arrays without relying on a copy from ``ensure_row_contiguous``:
+Let's convert ``myexp`` above to support arbitrarily strided arrays without
+relying on a copy from ``ensure_row_contiguous``:

 .. code-block:: python
+   
+  source = """
+      uint elem = thread_position_in_grid.x;
+      // Utils from `mlx/backend/metal/kernels/utils.h` are automatically included
+      uint loc = elem_to_loc(elem, inp_shape, inp_strides, inp_ndim);
+      T tmp = inp[loc];
+      // Output arrays are always row contiguous
+      out[elem] = metal::exp(tmp);
+  """
+
+  kernel = mx.fast.metal_kernel(
+      name="myexp_strided",
+      input_names=["inp"],
+      output_names=["out"],
+      source=source
+  )

  def exp_elementwise(a: mx.array):
-      source = """
-          uint elem = thread_position_in_grid.x;
-          // Utils from `mlx/backend/metal/kernels/utils.h` are automatically included
-          uint loc = elem_to_loc(elem, inp_shape, inp_strides, inp_ndim);
-          T tmp = inp[loc];
-          // Output arrays are always row contiguous
-          out[elem] = metal::exp(tmp);
-      """
-
-      kernel = mx.fast.metal_kernel(
-          name="myexp_strided",
-          input_names=["inp"],
-          output_names=["out"],
-          source=source
-      )
      outputs = kernel(
          inputs=[a],
          template=[("T", mx.float32)],
@ -142,137 +157,139 @@ We'll start with the following MLX implementation using standard ops:

 .. code-block:: python

-    def grid_sample_ref(x, grid):
-        N, H_in, W_in, _ = x.shape
-        ix = ((grid[..., 0] + 1) * W_in - 1) / 2
-        iy = ((grid[..., 1] + 1) * H_in - 1) / 2
+  def grid_sample_ref(x, grid):
+      N, H_in, W_in, _ = x.shape
+      ix = ((grid[..., 0] + 1) * W_in - 1) / 2
+      iy = ((grid[..., 1] + 1) * H_in - 1) / 2

-        ix_nw = mx.floor(ix).astype(mx.int32)
-        iy_nw = mx.floor(iy).astype(mx.int32)
+      ix_nw = mx.floor(ix).astype(mx.int32)
+      iy_nw = mx.floor(iy).astype(mx.int32)

-        ix_ne = ix_nw + 1
-        iy_ne = iy_nw
+      ix_ne = ix_nw + 1
+      iy_ne = iy_nw

-        ix_sw = ix_nw
-        iy_sw = iy_nw + 1
+      ix_sw = ix_nw
+      iy_sw = iy_nw + 1

-        ix_se = ix_nw + 1
-        iy_se = iy_nw + 1
+      ix_se = ix_nw + 1
+      iy_se = iy_nw + 1

-        nw = (ix_se - ix)    * (iy_se - iy)
-        ne = (ix    - ix_sw) * (iy_sw - iy)
-        sw = (ix_ne - ix)    * (iy    - iy_ne)
-        se = (ix    - ix_nw) * (iy    - iy_nw)
+      nw = (ix_se - ix)    * (iy_se - iy)
+      ne = (ix    - ix_sw) * (iy_sw - iy)
+      sw = (ix_ne - ix)    * (iy    - iy_ne)
+      se = (ix    - ix_nw) * (iy    - iy_nw)

-        I_nw = x[mx.arange(N)[:, None, None], iy_nw, ix_nw, :]
-        I_ne = x[mx.arange(N)[:, None, None], iy_ne, ix_ne, :]
-        I_sw = x[mx.arange(N)[:, None, None], iy_sw, ix_sw, :]
-        I_se = x[mx.arange(N)[:, None, None], iy_se, ix_se, :]
+      I_nw = x[mx.arange(N)[:, None, None], iy_nw, ix_nw, :]
+      I_ne = x[mx.arange(N)[:, None, None], iy_ne, ix_ne, :]
+      I_sw = x[mx.arange(N)[:, None, None], iy_sw, ix_sw, :]
+      I_se = x[mx.arange(N)[:, None, None], iy_se, ix_se, :]

-        mask_nw = (iy_nw >= 0) & (iy_nw <= H_in - 1) & (ix_nw >= 0) & (ix_nw <= W_in - 1)
-        mask_ne = (iy_ne >= 0) & (iy_ne <= H_in - 1) & (ix_ne >= 0) & (ix_ne <= W_in - 1)
-        mask_sw = (iy_sw >= 0) & (iy_sw <= H_in - 1) & (ix_sw >= 0) & (ix_sw <= W_in - 1)
-        mask_se = (iy_se >= 0) & (iy_se <= H_in - 1) & (ix_se >= 0) & (ix_se <= W_in - 1)
+      mask_nw = (iy_nw >= 0) & (iy_nw <= H_in - 1) & (ix_nw >= 0) & (ix_nw <= W_in - 1)
+      mask_ne = (iy_ne >= 0) & (iy_ne <= H_in - 1) & (ix_ne >= 0) & (ix_ne <= W_in - 1)
+      mask_sw = (iy_sw >= 0) & (iy_sw <= H_in - 1) & (ix_sw >= 0) & (ix_sw <= W_in - 1)
+      mask_se = (iy_se >= 0) & (iy_se <= H_in - 1) & (ix_se >= 0) & (ix_se <= W_in - 1)

-        I_nw *= mask_nw[..., None]
-        I_ne *= mask_ne[..., None]
-        I_sw *= mask_sw[..., None]
-        I_se *= mask_se[..., None]
+      I_nw *= mask_nw[..., None]
+      I_ne *= mask_ne[..., None]
+      I_sw *= mask_sw[..., None]
+      I_se *= mask_se[..., None]

-        output = nw[..., None] * I_nw + ne[..., None] * I_ne + sw[..., None] * I_sw + se[..., None] * I_se
+      output = nw[..., None] * I_nw + ne[..., None] * I_ne + sw[..., None] * I_sw + se[..., None] * I_se

-        return output
+      return output

-Now let's use ``mx.custom_function`` together with ``mx.fast.metal_kernel``
+Now let's use :func:`custom_function` together with :func:`fast.metal_kernel`
 to write a fast GPU kernel for both the forward and backward passes.

 First we'll implement the forward pass as a fused kernel:

 .. code-block:: python

-    @mx.custom_function
-    def grid_sample(x, grid):
+  source = """
+      uint elem = thread_position_in_grid.x;
+      int H = x_shape[1];
+      int W = x_shape[2];
+      int C = x_shape[3];
+      int gH = grid_shape[1];
+      int gW = grid_shape[2];

-        assert x.ndim == 4, "`x` must be 4D."
-        assert grid.ndim == 4, "`grid` must be 4D."
+      int w_stride = C;
+      int h_stride = W * w_stride;
+      int b_stride = H * h_stride;

-        B, _, _, C = x.shape
-        _, gN, gM, D = grid.shape
-        out_shape = (B, gN, gM, C)
+      uint grid_idx = elem / C * 2;
+      float ix = ((grid[grid_idx] + 1) * W - 1) / 2;
+      float iy = ((grid[grid_idx + 1] + 1) * H - 1) / 2;

-        assert D == 2, "Last dim of `grid` must be size 2."
+      int ix_nw = floor(ix);
+      int iy_nw = floor(iy);

-        source = """
-            uint elem = thread_position_in_grid.x;
-            int H = x_shape[1];
-            int W = x_shape[2];
-            int C = x_shape[3];
-            int gH = grid_shape[1];
-            int gW = grid_shape[2];
+      int ix_ne = ix_nw + 1;
+      int iy_ne = iy_nw;

-            int w_stride = C;
-            int h_stride = W * w_stride;
-            int b_stride = H * h_stride;
+      int ix_sw = ix_nw;
+      int iy_sw = iy_nw + 1;

-            uint grid_idx = elem / C * 2;
-            float ix = ((grid[grid_idx] + 1) * W - 1) / 2;
-            float iy = ((grid[grid_idx + 1] + 1) * H - 1) / 2;
+      int ix_se = ix_nw + 1;
+      int iy_se = iy_nw + 1;

-            int ix_nw = floor(ix);
-            int iy_nw = floor(iy);
+      T nw = (ix_se - ix)    * (iy_se - iy);
+      T ne = (ix    - ix_sw) * (iy_sw - iy);
+      T sw = (ix_ne - ix)    * (iy    - iy_ne);
+      T se = (ix    - ix_nw) * (iy    - iy_nw);

-            int ix_ne = ix_nw + 1;
-            int iy_ne = iy_nw;
+      int batch_idx = elem / C / gH / gW * b_stride;
+      int channel_idx = elem % C;
+      int base_idx = batch_idx + channel_idx;

-            int ix_sw = ix_nw;
-            int iy_sw = iy_nw + 1;
+      T I_nw = x[base_idx + iy_nw * h_stride + ix_nw * w_stride];
+      T I_ne = x[base_idx + iy_ne * h_stride + ix_ne * w_stride];
+      T I_sw = x[base_idx + iy_sw * h_stride + ix_sw * w_stride];
+      T I_se = x[base_idx + iy_se * h_stride + ix_se * w_stride];

-            int ix_se = ix_nw + 1;
-            int iy_se = iy_nw + 1;
+      I_nw = iy_nw >= 0 && iy_nw <= H - 1 && ix_nw >= 0 && ix_nw <= W - 1 ? I_nw : 0;
+      I_ne = iy_ne >= 0 && iy_ne <= H - 1 && ix_ne >= 0 && ix_ne <= W - 1 ? I_ne : 0;
+      I_sw = iy_sw >= 0 && iy_sw <= H - 1 && ix_sw >= 0 && ix_sw <= W - 1 ? I_sw : 0;
+      I_se = iy_se >= 0 && iy_se <= H - 1 && ix_se >= 0 && ix_se <= W - 1 ? I_se : 0;

-            T nw = (ix_se - ix)    * (iy_se - iy);
-            T ne = (ix    - ix_sw) * (iy_sw - iy);
-            T sw = (ix_ne - ix)    * (iy    - iy_ne);
-            T se = (ix    - ix_nw) * (iy    - iy_nw);
+      out[elem] = nw * I_nw + ne * I_ne + sw * I_sw + se * I_se;
+  """

-            int batch_idx = elem / C / gH / gW * b_stride;
-            int channel_idx = elem % C;
-            int base_idx = batch_idx + channel_idx;
+  kernel = mx.fast.metal_kernel(
+      name="grid_sample",
+      input_names=["x", "grid"],
+      output_names=["out"],
+      source=source,
+  )

-            T I_nw = x[base_idx + iy_nw * h_stride + ix_nw * w_stride];
-            T I_ne = x[base_idx + iy_ne * h_stride + ix_ne * w_stride];
-            T I_sw = x[base_idx + iy_sw * h_stride + ix_sw * w_stride];
-            T I_se = x[base_idx + iy_se * h_stride + ix_se * w_stride];
+  @mx.custom_function
+  def grid_sample(x, grid):

-            I_nw = iy_nw >= 0 && iy_nw <= H - 1 && ix_nw >= 0 && ix_nw <= W - 1 ? I_nw : 0;
-            I_ne = iy_ne >= 0 && iy_ne <= H - 1 && ix_ne >= 0 && ix_ne <= W - 1 ? I_ne : 0;
-            I_sw = iy_sw >= 0 && iy_sw <= H - 1 && ix_sw >= 0 && ix_sw <= W - 1 ? I_sw : 0;
-            I_se = iy_se >= 0 && iy_se <= H - 1 && ix_se >= 0 && ix_se <= W - 1 ? I_se : 0;
+      assert x.ndim == 4, "`x` must be 4D."
+      assert grid.ndim == 4, "`grid` must be 4D."

-            out[elem] = nw * I_nw + ne * I_ne + sw * I_sw + se * I_se;
-        """
-        kernel = mx.fast.metal_kernel(
-            name="grid_sample",
-            input_names=["x", "grid"],
-            output_names=["out"],
-            source=source,
-        )
-        outputs = kernel(
-            inputs=[x, grid],
-            template=[("T", x.dtype)],
-            output_shapes=[out_shape],
-            output_dtypes=[x.dtype],
-            grid=(np.prod(out_shape), 1, 1),
-            threadgroup=(256, 1, 1),
-        )
-        return outputs[0]
+      B, _, _, C = x.shape
+      _, gN, gM, D = grid.shape
+      out_shape = (B, gN, gM, C)
+
+      assert D == 2, "Last dim of `grid` must be size 2."
+
+      outputs = kernel(
+          inputs=[x, grid],
+          template=[("T", x.dtype)],
+          output_shapes=[out_shape],
+          output_dtypes=[x.dtype],
+          grid=(np.prod(out_shape), 1, 1),
+          threadgroup=(256, 1, 1),
+      )
+      return outputs[0]

 For a reasonably sized input such as:

 .. code-block:: python

-    x.shape = (8, 1024, 1024, 64)
-    grid.shape = (8, 256, 256, 2)
+  x.shape = (8, 1024, 1024, 64)
+  grid.shape = (8, 256, 256, 2)

 On an M1 Max, we see a big performance improvement:

@ -281,11 +298,11 @@ On an M1 Max, we see a big performance improvement:
 Grid Sample VJP
 ---------------

-Since we decorated ``grid_sample`` with ``mx.custom_function``, we can now define
-its custom vjp transform so MLX can differentiate it.
+Since we decorated ``grid_sample`` with :func:`custom_function`, we can now
+define its custom vjp transform so MLX can differentiate it.

 The backwards pass requires atomically updating ``x_grad``/``grid_grad`` and so
-requires a few extra ``mx.fast.metal_kernel`` features:
+requires a few extra :func:`fast.metal_kernel` features:

 * ``init_value=0``
    Initialize all of the kernel's outputs to this value before it runs. This allows us to update only part of the output arrays with the kernel.
@ -299,128 +316,129 @@ We can then implement the backwards pass as follows:

 .. code-block:: python

-    @grid_sample.vjp
-    def grid_sample_vjp(primals, cotangent, _):
-        x, grid = primals
-        B, _, _, C = x.shape
-        _, gN, gM, D = grid.shape
+  source = """
+      uint elem = thread_position_in_grid.x;
+      int H = x_shape[1];
+      int W = x_shape[2];
+      int C = x_shape[3];
+      // Pad C to the nearest larger simdgroup size multiple
+      int C_padded = ceildiv(C, threads_per_simdgroup) * threads_per_simdgroup;

-        assert D == 2, "Last dim of `grid` must be size 2."
+      int gH = grid_shape[1];
+      int gW = grid_shape[2];

-        source = """
-            uint elem = thread_position_in_grid.x;
-            int H = x_shape[1];
-            int W = x_shape[2];
-            int C = x_shape[3];
-            // Pad C to the nearest larger simdgroup size multiple
-            int C_padded = ceildiv(C, threads_per_simdgroup) * threads_per_simdgroup;
+      int w_stride = C;
+      int h_stride = W * w_stride;
+      int b_stride = H * h_stride;

-            int gH = grid_shape[1];
-            int gW = grid_shape[2];
+      uint grid_idx = elem / C_padded * 2;
+      float ix = ((grid[grid_idx] + 1) * W - 1) / 2;
+      float iy = ((grid[grid_idx + 1] + 1) * H - 1) / 2;

-            int w_stride = C;
-            int h_stride = W * w_stride;
-            int b_stride = H * h_stride;
+      int ix_nw = floor(ix);
+      int iy_nw = floor(iy);

-            uint grid_idx = elem / C_padded * 2;
-            float ix = ((grid[grid_idx] + 1) * W - 1) / 2;
-            float iy = ((grid[grid_idx + 1] + 1) * H - 1) / 2;
+      int ix_ne = ix_nw + 1;
+      int iy_ne = iy_nw;

-            int ix_nw = floor(ix);
-            int iy_nw = floor(iy);
+      int ix_sw = ix_nw;
+      int iy_sw = iy_nw + 1;

-            int ix_ne = ix_nw + 1;
-            int iy_ne = iy_nw;
+      int ix_se = ix_nw + 1;
+      int iy_se = iy_nw + 1;

-            int ix_sw = ix_nw;
-            int iy_sw = iy_nw + 1;
+      T nw = (ix_se - ix)    * (iy_se - iy);
+      T ne = (ix    - ix_sw) * (iy_sw - iy);
+      T sw = (ix_ne - ix)    * (iy    - iy_ne);
+      T se = (ix    - ix_nw) * (iy    - iy_nw);

-            int ix_se = ix_nw + 1;
-            int iy_se = iy_nw + 1;
+      int batch_idx = elem / C_padded / gH / gW * b_stride;
+      int channel_idx = elem % C_padded;
+      int base_idx = batch_idx + channel_idx;

-            T nw = (ix_se - ix)    * (iy_se - iy);
-            T ne = (ix    - ix_sw) * (iy_sw - iy);
-            T sw = (ix_ne - ix)    * (iy    - iy_ne);
-            T se = (ix    - ix_nw) * (iy    - iy_nw);
+      T gix = T(0);
+      T giy = T(0);
+      if (channel_idx < C) {
+          int cot_index = elem / C_padded * C + channel_idx;
+          T cot = cotangent[cot_index];
+          if (iy_nw >= 0 && iy_nw <= H - 1 && ix_nw >= 0 && ix_nw <= W - 1) {
+              int offset = base_idx + iy_nw * h_stride + ix_nw * w_stride;
+              atomic_fetch_add_explicit(&x_grad[offset], nw * cot, memory_order_relaxed);

-            int batch_idx = elem / C_padded / gH / gW * b_stride;
-            int channel_idx = elem % C_padded;
-            int base_idx = batch_idx + channel_idx;
+              T I_nw = x[offset];
+              gix -= I_nw * (iy_se - iy) * cot;
+              giy -= I_nw * (ix_se - ix) * cot;
+          }
+          if (iy_ne >= 0 && iy_ne <= H - 1 && ix_ne >= 0 && ix_ne <= W - 1) {
+              int offset = base_idx + iy_ne * h_stride + ix_ne * w_stride;
+              atomic_fetch_add_explicit(&x_grad[offset], ne * cot, memory_order_relaxed);

-            T gix = T(0);
-            T giy = T(0);
-            if (channel_idx < C) {
-                int cot_index = elem / C_padded * C + channel_idx;
-                T cot = cotangent[cot_index];
-                if (iy_nw >= 0 && iy_nw <= H - 1 && ix_nw >= 0 && ix_nw <= W - 1) {
-                    int offset = base_idx + iy_nw * h_stride + ix_nw * w_stride;
-                    atomic_fetch_add_explicit(&x_grad[offset], nw * cot, memory_order_relaxed);
+              T I_ne = x[offset];
+              gix += I_ne * (iy_sw - iy) * cot;
+              giy -= I_ne * (ix - ix_sw) * cot;
+          }
+          if (iy_sw >= 0 && iy_sw <= H - 1 && ix_sw >= 0 && ix_sw <= W - 1) {
+              int offset = base_idx + iy_sw * h_stride + ix_sw * w_stride;
+              atomic_fetch_add_explicit(&x_grad[offset], sw * cot, memory_order_relaxed);

-                    T I_nw = x[offset];
-                    gix -= I_nw * (iy_se - iy) * cot;
-                    giy -= I_nw * (ix_se - ix) * cot;
-                }
-                if (iy_ne >= 0 && iy_ne <= H - 1 && ix_ne >= 0 && ix_ne <= W - 1) {
-                    int offset = base_idx + iy_ne * h_stride + ix_ne * w_stride;
-                    atomic_fetch_add_explicit(&x_grad[offset], ne * cot, memory_order_relaxed);
+              T I_sw = x[offset];
+              gix -= I_sw * (iy - iy_ne) * cot;
+              giy += I_sw * (ix_ne - ix) * cot;
+          }
+          if (iy_se >= 0 && iy_se <= H - 1 && ix_se >= 0 && ix_se <= W - 1) {
+              int offset = base_idx + iy_se * h_stride + ix_se * w_stride;
+              atomic_fetch_add_explicit(&x_grad[offset], se * cot, memory_order_relaxed);

-                    T I_ne = x[offset];
-                    gix += I_ne * (iy_sw - iy) * cot;
-                    giy -= I_ne * (ix - ix_sw) * cot;
-                }
-                if (iy_sw >= 0 && iy_sw <= H - 1 && ix_sw >= 0 && ix_sw <= W - 1) {
-                    int offset = base_idx + iy_sw * h_stride + ix_sw * w_stride;
-                    atomic_fetch_add_explicit(&x_grad[offset], sw * cot, memory_order_relaxed);
+              T I_se = x[offset];
+              gix += I_se * (iy - iy_nw) * cot;
+              giy += I_se * (ix - ix_nw) * cot;
+          }
+      }

-                    T I_sw = x[offset];
-                    gix -= I_sw * (iy - iy_ne) * cot;
-                    giy += I_sw * (ix_ne - ix) * cot;
-                }
-                if (iy_se >= 0 && iy_se <= H - 1 && ix_se >= 0 && ix_se <= W - 1) {
-                    int offset = base_idx + iy_se * h_stride + ix_se * w_stride;
-                    atomic_fetch_add_explicit(&x_grad[offset], se * cot, memory_order_relaxed);
+      T gix_mult = W / 2;
+      T giy_mult = H / 2;

-                    T I_se = x[offset];
-                    gix += I_se * (iy - iy_nw) * cot;
-                    giy += I_se * (ix - ix_nw) * cot;
-                }
-            }
+      // Reduce across each simdgroup first.
+      // This is much faster than relying purely on atomics.
+      gix = simd_sum(gix);
+      giy = simd_sum(giy);

-            T gix_mult = W / 2;
-            T giy_mult = H / 2;
+      if (thread_index_in_simdgroup == 0) {
+          atomic_fetch_add_explicit(&grid_grad[grid_idx], gix * gix_mult, memory_order_relaxed);
+          atomic_fetch_add_explicit(&grid_grad[grid_idx + 1], giy * giy_mult, memory_order_relaxed);
+      }
+  """
+  kernel = mx.fast.metal_kernel(
+      name="grid_sample_grad",
+      input_names=["x", "grid", "cotangent"],
+      output_names=["x_grad", "grid_grad"],
+      source=source,
+      atomic_outputs=True,
+  )

-            // Reduce across each simdgroup first.
-            // This is much faster than relying purely on atomics.
-            gix = simd_sum(gix);
-            giy = simd_sum(giy);
+  @grid_sample.vjp
+  def grid_sample_vjp(primals, cotangent, _):
+      x, grid = primals
+      B, _, _, C = x.shape
+      _, gN, gM, D = grid.shape

-            if (thread_index_in_simdgroup == 0) {
-                atomic_fetch_add_explicit(&grid_grad[grid_idx], gix * gix_mult, memory_order_relaxed);
-                atomic_fetch_add_explicit(&grid_grad[grid_idx + 1], giy * giy_mult, memory_order_relaxed);
-            }
-        """
-        kernel = mx.fast.metal_kernel(
-            name="grid_sample_grad",
-            input_names=["x", "grid", "cotangent"],
-            output_names=["x_grad", "grid_grad"],
-            source=source,
-            atomic_outputs=True,
-        )
-        # pad the output channels to simd group size
-        # so that our `simd_sum`s don't overlap.
-        simdgroup_size = 32
-        C_padded = (C + simdgroup_size - 1) // simdgroup_size * simdgroup_size
-        grid_size = B * gN * gM * C_padded
-        outputs = kernel(
-            inputs=[x, grid, cotangent],
-            template=[("T", x.dtype)],
-            output_shapes=[x.shape, grid.shape],
-            output_dtypes=[x.dtype, x.dtype],
-            grid=(grid_size, 1, 1),
-            threadgroup=(256, 1, 1),
-            init_value=0,
-        )
-        return outputs[0], outputs[1]
+      assert D == 2, "Last dim of `grid` must be size 2."
+
+      # pad the output channels to simd group size
+      # so that our `simd_sum`s don't overlap.
+      simdgroup_size = 32
+      C_padded = (C + simdgroup_size - 1) // simdgroup_size * simdgroup_size
+      grid_size = B * gN * gM * C_padded
+      outputs = kernel(
+          inputs=[x, grid, cotangent],
+          template=[("T", x.dtype)],
+          output_shapes=[x.shape, grid.shape],
+          output_dtypes=[x.dtype, x.dtype],
+          grid=(grid_size, 1, 1),
+          threadgroup=(256, 1, 1),
+          init_value=0,
+      )
+      return outputs[0], outputs[1]

 There's an even larger speed up for the vjp:

--- a/docs/src/dev/extensions.rst
+++ b/docs/src/dev/extensions.rst
@ -397,11 +397,11 @@ below.
        std::ostringstream kname;
        kname << "axpby_" << "general_" << type_to_name(out);

-        // Make sure the metal library is available
-        d.register_library("mlx_ext");
+        // Load the metal library
+        auto lib = d.get_library("mlx_ext");

        // Make a kernel from this metal library
-        auto kernel = d.get_kernel(kname.str(), "mlx_ext");
+        auto kernel = d.get_kernel(kname.str(), lib);

        // Prepare to encode kernel
        auto& compute_encoder = d.get_command_encoder(s.index);
--- a/docs/src/install.rst
+++ b/docs/src/install.rst
@ -30,6 +30,16 @@ MLX is also available on conda-forge. To install MLX with conda do:

   conda install conda-forge::mlx

+CUDA
+^^^^
+
+MLX has a CUDA backend which you can use on any Linux platform with CUDA 12
+and SM 7.0 (Volta) and up. To install MLX with CUDA support, run:
+
+.. code-block:: shell
+
+    pip install mlx-cuda
+

 Troubleshooting
 ^^^^^^^^^^^^^^^
@ -65,6 +75,8 @@ Build Requirements
 Python API
 ^^^^^^^^^^

+.. _python install:
+
 To build and install the MLX python library from source, first, clone MLX from
 `its GitHub repo <https://github.com/ml-explore/mlx>`_:

@ -107,6 +119,8 @@ IDE:
 C++ API
 ^^^^^^^

+.. _cpp install:
+
 Currently, MLX must be built and installed from source.

 Similarly to the python library, to build and install the MLX C++ library start
@ -185,6 +199,7 @@ should point to the path to the built metal library.

      xcrun -sdk macosx --show-sdk-version

+
 Binary Size Minimization
 ~~~~~~~~~~~~~~~~~~~~~~~~

@ -213,6 +228,50 @@ be anwywhere from a few hundred millisecond to a few seconds depending on the
 application. Once a kernel is compiled, it will be cached by the system. The
 Metal kernel cache persists across reboots.

+Linux
+^^^^^
+
+To build from source on Linux (CPU only), install the BLAS and LAPACK headers.
+For example on Ubuntu, run the following:
+
+.. code-block:: shell
+
+   apt-get update -y
+   apt-get install libblas-dev liblapack-dev liblapacke-dev -y
+
+From here follow the instructions to install either the :ref:`Python <python
+install>` or :ref:`C++ <cpp install>` APIs.
+
+CUDA
+^^^^
+
+To build from source on Linux with CUDA, install the BLAS and LAPACK headers
+and the CUDA toolkit. For example on Ubuntu, run the following:
+
+.. code-block:: shell
+
+   wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
+   dpkg -i cuda-keyring_1.1-1_all.deb
+   apt-get update -y
+   apt-get -y install cuda-toolkit-12-9
+   apt-get install libblas-dev liblapack-dev liblapacke-dev -y
+
+
+When building either the Python or C++ APIs make sure to pass the cmake flag
+``MLX_BUILD_CUDA=ON``. For example, to build the Python API run:
+
+.. code-block:: shell
+
+  CMAKE_BUILD_PARALLEL_LEVEL=8 CMAKE_ARGS="-DMLX_BUILD_CUDA=ON" pip install -e ".[dev]"
+
+To build the C++ package run:
+
+.. code-block:: shell
+
+   mkdir -p build && cd build
+   cmake .. -DMLX_BUILD_CUDA=ON && make -j
+
+
 Troubleshooting
 ^^^^^^^^^^^^^^^

--- a/docs/src/python/array.rst
+++ b/docs/src/python/array.rst
@ -19,6 +19,8 @@ Array
    array.ndim
    array.shape
    array.size
+    array.real
+    array.imag
    array.abs
    array.all
    array.any
@ -38,6 +40,7 @@ Array
    array.log10
    array.log1p
    array.log2
+    array.logcumsumexp
    array.logsumexp
    array.max
    array.mean
--- a/docs/src/python/fft.rst
+++ b/docs/src/python/fft.rst
@ -20,3 +20,5 @@ FFT
  irfft2
  rfftn
  irfftn
+  fftshift
+  ifftshift
--- a/docs/src/python/linalg.rst
+++ b/docs/src/python/linalg.rst
@ -16,6 +16,8 @@ Linear Algebra
    cross
    qr
    svd
+    eigvals
+    eig
    eigvalsh
    eigh
    lu
--- a/docs/src/python/ops.rst
+++ b/docs/src/python/ops.rst
@ -103,6 +103,7 @@ Operations
   log10
   log1p
   logaddexp
+   logcumsumexp
   logical_not
   logical_and
   logical_or
--- a/docs/src/python/optimizers/common_optimizers.rst
+++ b/docs/src/python/optimizers/common_optimizers.rst
@ -18,3 +18,4 @@ Common Optimizers
   AdamW
   Adamax
   Lion
+   MultiOptimizer
--- a/docs/src/usage/indexing.rst
+++ b/docs/src/usage/indexing.rst
@ -107,6 +107,16 @@ same array:
  >>> a
  array([1, 2, 0], dtype=int32)

+
+Note, unlike NumPy, updates to the same location are nondeterministic:
+
+.. code-block:: shell
+
+  >>> a = mx.array([1, 2, 3])
+  >>> a[[0, 0]] = mx.array([4, 5])
+
+The first element of ``a`` could be ``4`` or ``5``.
+
 Transformations of functions which use in-place updates are allowed and work as
 expected. For example:

--- a/examples/extensions/axpby/axpby.cpp
+++ b/examples/extensions/axpby/axpby.cpp
@ -172,11 +172,11 @@ void Axpby::eval_gpu(
  kname << (contiguous_kernel ? "contiguous_" : "general_");
  kname << type_to_name(out);

-  // Make sure the metal library is available
-  d.register_library("mlx_ext");
+  // Load the metal library
+  auto lib = d.get_library("mlx_ext");

  // Make a kernel from this metal library
-  auto kernel = d.get_kernel(kname.str(), "mlx_ext");
+  auto kernel = d.get_kernel(kname.str(), lib);

  // Prepare to encode kernel
  auto& compute_encoder = d.get_command_encoder(s.index);
--- a/mlx/CMakeLists.txt
+++ b/mlx/CMakeLists.txt
@ -5,6 +5,7 @@ target_sources(
          ${CMAKE_CURRENT_SOURCE_DIR}/compile.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/device.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/dtype.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/dtype_utils.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/export.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/einsum.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/fast.cpp
@ -20,7 +21,7 @@ target_sources(
          ${CMAKE_CURRENT_SOURCE_DIR}/backend/metal/metal.h)

 # Define MLX_VERSION only in the version.cpp file.
-add_library(mlx_version STATIC ${CMAKE_CURRENT_SOURCE_DIR}/version.cpp)
+add_library(mlx_version OBJECT ${CMAKE_CURRENT_SOURCE_DIR}/version.cpp)
 target_compile_definitions(mlx_version PRIVATE MLX_VERSION="${MLX_VERSION}")
 target_link_libraries(mlx PRIVATE $<BUILD_INTERFACE:mlx_version>)

@ -48,5 +49,19 @@ add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/io)
 if(MLX_BUILD_METAL)
  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backend/metal)
 else()
-  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backend/no_metal)
+  target_sources(mlx
+                 PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/backend/metal/no_metal.cpp)
+endif()
+
+if(MLX_BUILD_CUDA)
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backend/cuda)
+else()
+  target_sources(mlx
+                 PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/backend/cuda/no_cuda.cpp)
+endif()
+
+if(MLX_BUILD_METAL OR MLX_BUILD_CUDA)
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backend/gpu)
+else()
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backend/no_gpu)
 endif()
--- a/mlx/array.h
+++ b/mlx/array.h
@ -224,6 +224,10 @@ class array {
    // Not copyable
    Data(const Data& d) = delete;
    Data& operator=(const Data& d) = delete;
+    Data(Data&& o) : buffer(o.buffer), d(o.d) { // Move: take o's buffer and deleter.
+      o.buffer = allocator::Buffer(nullptr); // Source is left holding a null buffer...
+      o.d = [](allocator::Buffer) {}; // ...and a no-op deleter, so its ~Data() does nothing.
+    }
    ~Data() {
      d(buffer);
    }
@ -339,11 +343,11 @@ class array {
    return allocator::allocator().size(buffer());
  }

-  // Return a copy of the shared pointer
-  // to the array::Data struct
-  std::shared_ptr<Data> data_shared_ptr() const {
+  // Return the shared pointer to the array::Data struct
+  const std::shared_ptr<Data>& data_shared_ptr() const {
    return array_desc_->data;
  }
+
  // Return a raw pointer to the arrays data
  template <typename T>
  T* data() {
@ -356,7 +360,7 @@ class array {
  }

  enum Status {
-    // The ouptut of a computation which has not been scheduled.
+    // The output of a computation which has not been scheduled.
    // For example, the status of `x` in `auto x = a + b`.
    unscheduled,

--- a/mlx/backend/common/CMakeLists.txt
+++ b/mlx/backend/common/CMakeLists.txt
@ -1,6 +1,7 @@
 target_sources(
  mlx
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/compiled.cpp
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/broadcasting.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/compiled.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/common.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/load.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/reduce.cpp
--- a/mlx/backend/common/broadcasting.cpp
+++ b/mlx/backend/common/broadcasting.cpp
@ -0,0 +1,24 @@
+// Copyright © 2024 Apple Inc.
+
+#include "mlx/backend/common/utils.h"
+
+namespace mlx::core {
+
+void broadcast(const array& in, array& out) { // Give `out` broadcast strides over `in`'s data.
+  if (out.size() == 0) { // Empty output: set null data and stop.
+    out.set_data(nullptr);
+    return;
+  }
+  Strides strides(out.ndim(), 0); // Entries not assigned below stay 0.
+  int diff = out.ndim() - in.ndim(); // Offset that right-aligns `in`'s dims within `out`.
+  for (int i = in.ndim() - 1; i >= 0; --i) {
+    strides[i + diff] = (in.shape()[i] == 1) ? 0 : in.strides()[i]; // Size-1 dims get stride 0.
+  }
+  auto flags = in.flags();
+  if (out.size() > in.size()) { // More output elements than input: drop contiguity flags.
+    flags.row_contiguous = flags.col_contiguous = false;
+  }
+  out.copy_shared_buffer(in, strides, flags, in.data_size()); // Presumably shares `in`'s buffer; confirm in array.h.
+}
+
+} // namespace mlx::core
--- a/mlx/backend/common/broadcasting.h
+++ b/mlx/backend/common/broadcasting.h
@ -0,0 +1,11 @@
+// Copyright © 2024 Apple Inc.
+
+#pragma once
+
+#include "mlx/array.h"
+
+namespace mlx::core {
+
+void broadcast(const array& in, array& out);
+
+} // namespace mlx::core
--- a/mlx/backend/common/buffer_cache.h
+++ b/mlx/backend/common/buffer_cache.h
@ -0,0 +1,157 @@
+// Copyright © 2025 Apple Inc.
+
+#pragma once
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <functional>
+#include <map>
+
+namespace mlx::core {
+
+// A cache of released buffers of type T that can be handed out again.
+//
+// Cached buffers are tracked in two structures:
+//   - `buffer_pool_` orders them by size (as reported by `get_size`) so a
+//     close-enough buffer can be found for a request.
+//   - A doubly linked list orders them by recycle time, with the most
+//     recently recycled buffer at `head_` and the oldest at `tail_`, so the
+//     oldest buffers are released first.
+//
+// The cache never allocates buffers itself; it only stores pointers given to
+// `recycle_to_cache()` and releases them through the `free` callback.
+// NOTE(review): no locking is done here, so callers presumably serialize
+// access themselves -- confirm against the users of this class.
+template <typename T>
+class BufferCache {
+ public:
+  // page_size: slack unit used when matching request sizes in
+  //            reuse_from_cache().
+  // get_size:  returns the size in bytes of a buffer.
+  // free:      releases a buffer that is evicted from the cache.
+  BufferCache(
+      size_t page_size,
+      std::function<size_t(T*)> get_size,
+      std::function<void(T*)> free)
+      : page_size_(page_size),
+        get_size_(std::move(get_size)),
+        free_(std::move(free)) {}
+
+  // Frees every buffer that is still cached.
+  ~BufferCache() {
+    clear();
+  }
+
+  BufferCache(const BufferCache&) = delete;
+  BufferCache& operator=(const BufferCache&) = delete;
+
+  // Returns the smallest cached buffer whose size is at least `size`, as long
+  // as that size is below min(2 * size, size + 2 * page_size). Returns
+  // nullptr when no such buffer exists. The returned buffer is removed from
+  // the cache.
+  T* reuse_from_cache(size_t size) {
+    // Find the closest buffer in pool.
+    auto it = buffer_pool_.lower_bound(size);
+    if (it == buffer_pool_.end() ||
+        it->first >= std::min(2 * size, size + 2 * page_size_)) {
+      return nullptr;
+    }
+
+    // Collect from the cache.
+    T* buf = it->second->buf;
+    pool_size_ -= it->first;
+
+    // Remove from record. remove_from_list() also deletes the holder.
+    remove_from_list(it->second);
+    buffer_pool_.erase(it);
+    return buf;
+  }
+
+  // Adds `buf` (must not be null) to the cache as the most recent entry.
+  void recycle_to_cache(T* buf) {
+    assert(buf);
+    // Add to cache.
+    BufferHolder* bh = new BufferHolder(buf);
+    add_at_head(bh);
+    size_t size = get_size_(buf);
+    pool_size_ += size;
+    buffer_pool_.emplace(size, bh);
+  }
+
+  // Frees cached buffers, oldest first, until at least `min_bytes_to_free`
+  // bytes have been released or the cache is empty. If the request is at
+  // least 90% of the cached bytes the whole cache is cleared. Returns the
+  // number of buffers freed.
+  int release_cached_buffers(size_t min_bytes_to_free) {
+    if (min_bytes_to_free >= 0.9 * pool_size_) {
+      return clear();
+    } else {
+      int n_release = 0;
+      size_t total_bytes_freed = 0;
+
+      while (tail_ && (total_bytes_freed < min_bytes_to_free)) {
+        // Release buffer. The size is read before the buffer is freed.
+        size_t size = get_size_(tail_->buf);
+        total_bytes_freed += size;
+        free_(tail_->buf);
+        n_release++;
+
+        // Remove from record.
+        auto [first, last] = buffer_pool_.equal_range(size);
+        auto it = std::find_if(first, last, [this](const auto& el) {
+          return el.second == tail_;
+        });
+        // find_if signals "not found" with the end of the searched range,
+        // which is `last` and not necessarily buffer_pool_.end().
+        assert(it != last);
+        if (it != last) {
+          buffer_pool_.erase(it);
+        }
+        remove_from_list(tail_);
+      }
+
+      pool_size_ -= total_bytes_freed;
+      return n_release;
+    }
+  }
+
+  // Frees every cached buffer and returns how many were freed.
+  int clear() {
+    int n_release = 0;
+    for (auto& [size, holder] : buffer_pool_) {
+      free_(holder->buf);
+      n_release++;
+      delete holder;
+    }
+    buffer_pool_.clear();
+    pool_size_ = 0;
+    head_ = nullptr;
+    tail_ = nullptr;
+    return n_release;
+  }
+
+  // Total size in bytes of the buffers currently cached.
+  size_t cache_size() const {
+    return pool_size_;
+  }
+
+  size_t page_size() const {
+    return page_size_;
+  }
+
+ private:
+  // Node of the recency list wrapping one cached buffer.
+  struct BufferHolder {
+    explicit BufferHolder(T* buf_) : buf(buf_) {}
+
+    BufferHolder* prev{nullptr};
+    BufferHolder* next{nullptr};
+    T* buf;
+  };
+
+  // Links `to_add` in front of the current head.
+  void add_at_head(BufferHolder* to_add) {
+    if (!head_) {
+      head_ = to_add;
+      tail_ = to_add;
+    } else {
+      head_->prev = to_add;
+      to_add->next = head_;
+      head_ = to_add;
+    }
+  }
+
+  // Unlinks `to_remove` from the recency list and deletes it.
+  void remove_from_list(BufferHolder* to_remove) {
+    if (to_remove->prev && to_remove->next) { // if middle
+      to_remove->prev->next = to_remove->next;
+      to_remove->next->prev = to_remove->prev;
+    } else if (to_remove->prev && to_remove == tail_) { // if tail
+      tail_ = to_remove->prev;
+      tail_->next = nullptr;
+    } else if (to_remove == head_ && to_remove->next) { // if head
+      head_ = to_remove->next;
+      head_->prev = nullptr;
+    } else if (to_remove == head_ && to_remove == tail_) { // if only element
+      head_ = nullptr;
+      tail_ = nullptr;
+    }
+
+    delete to_remove;
+  }
+
+  // Cached buffers keyed by size; several buffers may share a size.
+  std::multimap<size_t, BufferHolder*> buffer_pool_;
+  BufferHolder* head_{nullptr};
+  BufferHolder* tail_{nullptr};
+  size_t pool_size_{0};
+
+  const size_t page_size_;
+  std::function<size_t(T*)> get_size_;
+  std::function<void(T*)> free_;
+};
+
+} // namespace mlx::core
--- a/mlx/backend/common/common.cpp
+++ b/mlx/backend/common/common.cpp
@ -1,6 +1,7 @@
 // Copyright © 2024 Apple Inc.
 #include <cassert>

+#include "mlx/backend/common/broadcasting.h"
 #include "mlx/backend/common/utils.h"
 #include "mlx/primitives.h"

@ -42,23 +43,6 @@ void AsStrided::eval(const std::vector<array>& inputs, array& out) {
  return out.copy_shared_buffer(in, strides_, flags, data_size, offset_);
 }

-void broadcast(const array& in, array& out) {
-  if (out.size() == 0) {
-    out.set_data(nullptr);
-    return;
-  }
-  Strides strides(out.ndim(), 0);
-  int diff = out.ndim() - in.ndim();
-  for (int i = in.ndim() - 1; i >= 0; --i) {
-    strides[i + diff] = (in.shape()[i] == 1) ? 0 : in.strides()[i];
-  }
-  auto flags = in.flags();
-  if (out.size() > in.size()) {
-    flags.row_contiguous = flags.col_contiguous = false;
-  }
-  out.copy_shared_buffer(in, strides, flags, in.data_size());
-}
-
 void Broadcast::eval(const std::vector<array>& inputs, array& out) {
  broadcast(inputs[0], out);
 }
--- a/mlx/backend/common/compiled.cpp
+++ b/mlx/backend/common/compiled.cpp
@ -1,8 +1,7 @@
 // Copyright © 2023-2024 Apple Inc.

 #include "mlx/backend/common/compiled.h"
-#include "mlx/graph_utils.h"
-#include "mlx/primitives.h"
+#include "mlx/backend/common/utils.h"
 #include "mlx/utils.h"

 namespace mlx::core {
@ -79,55 +78,6 @@ std::string get_type_string(Dtype d) {
  }
 }

-std::string build_lib_name(
-    const std::vector<array>& inputs,
-    const std::vector<array>& outputs,
-    const std::vector<array>& tape,
-    const std::unordered_set<uintptr_t>& constant_ids) {
-  NodeNamer namer;
-  std::ostringstream os;
-  std::ostringstream constant_hasher;
-
-  // Fill the input names. This is not really necessary, I just like having A,
-  // B, C, ... as the inputs.
-  for (auto& x : inputs) {
-    namer.get_name(x);
-  }
-
-  // The primitives describing the tape. For unary and binary primitives this
-  // must be enough to describe the full computation.
-  for (auto& a : tape) {
-    // name and type of output
-    os << namer.get_name(a) << kindof(a.dtype()) << a.itemsize();
-    // computation performed
-    a.primitive().print(os);
-    // name of inputs to the function
-    for (auto& inp : a.inputs()) {
-      os << namer.get_name(inp);
-    }
-  }
-  os << "_";
-
-  for (auto& x : inputs) {
-    if (constant_ids.find(x.id()) != constant_ids.end()) {
-      os << "C";
-      print_constant(constant_hasher, x);
-    } else {
-      os << (is_scalar(x) ? "S" : "V");
-    }
-  }
-  os << "_";
-  for (auto& x : inputs) {
-    if (constant_ids.find(x.id()) != constant_ids.end()) {
-      continue;
-    }
-    os << kindof(x.dtype()) << x.itemsize();
-  }
-  os << "_" << std::hash<std::string>{}(constant_hasher.str());
-
-  return os.str();
-}
-
 bool compiled_check_contiguity(
    const std::vector<array>& inputs,
    const Shape& shape) {
@ -159,8 +109,7 @@ bool compiled_check_contiguity(
 void compiled_allocate_outputs(
    const std::vector<array>& inputs,
    std::vector<array>& outputs,
-    const std::vector<array>& inputs_,
-    const std::unordered_set<uintptr_t>& constant_ids_,
+    const std::function<bool(size_t)>& is_constant,
    bool contiguous) {
  if (contiguous) {
    int o = 0;
@ -175,8 +124,7 @@ void compiled_allocate_outputs(
      // - Donatable
      // - Not a constant
      if (in.itemsize() == outputs[o].itemsize() && !is_scalar(in) &&
-          in.is_donatable() &&
-          constant_ids_.find(inputs_[i].id()) == constant_ids_.end()) {
+          in.is_donatable() && is_constant(i)) {
        outputs[o++].copy_shared_buffer(in);
      }
      // Get representative input flags to properly set non-donated outputs
@ -204,7 +152,7 @@ void compiled_allocate_outputs(
      // - Not a constant
      if (in.flags().row_contiguous && in.size() == outputs[o].size() &&
          in.itemsize() == outputs[o].itemsize() && in.is_donatable() &&
-          constant_ids_.find(inputs_[i].id()) == constant_ids_.end()) {
+          is_constant(i)) {
        outputs[o].copy_shared_buffer(
            in, outputs[o].strides(), in.flags(), in.data_size());
        o++;
@ -216,4 +164,74 @@ void compiled_allocate_outputs(
  }
 }

+// Collapse the contiguous dimensions shared by the output and the inputs,
+// ignoring constant and scalar inputs.
+//
+// Returns {contiguous, shape, strides}. When compiled_check_contiguity()
+// holds, the output shape is returned unchanged with an empty strides list.
+// Otherwise the strides passed to collapse_contiguous_dims() start with
+// those of `out`, followed by one entry per input that was not skipped
+// (in input order), each broadcast to the output shape first.
+std::tuple<bool, Shape, std::vector<Strides>> compiled_collapse_contiguous_dims(
+    const std::vector<array>& inputs,
+    const array& out,
+    const std::function<bool(size_t)>& is_constant) {
+  const Shape& shape = out.shape();
+  if (compiled_check_contiguity(inputs, shape)) {
+    return {true, shape, {}};
+  }
+
+  // Stride for an axis along which an input is broadcast: the output stride
+  // when the output axis has size 1, otherwise 0.
+  auto broadcast_stride = [&](size_t axis) {
+    return shape[axis] == 1 ? out.strides()[axis] : 0;
+  };
+
+  std::vector<Strides> strides_vec{out.strides()};
+  for (size_t idx = 0; idx < inputs.size(); ++idx) {
+    const auto& x = inputs[idx];
+    // Constants and scalars do not contribute any strides.
+    if (is_constant(idx) || is_scalar(x)) {
+      continue;
+    }
+
+    // Broadcast the input to the output shape: first the leading axes the
+    // input does not have, then the input's own axes.
+    Strides xstrides;
+    const size_t lead = shape.size() - x.ndim();
+    for (size_t axis = 0; axis < lead; ++axis) {
+      xstrides.push_back(broadcast_stride(axis));
+    }
+    for (size_t d = 0; d < x.ndim(); ++d) {
+      if (x.shape(d) == 1) {
+        xstrides.push_back(broadcast_stride(lead + d));
+      } else {
+        xstrides.push_back(x.strides()[d]);
+      }
+    }
+    strides_vec.push_back(std::move(xstrides));
+  }
+
+  auto [collapsed_shape, collapsed_strides] =
+      collapse_contiguous_dims(shape, strides_vec, INT32_MAX);
+  return {false, std::move(collapsed_shape), std::move(collapsed_strides)};
+}
+
+// Return whether the kernel should use large indices.
+//
+// For contiguous kernels the largest input data_size() is compared against
+// UINT32_MAX; otherwise the largest output size() is used.
+bool compiled_use_large_index(
+    const std::vector<array>& inputs,
+    const std::vector<array>& outputs,
+    bool contiguous) {
+  size_t largest = 0;
+  if (contiguous) {
+    for (const auto& in : inputs) {
+      largest = std::max(largest, in.data_size());
+    }
+  } else {
+    for (const auto& o : outputs) {
+      largest = std::max(largest, o.size());
+    }
+  }
+  return largest > UINT32_MAX;
+}
+
 } // namespace mlx::core
--- a/mlx/backend/common/compiled.h
+++ b/mlx/backend/common/compiled.h
@ -1,9 +1,8 @@
 // Copyright © 2023-2024 Apple Inc.
 #pragma once

+#include <functional>
 #include <iomanip>
-#include <sstream>
-#include <unordered_set>

 #include "mlx/array.h"
 #include "mlx/primitives.h"
@ -14,12 +13,6 @@ inline bool is_static_cast(const Primitive& p) {
  return (typeid(p) == typeid(Broadcast) || typeid(p) == typeid(AsType));
 }

-std::string build_lib_name(
-    const std::vector<array>& inputs,
-    const std::vector<array>& outputs,
-    const std::vector<array>& tape,
-    const std::unordered_set<uintptr_t>& constant_ids);
-
 std::string get_type_string(Dtype d);

 template <typename T>
@ -60,8 +53,19 @@ bool compiled_check_contiguity(
 void compiled_allocate_outputs(
    const std::vector<array>& inputs,
    std::vector<array>& outputs,
-    const std::vector<array>& inputs_,
-    const std::unordered_set<uintptr_t>& constant_ids_,
+    const std::function<bool(size_t)>& is_constant,
+    bool contiguous);
+
+// Collapse contiguous dims ignoring scalars and constants.
+std::tuple<bool, Shape, std::vector<Strides>> compiled_collapse_contiguous_dims(
+    const std::vector<array>& inputs,
+    const array& out,
+    const std::function<bool(size_t)>& is_constant);
+
+// Return whether the kernel should use large index.
+bool compiled_use_large_index(
+    const std::vector<array>& inputs,
+    const std::vector<array>& outputs,
    bool contiguous);

 } // namespace mlx::core
--- a/mlx/backend/common/copy.h
+++ b/mlx/backend/common/copy.h
@ -2,7 +2,7 @@

 #pragma once

-#include "mlx/array.h"
+#include "mlx/backend/common/utils.h"

 namespace mlx::core {

@ -26,7 +26,7 @@ inline bool set_copy_output_data(const array& in, array& out, CopyType ctype) {
  if (ctype == CopyType::Vector) {
    // If the input is donateable, we are doing a vector copy and the types
    // have the same size, then the input buffer can hold the output.
-    if (in.is_donatable() && in.itemsize() == out.itemsize()) {
+    if (is_donatable(in, out)) {
      out.copy_shared_buffer(in);
      return true;
    } else {
--- a/mlx/backend/common/hadamard.h
+++ b/mlx/backend/common/hadamard.h
@ -99,7 +99,11 @@ inline std::pair<int, int> decompose_hadamard(int n) {
          "[hadamard] Only supports n = m*2^k where m in (1, 12, 20, 28).");
    }
  }
+  if (n > (1 << 26)) {
+    throw std::invalid_argument(
+        "[hadamard] Only supports n = m*2^k where k <= 26");
+  }
  return {n, m};
 }

-} // namespace mlx::core
+} // namespace mlx::core
--- a/mlx/backend/common/matmul.h
+++ b/mlx/backend/common/matmul.h
@ -0,0 +1,78 @@
+// Copyright © 2025 Apple Inc.
+
+#pragma once
+
+#include "mlx/backend/common/utils.h"
+#include "mlx/utils.h"
+
+#include <sstream>
+
+namespace mlx::core {
+
+// Collapse the batch dimensions (all axes except the last two) of `a` and
+// `b` with collapse_contiguous_dims().
+//
+// Returns {batch_shape, a_batch_strides, b_batch_strides}. Throws
+// std::runtime_error when the batch shapes of `a` and `b` differ. If no batch
+// dimension is left, a single batch of size 1 with stride 0 is returned.
+// NOTE(review): assumes both arrays have at least two dimensions -- confirm
+// against callers.
+inline std::tuple<Shape, Strides, Strides> collapse_batches(
+    const array& a,
+    const array& b) {
+  // Batch shapes must already agree.
+  Shape a_bshape{a.shape().begin(), a.shape().end() - 2};
+  Shape b_bshape{b.shape().begin(), b.shape().end() - 2};
+  if (a_bshape != b_bshape) {
+    std::ostringstream msg;
+    msg << "[matmul] Got matrices with incorrectly broadcasted shapes: " << "A "
+        << a.shape() << ", B " << b.shape() << ".";
+    throw std::runtime_error(msg.str());
+  }
+
+  Strides a_bstride{a.strides().begin(), a.strides().end() - 2};
+  Strides b_bstride{b.strides().begin(), b.strides().end() - 2};
+
+  auto collapsed =
+      collapse_contiguous_dims(a_bshape, std::vector{a_bstride, b_bstride});
+  auto& batch_shape = std::get<0>(collapsed);
+  auto& batch_strides = std::get<1>(collapsed);
+
+  Strides a_batch_strides = batch_strides[0];
+  Strides b_batch_strides = batch_strides[1];
+
+  // Always report at least one batch dimension.
+  if (batch_shape.empty()) {
+    batch_shape.push_back(1);
+    a_batch_strides.push_back(0);
+    b_batch_strides.push_back(0);
+  }
+
+  return std::make_tuple(batch_shape, a_batch_strides, b_batch_strides);
+}
+
+// Collapse the batch dimensions (all axes except the last two) of `a`, `b`
+// and `c` with collapse_contiguous_dims().
+//
+// Returns {batch_shape, A_batch_stride, B_batch_stride, C_batch_stride}.
+// Throws std::runtime_error when the three batch shapes are not identical.
+// If no batch dimension is left, a single batch of size 1 with stride 0 is
+// returned.
+// NOTE(review): assumes all arrays have at least two dimensions -- confirm
+// against callers.
+inline std::tuple<Shape, Strides, Strides, Strides>
+collapse_batches(const array& a, const array& b, const array& c) {
+  // Get and check the shape for the batched dims
+  Shape A_bshape{a.shape().begin(), a.shape().end() - 2};
+  Shape B_bshape{b.shape().begin(), b.shape().end() - 2};
+  Shape C_bshape{c.shape().begin(), c.shape().end() - 2};
+  if (A_bshape != B_bshape || A_bshape != C_bshape) {
+    std::ostringstream msg;
+    // Label each operand with its own name so the message is unambiguous.
+    msg << "[addmm] Got matrices with incorrectly broadcasted shapes: " << "A "
+        << a.shape() << ", B " << b.shape() << ", C " << c.shape() << ".";
+    throw std::runtime_error(msg.str());
+  }
+
+  Strides A_bstride{a.strides().begin(), a.strides().end() - 2};
+  Strides B_bstride{b.strides().begin(), b.strides().end() - 2};
+  Strides C_bstride{c.strides().begin(), c.strides().end() - 2};
+
+  auto [batch_shape, batch_strides] = collapse_contiguous_dims(
+      A_bshape, std::vector{A_bstride, B_bstride, C_bstride});
+
+  auto A_batch_stride = batch_strides[0];
+  auto B_batch_stride = batch_strides[1];
+  auto C_batch_stride = batch_strides[2];
+
+  // Always report at least one batch dimension.
+  if (batch_shape.empty()) {
+    batch_shape.push_back(1);
+    A_batch_stride.push_back(0);
+    B_batch_stride.push_back(0);
+    C_batch_stride.push_back(0);
+  }
+
+  return std::make_tuple(
+      batch_shape, A_batch_stride, B_batch_stride, C_batch_stride);
+}
+
+} // namespace mlx::core
--- a/mlx/backend/common/unary.h
+++ b/mlx/backend/common/unary.h
@ -0,0 +1,26 @@
+// Copyright © 2025 Apple Inc.
+
+#pragma once
+
+#include "mlx/allocator.h"
+#include "mlx/backend/common/utils.h"
+
+namespace mlx::core {
+
+// Set up the data of `out` for a unary operation on `in`.
+//
+// - Non-contiguous input: allocate out.nbytes() bytes.
+// - Contiguous input that passes is_donatable(in, out): share the input
+//   buffer with the output.
+// - Any other contiguous input: allocate in.data_size() elements of the
+//   output item size and reuse the input's data size, strides and flags.
+inline void set_unary_output_data(const array& in, array& out) {
+  if (!in.flags().contiguous) {
+    out.set_data(allocator::malloc(out.nbytes()));
+    return;
+  }
+
+  if (is_donatable(in, out)) {
+    out.copy_shared_buffer(in);
+    return;
+  }
+
+  out.set_data(
+      allocator::malloc(in.data_size() * out.itemsize()),
+      in.data_size(),
+      in.strides(),
+      in.flags());
+}
+
+} // namespace mlx::core
--- a/mlx/backend/common/utils.cpp
+++ b/mlx/backend/common/utils.cpp
@ -1,9 +1,16 @@
 // Copyright © 2023-2024 Apple Inc.

 #include "mlx/backend/common/utils.h"
+#include "mlx/primitives.h"

 namespace mlx::core {

+// Return the text that `primitive` prints about itself.
+// `primitive` is dereferenced unconditionally and must not be null.
+std::string get_primitive_string(Primitive* primitive) {
+  std::ostringstream buffer;
+  (*primitive).print(buffer);
+  return buffer.str();
+}
+
 std::tuple<Shape, std::vector<Strides>> collapse_contiguous_dims(
    const Shape& shape,
    const std::vector<Strides>& strides,
@ -101,4 +108,115 @@ std::pair<Shape, Strides> collapse_contiguous_dims(
  return collapse_contiguous_dims(a.shape(), a.strides(), size_cap);
 }

+// Compute power-of-two block dimensions that fit (dim0, dim1, dim2).
+//
+// The exponents are grown round-robin, one doubling per dimension per pass,
+// as long as the doubled extent still fits in the corresponding dimension.
+// Growth stops as soon as the exponents sum to `pow2` (so the block holds at
+// most 2^pow2 elements) or when no dimension can grow any further.
+//
+// Every stop check uses `pow2`. Comparing against a hard-coded 10 in the
+// middle of a pass let the sum step over a smaller `pow2` without ever
+// matching it, and capped a larger `pow2` at 10. Results for the default
+// pow2 = 10 are unchanged.
+Dims get_block_dims_common(int dim0, int dim1, int dim2, int pow2 /* = 10 */) {
+  const int dims[3] = {dim0, dim1, dim2};
+  int pows[3] = {0, 0, 0};
+  int sum = 0;
+  bool grew = true;
+  while (grew && sum < pow2) {
+    grew = false;
+    for (int i = 0; i < 3 && sum < pow2; ++i) {
+      // Compare in 64 bits so the shift cannot overflow an int when a large
+      // pow2 lets an exponent reach 30.
+      if (static_cast<long long>(dims[i]) >= (1LL << (pows[i] + 1))) {
+        pows[i]++;
+        sum++;
+        grew = true;
+      }
+    }
+  }
+  return std::make_tuple(1ul << pows[0], 1ul << pows[1], 1ul << pows[2]);
+}
+
+// Factor the non-broadcasted dimensions of `shape` into a 2D grid.
+//
+// Dimensions whose stride is 0 (broadcasted) are skipped. Each remaining
+// dimension is multiplied into the first grid extent while the product stays
+// below UINT32_MAX, otherwise into the second one. Throws std::runtime_error
+// if either extent ends up above UINT32_MAX. The larger extent is returned
+// first and the third grid dimension is always 1.
+Dims get_2d_grid_dims_common(const Shape& shape, const Strides& strides) {
+  size_t wide = 1;
+  size_t tall = 1;
+  for (size_t axis = 0; axis < shape.size(); ++axis) {
+    if (strides[axis] == 0) {
+      continue;
+    }
+    const auto extent = shape[axis];
+    size_t& target = (wide * extent < UINT32_MAX) ? wide : tall;
+    target *= extent;
+  }
+
+  if (tall > UINT32_MAX || wide > UINT32_MAX) {
+    throw std::runtime_error("Unable to safely factor shape.");
+  }
+
+  if (tall > wide) {
+    std::swap(wide, tall);
+  }
+  return std::make_tuple(
+      static_cast<uint32_t>(wide), static_cast<uint32_t>(tall), 1);
+}
+
+// Factor the non-broadcasted dimensions of `shape` into a 2D grid whose
+// total size is the product of those dimensions divided by `divisor`.
+//
+// Throws std::runtime_error if either grid extent exceeds UINT32_MAX or if
+// `divisor` could not be completely divided out. The larger extent is
+// returned first and the third grid dimension is always 1.
+Dims get_2d_grid_dims_common(
+    const Shape& shape,
+    const Strides& strides,
+    size_t divisor) {
+  // Compute the 2d grid dimensions such that the total size of the grid is
+  // divided by divisor.
+  size_t grid_x = 1;
+  size_t grid_y = 1;
+  for (int i = 0; i < shape.size(); ++i) {
+    // Dimensions with a stride of 0 do not contribute to the grid.
+    if (strides[i] == 0) {
+      continue;
+    }
+
+    // If this dimension divides the remaining divisor it is dropped entirely
+    // and the divisor shrinks by the same factor.
+    // NOTE(review): assumes shape[i] != 0 here, otherwise the modulo is
+    // undefined -- confirm callers never pass empty shapes.
+    if (divisor % shape[i] == 0) {
+      divisor /= shape[i];
+      continue;
+    }
+
+    // Otherwise fold the dimension into grid_x while the product stays below
+    // UINT32_MAX, else into grid_y.
+    if (grid_x * shape[i] < UINT32_MAX) {
+      grid_x *= shape[i];
+    } else {
+      grid_y *= shape[i];
+    }
+
+    // Try to divide what is left of the divisor out of one of the extents
+    // accumulated so far.
+    if (divisor > 1) {
+      if (grid_x % divisor == 0) {
+        grid_x /= divisor;
+        divisor = 1;
+      } else if (grid_y % divisor == 0) {
+        grid_y /= divisor;
+        divisor = 1;
+      }
+    }
+  }
+  // A leftover divisor means the requested division could not be applied.
+  if (grid_y > UINT32_MAX || grid_x > UINT32_MAX || divisor > 1) {
+    throw std::runtime_error("Unable to safely factor shape.");
+  }
+  // Keep the larger extent in x.
+  if (grid_y > grid_x) {
+    std::swap(grid_x, grid_y);
+  }
+  return std::make_tuple(
+      static_cast<uint32_t>(grid_x), static_cast<uint32_t>(grid_y), 1);
+}
+
+// Return {grid, block}: the block comes from get_block_dims_common() with
+// its default pow2, and the grid holds the number of blocks needed to cover
+// each of dim0, dim1 and dim2 (rounded up).
+std::pair<Dims, Dims> get_grid_and_block_common(int dim0, int dim1, int dim2) {
+  auto [bx, by, bz] = get_block_dims_common(dim0, dim1, dim2);
+
+  // Ceiling division of an extent by its block size.
+  auto blocks_for = [](int extent, auto block) {
+    return (extent + block - 1) / block;
+  };
+
+  return std::make_pair(
+      std::make_tuple(
+          blocks_for(dim0, bx), blocks_for(dim1, by), blocks_for(dim2, bz)),
+      std::make_tuple(bx, by, bz));
+}
+
 } // namespace mlx::core
--- a/mlx/backend/common/utils.h
+++ b/mlx/backend/common/utils.h
@ -2,12 +2,15 @@

 #pragma once

+#include <tuple>
 #include <vector>

 #include "mlx/array.h"

 namespace mlx::core {

+std::string get_primitive_string(Primitive* primitive);
+
 inline int64_t
 elem_to_loc(int elem, const Shape& shape, const Strides& strides) {
  int64_t loc = 0;
@ -70,6 +73,31 @@ std::pair<Shape, Strides> collapse_contiguous_dims(
    const array& a,
    int64_t size_cap = std::numeric_limits<int32_t>::max());

+// Compute the thread block dimensions which fit the given
+// input dimensions.
+// - The thread block dimensions will be powers of two
+// - The thread block size will be less than 2^pow2
+using Dims = std::tuple<uint32_t, uint32_t, uint32_t>;
+Dims get_block_dims_common(int dim0, int dim1, int dim2, int pow2 = 10);
+
+// Computes a 2D grid where each element is < UINT_MAX
+// Assumes:
+// - overall size (product of non-broadcasted dimensions) is < UINT_MAX^2
+// - shape and strides correspond to a contiguous (no holes) but
+//   possibly broadcasted array
+Dims get_2d_grid_dims_common(const Shape& shape, const Strides& strides);
+
+// Same as above but we do an implicit division with divisor.
+// Basically, equivalent to factorizing
+//    Prod(s \forall s in shape if strides[s] > 0) / divisor.
+Dims get_2d_grid_dims_common(
+    const Shape& shape,
+    const Strides& strides,
+    size_t divisor);
+
+// Get both the block and a grid of blocks that covers dim0, dim1 and dim2.
+std::pair<Dims, Dims> get_grid_and_block_common(int dim0, int dim1, int dim2);
+
 struct ContiguousIterator {
  inline void step() {
    int dims = shape_.size();
@ -165,4 +193,11 @@ void shared_buffer_reshape(
    const array& in,
    const Strides& out_strides,
    array& out);
+
+// Return a copy of `vec` with the element at position `index` removed.
+// `index` must be a valid position in `vec`.
+template <typename T>
+inline std::vector<T> remove_index(std::vector<T> vec, size_t index) {
+  auto target = vec.begin();
+  std::advance(target, index);
+  vec.erase(target);
+  return vec;
+}
+
 } // namespace mlx::core
--- a/mlx/backend/cpu/CMakeLists.txt
+++ b/mlx/backend/cpu/CMakeLists.txt
@ -40,11 +40,13 @@ add_dependencies(mlx cpu_compiled_preamble)

 target_sources(
  mlx
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/arg_reduce.cpp
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/available.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/arg_reduce.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/binary.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/conv.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/copy.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/distributed.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/eig.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/eigh.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/encoder.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/fft.cpp
@ -74,8 +76,8 @@ target_sources(
 if(MLX_BUILD_ACCELERATE)
  target_sources(mlx PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/gemms/bnns.cpp)
 else()
-  target_sources(mlx PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/gemms/no_fp16.cpp
-                             ${CMAKE_CURRENT_SOURCE_DIR}/gemms/no_bf16.cpp)
+  target_sources(mlx PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/gemms/simd_fp16.cpp
+                             ${CMAKE_CURRENT_SOURCE_DIR}/gemms/simd_bf16.cpp)
 endif()

 if(IOS)
--- a/mlx/backend/cpu/arg_reduce.cpp
+++ b/mlx/backend/cpu/arg_reduce.cpp
@ -14,10 +14,8 @@ template <typename InT, typename OpT>
 void arg_reduce(const array& in, array& out, const OpT& op, int axis) {
  auto axis_size = in.shape()[axis];
  auto axis_stride = in.strides()[axis];
-  Strides strides = in.strides();
-  Shape shape = in.shape();
-  strides.erase(strides.begin() + axis);
-  shape.erase(shape.begin() + axis);
+  Strides strides = remove_index(in.strides(), axis);
+  Shape shape = remove_index(in.shape(), axis);
  auto in_ptr = in.data<InT>();
  auto out_ptr = out.data<uint32_t>();

--- a/mlx/backend/cpu/available.cpp
+++ b/mlx/backend/cpu/available.cpp
@ -0,0 +1,11 @@
+// Copyright © 2025 Apple Inc.
+
+#include "mlx/backend/cpu/available.h"
+
+namespace mlx::core::cpu {
+
+// Report whether the CPU backend can be used. This implementation
+// unconditionally returns true.
+bool is_available() {
+  return true;
+}
+
+} // namespace mlx::core::cpu
--- a/mlx/backend/cpu/available.h
+++ b/mlx/backend/cpu/available.h
@ -0,0 +1,9 @@
+// Copyright © 2025 Apple Inc.
+
+#pragma once
+
+namespace mlx::core::cpu {
+
+bool is_available();
+
+} // namespace mlx::core::cpu
--- a/mlx/backend/cpu/binary.cpp
+++ b/mlx/backend/cpu/binary.cpp
@ -172,9 +172,12 @@ void binary_float(
      case bfloat16:
        binary_op<bfloat16_t, Op>(a, b, out, bopt);
        break;
+      case complex64:
+        binary_op<complex64_t, Op>(a, b, out, bopt);
+        break;
      default:
        throw std::runtime_error(
-            "[binary_float] Only supports non-complex floating point types.");
+            "[binary_float] Only supports floating point types.");
    }
  });
 }
--- a/mlx/backend/cpu/compiled.cpp
+++ b/mlx/backend/cpu/compiled.cpp
@ -40,7 +40,10 @@ struct CompilerCache {
  std::shared_mutex mtx;
 };

-static CompilerCache cache{};
+// Accessor for the single CompilerCache instance used by this file. The
+// instance is a function-local static, so it is constructed the first time
+// cache() is called.
+static CompilerCache& cache() {
+  static CompilerCache cache_;
+  return cache_;
+}

 // GPU compile is always available if the GPU is available and since we are in
 // this file CPU compile is also available.
@ -56,14 +59,16 @@ void* compile(
    const std::string& kernel_name,
    const std::function<std::string(void)>& source_builder) {
  {
-    std::shared_lock lock(cache.mtx);
-    if (auto it = cache.kernels.find(kernel_name); it != cache.kernels.end()) {
+    std::shared_lock lock(cache().mtx);
+    if (auto it = cache().kernels.find(kernel_name);
+        it != cache().kernels.end()) {
      return it->second;
    }
  }

-  std::unique_lock lock(cache.mtx);
-  if (auto it = cache.kernels.find(kernel_name); it != cache.kernels.end()) {
+  std::unique_lock lock(cache().mtx);
+  if (auto it = cache().kernels.find(kernel_name);
+      it != cache().kernels.end()) {
    return it->second;
  }
  std::string source_code = source_builder();
@ -120,10 +125,10 @@ void* compile(
  }

  // load library
-  cache.libs.emplace_back(shared_lib_path);
+  cache().libs.emplace_back(shared_lib_path);

  // Load function
-  void* fun = dlsym(cache.libs.back().lib, kernel_name.c_str());
+  void* fun = dlsym(cache().libs.back().lib, kernel_name.c_str());
  if (!fun) {
    std::ostringstream msg;
    msg << "[Compile::eval_cpu] Failed to load compiled function "
@ -131,7 +136,7 @@ void* compile(
        << dlerror();
    throw std::runtime_error(msg.str());
  }
-  cache.kernels.insert({kernel_name, fun});
+  cache().kernels.insert({kernel_name, fun});
  return fun;
 }

@ -141,18 +146,9 @@ inline void build_kernel(
    const std::vector<array>& inputs,
    const std::vector<array>& outputs,
    const std::vector<array>& tape,
-    const std::unordered_set<uintptr_t>& constant_ids,
+    const std::function<bool(size_t)>& is_constant,
    bool contiguous,
    int ndim) {
-  // All outputs should have the exact same shape and will be row contiguous
-  auto output_shape = outputs[0].shape();
-  auto output_strides = outputs[0].strides();
-
-  // Constants are scalars that are captured by value and cannot change
-  auto is_constant = [&constant_ids](const array& x) {
-    return constant_ids.find(x.id()) != constant_ids.end();
-  };
-
  NodeNamer namer;

 #ifdef _MSC_VER
@ -165,14 +161,15 @@ inline void build_kernel(

  // Add the input arguments
  int cnt = 0;
-  for (auto& x : inputs) {
-    auto& xname = namer.get_name(x);
-
+  for (size_t i = 0; i < inputs.size(); ++i) {
    // Skip constants from the input list
-    if (is_constant(x)) {
+    if (is_constant(i)) {
      continue;
    }

+    const auto& x = inputs[i];
+    auto& xname = namer.get_name(x);
+
    auto tstr = get_type_string(x.dtype());
    os << "  " << tstr << "* " << xname << " = (" << tstr << "*)args[" << cnt++
       << "];" << std::endl;
@ -206,10 +203,11 @@ inline void build_kernel(
  }

  // Read the inputs in tmps
-  for (auto& x : inputs) {
+  for (size_t i = 0; i < inputs.size(); ++i) {
+    const auto& x = inputs[i];
    auto& xname = namer.get_name(x);

-    if (is_constant(x)) {
+    if (is_constant(i)) {
      os << "  " << get_type_string(x.dtype()) << " tmp_" << xname << " = ";
      print_constant(os, x);
      os << ";" << std::endl;
@ -259,8 +257,9 @@ inline void build_kernel(
  } else {
    for (int d = ndim - 1; d >= 0; --d) {
      // Update pointers
-      for (auto& x : inputs) {
-        if (is_constant(x) || is_scalar(x)) {
+      for (size_t i = 0; i < inputs.size(); ++i) {
+        const auto& x = inputs[i];
+        if (is_constant(i) || is_scalar(x)) {
          continue;
        }
        auto& xname = namer.get_name(x);
@ -282,65 +281,37 @@ inline void build_kernel(
 void Compiled::eval_cpu(
    const std::vector<array>& inputs,
    std::vector<array>& outputs) {
-  if (kernel_lib_.empty()) {
-    kernel_lib_ = build_lib_name(inputs_, outputs_, tape_, constant_ids_);
-  }
-
-  // Figure out which kernel we are using
-  auto& shape = outputs[0].shape();
-  auto contiguous = compiled_check_contiguity(inputs, shape);
  auto& encoder = cpu::get_command_encoder(stream());

-  // Handle all broadcasting and collect function input arguments
+  // Collapse contiguous dims to route to a faster kernel if possible. Also
+  // handle all broadcasting.
+  auto [contiguous, shape, strides] =
+      compiled_collapse_contiguous_dims(inputs, outputs[0], is_constant_);
+
+  // Collect function input arguments.
  std::vector<void*> args;
-  std::vector<std::vector<size_t>> strides;
-  for (int i = 0; i < inputs.size(); i++) {
-    // Skip constants.
-    if (constant_ids_.find(inputs_[i].id()) != constant_ids_.end()) {
+  int strides_index = 1;
+  for (size_t i = 0; i < inputs.size(); ++i) {
+    if (is_constant_(i)) {
      continue;
    }
-    auto& x = inputs[i];
+    const auto& x = inputs[i];
    encoder.set_input_array(x);
    args.push_back((void*)x.data<void>());
-
-    if (contiguous || is_scalar(x)) {
-      continue;
+    if (!contiguous && !is_scalar(x)) {
+      args.push_back(strides[strides_index++].data());
    }
-
-    // Broadcast the input to the output shape.
-    std::vector<size_t> xstrides;
-    int j = 0;
-    for (; j < shape.size() - x.ndim(); j++) {
-      if (shape[j] == 1) {
-        xstrides.push_back(outputs[0].strides()[j]);
-      } else {
-        xstrides.push_back(0);
-      }
-    }
-    for (int i = 0; i < x.ndim(); i++, j++) {
-      if (x.shape(i) == 1) {
-        if (shape[j] == 1) {
-          xstrides.push_back(outputs[0].strides()[j]);
-        } else {
-          xstrides.push_back(0);
-        }
-      } else {
-        xstrides.push_back(x.strides()[i]);
-      }
-    }
-    strides.push_back(std::move(xstrides));
-    args.push_back(strides.back().data());
  }

  // Get the kernel name from the lib
  int ndim = shape.size();
  auto kernel_name = kernel_lib_ + (contiguous ? "_contiguous" : "_strided_");
  if (!contiguous) {
-    kernel_name += std::to_string(shape.size());
+    kernel_name += std::to_string(ndim);
  }

  // Get the function
-  auto fn_ptr = compile(kernel_name, [&]() {
+  auto fn_ptr = compile(kernel_name, [&, contiguous = contiguous]() {
    std::ostringstream kernel;
    kernel << get_kernel_preamble() << std::endl;
    kernel << "extern \"C\"  {" << std::endl;
@ -350,7 +321,7 @@ void Compiled::eval_cpu(
        inputs_,
        outputs_,
        tape_,
-        constant_ids_,
+        is_constant_,
        contiguous,
        ndim);
    // Close extern "C"
@ -358,26 +329,22 @@ void Compiled::eval_cpu(
    return kernel.str();
  });

-  compiled_allocate_outputs(
-      inputs, outputs, inputs_, constant_ids_, contiguous);
+  compiled_allocate_outputs(inputs, outputs, is_constant_, contiguous);

  for (auto& x : outputs) {
    args.push_back(x.data<void>());
    encoder.set_output_array(x);
  }
-  Shape out_shape;
  if (!contiguous) {
-    out_shape = outputs[0].shape();
-    args.push_back((void*)out_shape.data());
+    args.push_back((void*)shape.data());
  } else {
    args.push_back((void*)outputs[0].data_size());
  }
  auto fun = (void (*)(void**))fn_ptr;
-  encoder.dispatch(
-      [fun,
-       args = std::move(args),
-       strides = std::move(strides),
-       out_shape = std::move(out_shape)]() mutable { fun(args.data()); });
+  encoder.dispatch([fun,
+                    args = std::move(args),
+                    strides = std::move(strides),
+                    shape = std::move(shape)]() mutable { fun(args.data()); });
 }

 } // namespace mlx::core
--- a/mlx/backend/cpu/conv.cpp
+++ b/mlx/backend/cpu/conv.cpp
@ -22,7 +22,8 @@ void slow_conv_1D(
    const array& in,
    const array& wt,
    array out,
-    const std::vector<int>& padding,
+    const std::vector<int>& padding_lo,
+    const std::vector<int>& padding_hi,
    const std::vector<int>& wt_strides,
    const std::vector<int>& wt_dilation,
    const std::vector<int>& in_dilation,
@ -60,7 +61,8 @@ void slow_conv_1D(
                    out_stride_O = out.strides()[2],

                    flip,
-                    padding = padding[0],
+                    padding_lo = padding_lo[0],
+                    padding_hi = padding_hi[0],
                    wt_stride = wt_strides[0],
                    wt_dilation = wt_dilation[0],
                    in_dilation = in_dilation[0]]() mutable {
@ -77,7 +79,7 @@ void slow_conv_1D(
              const T* wt_ptr = filter_wt_ptr + wh * wt_stride_H;

              int wh_flip = flip ? (wH - wh - 1) : wh;
-              int ih = oh * wt_stride - padding + wh_flip * wt_dilation;
+              int ih = oh * wt_stride - padding_lo + wh_flip * wt_dilation;

              auto ih_div = std::div(ih, in_dilation);

@ -109,7 +111,8 @@ void slow_conv_2D(
    const array& in,
    const array& wt,
    array out,
-    const std::vector<int>& padding,
+    const std::vector<int>& padding_lo,
+    const std::vector<int>& padding_hi,
    const std::vector<int>& wt_strides,
    const std::vector<int>& wt_dilation,
    const std::vector<int>& in_dilation,
@ -120,230 +123,235 @@ void slow_conv_2D(
  encoder.set_input_array(wt);
  encoder.set_output_array(out);

-  encoder.dispatch([st_wt_ptr = wt.data<T>(),
-                    st_in_ptr = in.data<T>(),
-                    st_out_ptr = out.data<T>(),
+  encoder.dispatch(
+      [st_wt_ptr = wt.data<T>(),
+       st_in_ptr = in.data<T>(),
+       st_out_ptr = out.data<T>(),

-                    N = in.shape(
-                        0), // Batch size, should be the same as out.shape(0)
-                    iH = 1 +
-                        in_dilation[0] * (in.shape(1) - 1), // Input spatial dim
-                    iW = 1 +
-                        in_dilation[1] * (in.shape(2) - 1), // Input spatial dim
-                    C = in.shape(3), // In channels
-                    oH = out.shape(1), // Output spatial dim
-                    oW = out.shape(2), // Output spatial dim
-                    O = wt.shape(0), // Out channels
-                    wH = wt.shape(1), // Weight spatial dim
-                    wW = wt.shape(2), // Weight spatial dim
+       N = in.shape(0), // Batch size, should be the same as out.shape(0)
+       iH = 1 + in_dilation[0] * (in.shape(1) - 1), // Input spatial dim
+       iW = 1 + in_dilation[1] * (in.shape(2) - 1), // Input spatial dim
+       C = in.shape(3), // In channels
+       oH = out.shape(1), // Output spatial dim
+       oW = out.shape(2), // Output spatial dim
+       O = wt.shape(0), // Out channels
+       wH = wt.shape(1), // Weight spatial dim
+       wW = wt.shape(2), // Weight spatial dim

-                    groups = in.shape(3) / wt.shape(3),
-                    C_per_group = wt.shape(3),
+       groups = in.shape(3) / wt.shape(3),
+       C_per_group = wt.shape(3),

-                    in_stride_N = in.strides()[0],
-                    in_stride_H = in.strides()[1],
-                    in_stride_W = in.strides()[2],
-                    in_stride_C = in.strides()[3],
+       in_stride_N = in.strides()[0],
+       in_stride_H = in.strides()[1],
+       in_stride_W = in.strides()[2],
+       in_stride_C = in.strides()[3],

-                    wt_stride_O = wt.strides()[0],
-                    wt_stride_H = wt.strides()[1],
-                    wt_stride_W = wt.strides()[2],
-                    wt_stride_C = wt.strides()[3],
+       wt_stride_O = wt.strides()[0],
+       wt_stride_H = wt.strides()[1],
+       wt_stride_W = wt.strides()[2],
+       wt_stride_C = wt.strides()[3],

-                    out_stride_N = out.strides()[0],
-                    out_stride_H = out.strides()[1],
-                    out_stride_W = out.strides()[2],
-                    out_stride_O = out.strides()[3],
+       out_stride_N = out.strides()[0],
+       out_stride_H = out.strides()[1],
+       out_stride_W = out.strides()[2],
+       out_stride_O = out.strides()[3],

-                    padding,
-                    wt_strides,
-                    wt_dilation,
-                    in_dilation,
-                    flip]() mutable {
-    bool is_idil_one = in_dilation[0] == 1 && in_dilation[1] == 1;
+       padding_lo,
+       padding_hi,
+       wt_strides,
+       wt_dilation,
+       in_dilation,
+       flip]() mutable {
+        bool is_idil_one = in_dilation[0] == 1 && in_dilation[1] == 1;

-    const int O_per_group = O / groups;
-    auto pt_conv_no_checks = [&](const T* in_ptr,
-                                 const T* wt_ptr,
-                                 T* out_ptr,
-                                 int oh,
-                                 int ow) {
-      out_ptr += oh * out_stride_H + ow * out_stride_W;
-      int ih_base = oh * wt_strides[0] - padding[0];
-      int iw_base = ow * wt_strides[1] - padding[1];
+        const int O_per_group = O / groups;
+        auto pt_conv_no_checks =
+            [&](const T* in_ptr, const T* wt_ptr, T* out_ptr, int oh, int ow) {
+              out_ptr += oh * out_stride_H + ow * out_stride_W;
+              int ih_base = oh * wt_strides[0] - padding_lo[0];
+              int iw_base = ow * wt_strides[1] - padding_lo[1];

-      for (int g = 0; g < groups; ++g) {
-        for (int o = g * O_per_group; o < (g + 1) * O_per_group; ++o) {
-          float r = 0.;
+              for (int g = 0; g < groups; ++g) {
+                for (int o = g * O_per_group; o < (g + 1) * O_per_group; ++o) {
+                  float r = 0.;

-          for (int wh = 0; wh < wH; ++wh) {
-            for (int ww = 0; ww < wW; ++ww) {
-              int wh_flip = flip ? wH - wh - 1 : wh;
-              int ww_flip = flip ? wW - ww - 1 : ww;
-              int ih = ih_base + wh_flip * wt_dilation[0];
-              int iw = iw_base + ww_flip * wt_dilation[1];
+                  for (int wh = 0; wh < wH; ++wh) {
+                    for (int ww = 0; ww < wW; ++ww) {
+                      int wh_flip = flip ? wH - wh - 1 : wh;
+                      int ww_flip = flip ? wW - ww - 1 : ww;
+                      int ih = ih_base + wh_flip * wt_dilation[0];
+                      int iw = iw_base + ww_flip * wt_dilation[1];

-              const T* wt_ptr_pt = wt_ptr + wh * wt_stride_H + ww * wt_stride_W;
-              const T* in_ptr_pt = in_ptr + ih * in_stride_H + iw * in_stride_W;
+                      const T* wt_ptr_pt =
+                          wt_ptr + wh * wt_stride_H + ww * wt_stride_W;
+                      const T* in_ptr_pt =
+                          in_ptr + ih * in_stride_H + iw * in_stride_W;

-              for (int c = g * C_per_group; c < (g + 1) * C_per_group; ++c) {
-                r += static_cast<float>(in_ptr_pt[c * in_stride_C]) *
-                    static_cast<float>(
-                         wt_ptr_pt[(c % C_per_group) * wt_stride_C]);
-              } // c
-            } // ww
-          } // wh
+                      for (int c = g * C_per_group; c < (g + 1) * C_per_group;
+                           ++c) {
+                        r += static_cast<float>(in_ptr_pt[c * in_stride_C]) *
+                            static_cast<float>(
+                                 wt_ptr_pt[(c % C_per_group) * wt_stride_C]);
+                      } // c
+                    } // ww
+                  } // wh

-          out_ptr[0] = static_cast<T>(r);
-          out_ptr += out_stride_O;
-          wt_ptr += wt_stride_O;
-        } // o
-      } // g
-    };
+                  out_ptr[0] = static_cast<T>(r);
+                  out_ptr += out_stride_O;
+                  wt_ptr += wt_stride_O;
+                } // o
+              } // g
+            };

-    int jump_h = flip ? -wt_dilation[0] : wt_dilation[0];
-    int jump_w = flip ? -wt_dilation[1] : wt_dilation[1];
+        int jump_h = flip ? -wt_dilation[0] : wt_dilation[0];
+        int jump_w = flip ? -wt_dilation[1] : wt_dilation[1];

-    int init_h = (flip ? (wH - 1) * wt_dilation[0] : 0);
-    int init_w = (flip ? (wW - 1) * wt_dilation[1] : 0);
+        int init_h = (flip ? (wH - 1) * wt_dilation[0] : 0);
+        int init_w = (flip ? (wW - 1) * wt_dilation[1] : 0);

-    int f_wgt_jump_h =
-        std::lcm(in_dilation[0], wt_dilation[0]) / wt_dilation[0];
-    int f_wgt_jump_w =
-        std::lcm(in_dilation[1], wt_dilation[1]) / wt_dilation[1];
+        int f_wgt_jump_h =
+            std::lcm(in_dilation[0], wt_dilation[0]) / wt_dilation[0];
+        int f_wgt_jump_w =
+            std::lcm(in_dilation[1], wt_dilation[1]) / wt_dilation[1];

-    int f_out_jump_h = std::lcm(in_dilation[0], wt_strides[0]) / wt_strides[0];
-    int f_out_jump_w = std::lcm(in_dilation[1], wt_strides[1]) / wt_strides[1];
+        int f_out_jump_h =
+            std::lcm(in_dilation[0], wt_strides[0]) / wt_strides[0];
+        int f_out_jump_w =
+            std::lcm(in_dilation[1], wt_strides[1]) / wt_strides[1];

-    std::vector<int> base_h(f_out_jump_h);
-    std::vector<int> base_w(f_out_jump_w);
+        std::vector<int> base_h(f_out_jump_h);
+        std::vector<int> base_w(f_out_jump_w);

-    for (int i = 0; i < f_out_jump_h; ++i) {
-      int ih_loop = i * wt_strides[0] - padding[0] + init_h;
+        for (int i = 0; i < f_out_jump_h; ++i) {
+          int ih_loop = i * wt_strides[0] - padding_lo[0] + init_h;

-      int wh_base = 0;
-      while (wh_base < wH && ih_loop % in_dilation[0] != 0) {
-        wh_base++;
-        ih_loop += jump_h;
-      }
+          int wh_base = 0;
+          while (wh_base < wH && ih_loop % in_dilation[0] != 0) {
+            wh_base++;
+            ih_loop += jump_h;
+          }

-      base_h[i] = wh_base;
-    }
+          base_h[i] = wh_base;
+        }

-    for (int j = 0; j < f_out_jump_w; ++j) {
-      int iw_loop = j * wt_strides[1] - padding[1] + init_w;
+        for (int j = 0; j < f_out_jump_w; ++j) {
+          int iw_loop = j * wt_strides[1] - padding_lo[1] + init_w;

-      int ww_base = 0;
-      while (ww_base < wW && iw_loop % in_dilation[1] != 0) {
-        ww_base++;
-        iw_loop += jump_w;
-      }
+          int ww_base = 0;
+          while (ww_base < wW && iw_loop % in_dilation[1] != 0) {
+            ww_base++;
+            iw_loop += jump_w;
+          }

-      base_w[j] = ww_base;
-    }
+          base_w[j] = ww_base;
+        }

-    auto pt_conv_all_checks =
-        [&](const T* in_ptr, const T* wt_ptr, T* out_ptr, int oh, int ow) {
-          out_ptr += oh * out_stride_H + ow * out_stride_W;
+        auto pt_conv_all_checks =
+            [&](const T* in_ptr, const T* wt_ptr, T* out_ptr, int oh, int ow) {
+              out_ptr += oh * out_stride_H + ow * out_stride_W;

-          int ih_base = oh * wt_strides[0] - padding[0];
-          int iw_base = ow * wt_strides[1] - padding[1];
+              int ih_base = oh * wt_strides[0] - padding_lo[0];
+              int iw_base = ow * wt_strides[1] - padding_lo[1];

-          int wh_base = base_h[oh % f_out_jump_h];
-          int ww_base = base_w[ow % f_out_jump_w];
+              int wh_base = base_h[oh % f_out_jump_h];
+              int ww_base = base_w[ow % f_out_jump_w];

-          for (int g = 0; g < groups; ++g) {
-            for (int o = g * O_per_group; o < (g + 1) * O_per_group; ++o) {
-              float r = 0.;
+              for (int g = 0; g < groups; ++g) {
+                for (int o = g * O_per_group; o < (g + 1) * O_per_group; ++o) {
+                  float r = 0.;

-              for (int wh = wh_base; wh < wH; wh += f_wgt_jump_h) {
-                for (int ww = ww_base; ww < wW; ww += f_wgt_jump_w) {
-                  int wh_flip = flip ? wH - wh - 1 : wh;
-                  int ww_flip = flip ? wW - ww - 1 : ww;
-                  int ih = ih_base + wh_flip * wt_dilation[0];
-                  int iw = iw_base + ww_flip * wt_dilation[1];
+                  for (int wh = wh_base; wh < wH; wh += f_wgt_jump_h) {
+                    for (int ww = ww_base; ww < wW; ww += f_wgt_jump_w) {
+                      int wh_flip = flip ? wH - wh - 1 : wh;
+                      int ww_flip = flip ? wW - ww - 1 : ww;
+                      int ih = ih_base + wh_flip * wt_dilation[0];
+                      int iw = iw_base + ww_flip * wt_dilation[1];

-                  if (ih >= 0 && ih < iH && iw >= 0 && iw < iW) {
-                    const T* wt_ptr_pt =
-                        wt_ptr + wh * wt_stride_H + ww * wt_stride_W;
+                      if (ih >= 0 && ih < iH && iw >= 0 && iw < iW) {
+                        const T* wt_ptr_pt =
+                            wt_ptr + wh * wt_stride_H + ww * wt_stride_W;

-                    int ih_dil = !is_idil_one ? (ih / in_dilation[0]) : ih;
-                    int iw_dil = !is_idil_one ? (iw / in_dilation[1]) : iw;
+                        int ih_dil = !is_idil_one ? (ih / in_dilation[0]) : ih;
+                        int iw_dil = !is_idil_one ? (iw / in_dilation[1]) : iw;

-                    const T* in_ptr_pt =
-                        in_ptr + ih_dil * in_stride_H + iw_dil * in_stride_W;
+                        const T* in_ptr_pt = in_ptr + ih_dil * in_stride_H +
+                            iw_dil * in_stride_W;

-                    for (int c = g * C_per_group; c < (g + 1) * C_per_group;
-                         ++c) {
-                      r += static_cast<float>(in_ptr_pt[c * in_stride_C]) *
-                          static_cast<float>(
-                               wt_ptr_pt[(c % C_per_group) * wt_stride_C]);
-                    } // c
+                        for (int c = g * C_per_group; c < (g + 1) * C_per_group;
+                             ++c) {
+                          r += static_cast<float>(in_ptr_pt[c * in_stride_C]) *
+                              static_cast<float>(
+                                   wt_ptr_pt[(c % C_per_group) * wt_stride_C]);
+                        } // c

-                  } // ih, iw check
-                } // ww
-              } // wh
+                      } // ih, iw check
+                    } // ww
+                  } // wh

-              out_ptr[0] = static_cast<T>(r);
-              out_ptr += out_stride_O;
-              wt_ptr += wt_stride_O;
-            } // o
-          } // g
-        };
+                  out_ptr[0] = static_cast<T>(r);
+                  out_ptr += out_stride_O;
+                  wt_ptr += wt_stride_O;
+                } // o
+              } // g
+            };

-    int oH_border_0 = 0;
-    int oH_border_1 =
-        is_idil_one ? ((padding[0] + wt_strides[0] - 1) / wt_strides[0]) : oH;
-    int oH_border_2 = std::max(
-        oH_border_1, (iH + padding[0] - wH * wt_dilation[0]) / wt_strides[0]);
-    int oH_border_3 = oH;
+        int oH_border_0 = 0;
+        int oH_border_1 = is_idil_one
+            ? ((padding_lo[0] + wt_strides[0] - 1) / wt_strides[0])
+            : oH;
+        int oH_border_2 = std::max(
+            oH_border_1,
+            (iH + padding_lo[0] - wH * wt_dilation[0]) / wt_strides[0]);
+        int oH_border_3 = oH;

-    int oW_border_0 = 0;
-    int oW_border_1 =
-        is_idil_one ? ((padding[1] + wt_strides[1] - 1) / wt_strides[1]) : oW;
-    int oW_border_2 = std::max(
-        oW_border_1, (iW + padding[1] - wW * wt_dilation[1]) / wt_strides[1]);
-    int oW_border_3 = oW;
+        int oW_border_0 = 0;
+        int oW_border_1 = is_idil_one
+            ? ((padding_lo[1] + wt_strides[1] - 1) / wt_strides[1])
+            : oW;
+        int oW_border_2 = std::max(
+            oW_border_1,
+            (iW + padding_lo[1] - wW * wt_dilation[1]) / wt_strides[1]);
+        int oW_border_3 = oW;

-    for (int n = 0; n < N; ++n) {
-      // Case 1: oh might put us out of bounds
-      for (int oh = oH_border_0; oh < oH_border_1; ++oh) {
-        for (int ow = 0; ow < oW; ++ow) {
-          pt_conv_all_checks(st_in_ptr, st_wt_ptr, st_out_ptr, oh, ow);
-        } // ow
-      } // oh
+        for (int n = 0; n < N; ++n) {
+          // Case 1: oh might put us out of bounds
+          for (int oh = oH_border_0; oh < oH_border_1; ++oh) {
+            for (int ow = 0; ow < oW; ++ow) {
+              pt_conv_all_checks(st_in_ptr, st_wt_ptr, st_out_ptr, oh, ow);
+            } // ow
+          } // oh

-      // Case 2: oh in bounds
-      for (int oh = oH_border_1; oh < oH_border_2; ++oh) {
-        // Case a: ow might put us out of bounds
-        for (int ow = oW_border_0; ow < oW_border_1; ++ow) {
-          pt_conv_all_checks(st_in_ptr, st_wt_ptr, st_out_ptr, oh, ow);
-        } // ow
+          // Case 2: oh in bounds
+          for (int oh = oH_border_1; oh < oH_border_2; ++oh) {
+            // Case a: ow might put us out of bounds
+            for (int ow = oW_border_0; ow < oW_border_1; ++ow) {
+              pt_conv_all_checks(st_in_ptr, st_wt_ptr, st_out_ptr, oh, ow);
+            } // ow

-        // Case b: ow in bounds
-        for (int ow = oW_border_1; ow < oW_border_2; ++ow) {
-          pt_conv_no_checks(st_in_ptr, st_wt_ptr, st_out_ptr, oh, ow);
-        } // ow
+            // Case b: ow in bounds
+            for (int ow = oW_border_1; ow < oW_border_2; ++ow) {
+              pt_conv_no_checks(st_in_ptr, st_wt_ptr, st_out_ptr, oh, ow);
+            } // ow

-        // Case c: ow might put us out of bounds
-        for (int ow = oW_border_2; ow < oW_border_3; ++ow) {
-          pt_conv_all_checks(st_in_ptr, st_wt_ptr, st_out_ptr, oh, ow);
-        } // ow
+            // Case c: ow might put us out of bounds
+            for (int ow = oW_border_2; ow < oW_border_3; ++ow) {
+              pt_conv_all_checks(st_in_ptr, st_wt_ptr, st_out_ptr, oh, ow);
+            } // ow

-      } // oh
+          } // oh

-      // Case 3: oh might put us out of bounds
-      for (int oh = oH_border_2; oh < oH_border_3; ++oh) {
-        for (int ow = 0; ow < oW; ++ow) {
-          pt_conv_all_checks(st_in_ptr, st_wt_ptr, st_out_ptr, oh, ow);
-        } // ow
-      } // oh
+          // Case 3: oh might put us out of bounds
+          for (int oh = oH_border_2; oh < oH_border_3; ++oh) {
+            for (int ow = 0; ow < oW; ++ow) {
+              pt_conv_all_checks(st_in_ptr, st_wt_ptr, st_out_ptr, oh, ow);
+            } // ow
+          } // oh

-      st_in_ptr += in_stride_N;
-      st_out_ptr += out_stride_N;
+          st_in_ptr += in_stride_N;
+          st_out_ptr += out_stride_N;

-    } // n
-  });
+        } // n
+      });
 }

 template <typename T>
@ -351,7 +359,8 @@ void slow_conv_3D(
    const array& in,
    const array& wt,
    array out,
-    const std::vector<int>& padding,
+    const std::vector<int>& padding_lo,
+    const std::vector<int>& padding_hi,
    const std::vector<int>& wt_strides,
    const std::vector<int>& wt_dilation,
    const std::vector<int>& in_dilation,
@ -400,7 +409,8 @@ void slow_conv_3D(
                    out_stride_H = out.strides()[2],
                    out_stride_W = out.strides()[3],
                    out_stride_O = out.strides()[4],
-                    padding,
+                    padding_lo,
+                    padding_hi,
                    wt_strides,
                    wt_dilation,
                    in_dilation,
@ -415,9 +425,9 @@ void slow_conv_3D(
                                 int oh,
                                 int ow) {
      out_ptr += od * out_stride_D + oh * out_stride_H + ow * out_stride_W;
-      int id_base = od * wt_strides[0] - padding[0];
-      int ih_base = oh * wt_strides[1] - padding[1];
-      int iw_base = ow * wt_strides[2] - padding[2];
+      int id_base = od * wt_strides[0] - padding_lo[0];
+      int ih_base = oh * wt_strides[1] - padding_lo[1];
+      int iw_base = ow * wt_strides[2] - padding_lo[2];

      for (int o = 0; o < O; ++o) {
        float r = 0.;
@ -478,7 +488,7 @@ void slow_conv_3D(
    std::vector<int> base_w(f_out_jump_w);

    for (int i = 0; i < f_out_jump_d; ++i) {
-      int id_loop = i * wt_strides[0] - padding[0] + init_d;
+      int id_loop = i * wt_strides[0] - padding_lo[0] + init_d;

      int wd_base = 0;
      while (wd_base < wD && id_loop % in_dilation[0] != 0) {
@ -490,7 +500,7 @@ void slow_conv_3D(
    }

    for (int i = 0; i < f_out_jump_h; ++i) {
-      int ih_loop = i * wt_strides[1] - padding[1] + init_h;
+      int ih_loop = i * wt_strides[1] - padding_lo[1] + init_h;

      int wh_base = 0;
      while (wh_base < wH && ih_loop % in_dilation[1] != 0) {
@ -502,7 +512,7 @@ void slow_conv_3D(
    }

    for (int j = 0; j < f_out_jump_w; ++j) {
-      int iw_loop = j * wt_strides[2] - padding[2] + init_w;
+      int iw_loop = j * wt_strides[2] - padding_lo[2] + init_w;

      int ww_base = 0;
      while (ww_base < wW && iw_loop % in_dilation[2] != 0) {
@ -521,9 +531,9 @@ void slow_conv_3D(
                                  int ow) {
      out_ptr += od * out_stride_D + oh * out_stride_H + ow * out_stride_W;

-      int id_base = od * wt_strides[0] - padding[0];
-      int ih_base = oh * wt_strides[1] - padding[1];
-      int iw_base = ow * wt_strides[2] - padding[2];
+      int id_base = od * wt_strides[0] - padding_lo[0];
+      int ih_base = oh * wt_strides[1] - padding_lo[1];
+      int iw_base = ow * wt_strides[2] - padding_lo[2];

      int wd_base = base_d[od % f_out_jump_d];
      int wh_base = base_h[oh % f_out_jump_h];
@ -573,24 +583,30 @@ void slow_conv_3D(
    };

    int oD_border_0 = 0;
-    int oD_border_1 =
-        is_idil_one ? ((padding[0] + wt_strides[0] - 1) / wt_strides[0]) : oD;
+    int oD_border_1 = is_idil_one
+        ? ((padding_lo[0] + wt_strides[0] - 1) / wt_strides[0])
+        : oD;
    int oD_border_2 = std::max(
-        oD_border_1, (iD + padding[0] - wD * wt_dilation[0]) / wt_strides[0]);
+        oD_border_1,
+        (iD + padding_lo[0] - wD * wt_dilation[0]) / wt_strides[0]);
    int oD_border_3 = oD;

    int oH_border_0 = 0;
-    int oH_border_1 =
-        is_idil_one ? ((padding[1] + wt_strides[1] - 1) / wt_strides[1]) : oH;
+    int oH_border_1 = is_idil_one
+        ? ((padding_lo[1] + wt_strides[1] - 1) / wt_strides[1])
+        : oH;
    int oH_border_2 = std::max(
-        oH_border_1, (iH + padding[1] - wH * wt_dilation[1]) / wt_strides[1]);
+        oH_border_1,
+        (iH + padding_lo[1] - wH * wt_dilation[1]) / wt_strides[1]);
    int oH_border_3 = oH;

    int oW_border_0 = 0;
-    int oW_border_1 =
-        is_idil_one ? ((padding[2] + wt_strides[2] - 1) / wt_strides[2]) : oW;
+    int oW_border_1 = is_idil_one
+        ? ((padding_lo[2] + wt_strides[2] - 1) / wt_strides[2])
+        : oW;
    int oW_border_2 = std::max(
-        oW_border_1, (iW + padding[2] - wW * wt_dilation[2]) / wt_strides[2]);
+        oW_border_1,
+        (iW + padding_lo[2] - wW * wt_dilation[2]) / wt_strides[2]);
    int oW_border_3 = oW;

    for (int n = 0; n < N; ++n) {
@ -658,7 +674,8 @@ void dispatch_slow_conv_1D(
    const array& in,
    const array& wt,
    array out,
-    const std::vector<int>& padding,
+    const std::vector<int>& padding_lo,
+    const std::vector<int>& padding_hi,
    const std::vector<int>& wt_strides,
    const std::vector<int>& wt_dilation,
    const std::vector<int>& in_dilation,
@ -669,7 +686,8 @@ void dispatch_slow_conv_1D(
        in,
        wt,
        out,
-        padding,
+        padding_lo,
+        padding_hi,
        wt_strides,
        wt_dilation,
        in_dilation,
@ -680,7 +698,8 @@ void dispatch_slow_conv_1D(
        in,
        wt,
        out,
-        padding,
+        padding_lo,
+        padding_hi,
        wt_strides,
        wt_dilation,
        in_dilation,
@ -691,7 +710,8 @@ void dispatch_slow_conv_1D(
        in,
        wt,
        out,
-        padding,
+        padding_lo,
+        padding_hi,
        wt_strides,
        wt_dilation,
        in_dilation,
@ -707,7 +727,8 @@ void dispatch_slow_conv_2D(
    const array& in,
    const array& wt,
    array out,
-    const std::vector<int>& padding,
+    const std::vector<int>& padding_lo,
+    const std::vector<int>& padding_hi,
    const std::vector<int>& wt_strides,
    const std::vector<int>& wt_dilation,
    const std::vector<int>& in_dilation,
@ -718,7 +739,8 @@ void dispatch_slow_conv_2D(
        in,
        wt,
        out,
-        padding,
+        padding_lo,
+        padding_hi,
        wt_strides,
        wt_dilation,
        in_dilation,
@ -729,7 +751,8 @@ void dispatch_slow_conv_2D(
        in,
        wt,
        out,
-        padding,
+        padding_lo,
+        padding_hi,
        wt_strides,
        wt_dilation,
        in_dilation,
@ -740,7 +763,8 @@ void dispatch_slow_conv_2D(
        in,
        wt,
        out,
-        padding,
+        padding_lo,
+        padding_hi,
        wt_strides,
        wt_dilation,
        in_dilation,
@ -756,7 +780,8 @@ void dispatch_slow_conv_3D(
    const array& in,
    const array& wt,
    array out,
-    const std::vector<int>& padding,
+    const std::vector<int>& padding_lo,
+    const std::vector<int>& padding_hi,
    const std::vector<int>& wt_strides,
    const std::vector<int>& wt_dilation,
    const std::vector<int>& in_dilation,
@ -767,7 +792,8 @@ void dispatch_slow_conv_3D(
        in,
        wt,
        out,
-        padding,
+        padding_lo,
+        padding_hi,
        wt_strides,
        wt_dilation,
        in_dilation,
@ -778,7 +804,8 @@ void dispatch_slow_conv_3D(
        in,
        wt,
        out,
-        padding,
+        padding_lo,
+        padding_hi,
        wt_strides,
        wt_dilation,
        in_dilation,
@ -789,7 +816,8 @@ void dispatch_slow_conv_3D(
        in,
        wt,
        out,
-        padding,
+        padding_lo,
+        padding_hi,
        wt_strides,
        wt_dilation,
        in_dilation,
@ -829,7 +857,8 @@ void explicit_gemm_conv_1D_cpu(
    const array& in,
    const array& wt,
    array out,
-    const std::vector<int>& padding,
+    const std::vector<int>& padding_lo,
+    const std::vector<int>& padding_hi,
    const std::vector<int>& wt_strides,
    const std::vector<int>& wt_dilation,
    Stream stream) {
@ -848,7 +877,7 @@ void explicit_gemm_conv_1D_cpu(
  auto& encoder = cpu::get_command_encoder(stream);

  // Pad input
-  Shape padded_shape = {N, iH + 2 * padding[0], C};
+  Shape padded_shape = {N, iH + padding_lo[0] + padding_hi[0], C};
  array in_padded(padded_shape, conv_dtype, nullptr, {});

  // Fill with zeros
@ -857,7 +886,7 @@ void explicit_gemm_conv_1D_cpu(
  copy(temps.back(), in_padded, CopyType::Scalar, stream);

  // Pick input slice from padded
-  size_t data_offset = padding[0] * in_padded.strides()[1];
+  size_t data_offset = padding_lo[0] * in_padded.strides()[1];
  array in_padded_slice(in.shape(), in_padded.dtype(), nullptr, {});
  in_padded_slice.copy_shared_buffer(
      in_padded,
@ -971,7 +1000,8 @@ void explicit_gemm_conv_2D_cpu(
    const array& in,
    const array& wt,
    array out,
-    const std::vector<int>& padding,
+    const std::vector<int>& padding_lo,
+    const std::vector<int>& padding_hi,
    const std::vector<int>& wt_strides,
    const std::vector<int>& wt_dilation,
    Stream stream) {
@ -989,7 +1019,11 @@ void explicit_gemm_conv_2D_cpu(
  auto& encoder = cpu::get_command_encoder(stream);

  // Pad input
-  Shape padded_shape = {N, iH + 2 * padding[0], iW + 2 * padding[1], C};
+  Shape padded_shape = {
+      N,
+      iH + padding_lo[0] + padding_hi[0],
+      iW + padding_lo[1] + padding_hi[1],
+      C};
  array in_padded(padded_shape, conv_dtype, nullptr, {});

  // Fill with zeros
@ -998,8 +1032,8 @@ void explicit_gemm_conv_2D_cpu(
  copy(temps.back(), in_padded, CopyType::Scalar, stream);

  // Pick input slice from padded
-  size_t data_offset =
-      padding[0] * in_padded.strides()[1] + padding[1] * in_padded.strides()[2];
+  size_t data_offset = padding_lo[0] * in_padded.strides()[1] +
+      padding_lo[1] * in_padded.strides()[2];
  array in_padded_slice(in.shape(), in_padded.dtype(), nullptr, {});
  in_padded_slice.copy_shared_buffer(
      in_padded,
@ -1091,7 +1125,8 @@ void explicit_gemm_conv_ND_cpu(
    const array& in,
    const array& wt,
    array out,
-    const std::vector<int>& padding,
+    const std::vector<int>& padding_lo,
+    const std::vector<int>& padding_hi,
    const std::vector<int>& wt_strides,
    const std::vector<int>& wt_dilation,
    const bool flip,
@ -1114,7 +1149,7 @@ void explicit_gemm_conv_ND_cpu(
  Shape padded_shape(in.shape().size());
  padded_shape.front() = N;
  for (size_t i = 0; i < iDim.size(); i++) {
-    padded_shape[i + 1] = iDim[i] + 2 * padding[i];
+    padded_shape[i + 1] = iDim[i] + padding_lo[i] + padding_hi[i];
  }
  padded_shape.back() = C;
  array in_padded(padded_shape, conv_dtype, nullptr, {});
@ -1125,9 +1160,10 @@ void explicit_gemm_conv_ND_cpu(

  // Pick input slice from padded
  size_t data_offset = 0;
-  for (size_t i = 0; i < padding.size(); i++) {
-    data_offset += padding[i] * in_padded.strides()[i + 1];
+  for (size_t i = 0; i < padding_lo.size(); i++) {
+    data_offset += padding_lo[i] * in_padded.strides()[i + 1];
  }
+
  array in_padded_slice(in.shape(), in_padded.dtype(), nullptr, {});
  in_padded_slice.copy_shared_buffer(
      in_padded,
@ -1261,7 +1297,8 @@ void conv_1D_cpu(
    const array& in,
    const array& wt,
    array out,
-    const std::vector<int>& padding,
+    const std::vector<int>& padding_lo,
+    const std::vector<int>& padding_hi,
    const std::vector<int>& wt_strides,
    const std::vector<int>& wt_dilation,
    const std::vector<int>& in_dilation,
@ -1270,22 +1307,40 @@ void conv_1D_cpu(
  const int groups = in.shape().back() / wt.shape().back();
  if (wt_dilation[0] == 1 && in_dilation[0] == 1 && !flip) {
    return explicit_gemm_conv_1D_cpu(
-        in, wt, out, padding, wt_strides, wt_dilation, stream);
+        in, wt, out, padding_lo, padding_hi, wt_strides, wt_dilation, stream);
  }
  if (wt_dilation[0] == 1 && in_dilation[0] == 1 && groups == 1) {
    return explicit_gemm_conv_ND_cpu(
-        in, wt, out, padding, wt_strides, wt_dilation, flip, stream);
+        in,
+        wt,
+        out,
+        padding_lo,
+        padding_hi,
+        wt_strides,
+        wt_dilation,
+        flip,
+        stream);
  }

  return dispatch_slow_conv_1D(
-      in, wt, out, padding, wt_strides, wt_dilation, in_dilation, flip, stream);
+      in,
+      wt,
+      out,
+      padding_lo,
+      padding_hi,
+      wt_strides,
+      wt_dilation,
+      in_dilation,
+      flip,
+      stream);
 }

 void conv_2D_cpu(
    const array& in,
    const array& wt,
    array out,
-    const std::vector<int>& padding,
+    const std::vector<int>& padding_lo,
+    const std::vector<int>& padding_hi,
    const std::vector<int>& wt_strides,
    const std::vector<int>& wt_dilation,
    const std::vector<int>& in_dilation,
@ -1295,18 +1350,35 @@ void conv_2D_cpu(
  if (wt_dilation[0] == 1 && wt_dilation[1] == 1 && in_dilation[0] == 1 &&
      in_dilation[1] == 1 && groups == 1) {
    return explicit_gemm_conv_ND_cpu(
-        in, wt, out, padding, wt_strides, wt_dilation, flip, stream);
+        in,
+        wt,
+        out,
+        padding_lo,
+        padding_hi,
+        wt_strides,
+        wt_dilation,
+        flip,
+        stream);
  }
-
  return dispatch_slow_conv_2D(
-      in, wt, out, padding, wt_strides, wt_dilation, in_dilation, flip, stream);
+      in,
+      wt,
+      out,
+      padding_lo,
+      padding_hi,
+      wt_strides,
+      wt_dilation,
+      in_dilation,
+      flip,
+      stream);
 }

 void conv_3D_cpu(
    const array& in,
    const array& wt,
    array out,
-    const std::vector<int>& padding,
+    const std::vector<int>& padding_lo,
+    const std::vector<int>& padding_hi,
    const std::vector<int>& wt_strides,
    const std::vector<int>& wt_dilation,
    const std::vector<int>& in_dilation,
@ -1317,11 +1389,28 @@ void conv_3D_cpu(
      in_dilation[0] == 1 && in_dilation[1] == 1 && in_dilation[2] == 1 &&
      groups == 1) {
    return explicit_gemm_conv_ND_cpu(
-        in, wt, out, padding, wt_strides, wt_dilation, flip, stream);
+        in,
+        wt,
+        out,
+        padding_lo,
+        padding_hi,
+        wt_strides,
+        wt_dilation,
+        flip,
+        stream);
  }

  return dispatch_slow_conv_3D(
-      in, wt, out, padding, wt_strides, wt_dilation, in_dilation, flip, stream);
+      in,
+      wt,
+      out,
+      padding_lo,
+      padding_hi,
+      wt_strides,
+      wt_dilation,
+      in_dilation,
+      flip,
+      stream);
 }

 } // namespace
@ -1338,7 +1427,8 @@ void Convolution::eval_cpu(const std::vector<array>& inputs, array& out) {
        in,
        wt,
        out,
-        padding_,
+        padding_lo_,
+        padding_hi_,
        kernel_strides_,
        kernel_dilation_,
        input_dilation_,
@ -1351,7 +1441,8 @@ void Convolution::eval_cpu(const std::vector<array>& inputs, array& out) {
        in,
        wt,
        out,
-        padding_,
+        padding_lo_,
+        padding_hi_,
        kernel_strides_,
        kernel_dilation_,
        input_dilation_,
@ -1364,7 +1455,8 @@ void Convolution::eval_cpu(const std::vector<array>& inputs, array& out) {
        in,
        wt,
        out,
-        padding_,
+        padding_lo_,
+        padding_hi_,
        kernel_strides_,
        kernel_dilation_,
        input_dilation_,
--- a/mlx/backend/cpu/distributed.cpp
+++ b/mlx/backend/cpu/distributed.cpp
@ -46,8 +46,15 @@ void AllReduce::eval_cpu(
    case Sum:
      distributed::detail::all_sum(group(), in, outputs[0], stream());
      break;
+    case Max:
+      distributed::detail::all_max(group(), in, outputs[0], stream());
+      break;
+    case Min:
+      distributed::detail::all_min(group(), in, outputs[0], stream());
+      break;
    default:
-      throw std::runtime_error("Only all reduce sum is supported for now");
+      throw std::runtime_error(
+          "Only all reduce sum, min and max are supported for now");
  }
 }

--- a/mlx/backend/cpu/eig.cpp
+++ b/mlx/backend/cpu/eig.cpp
@ -0,0 +1,174 @@
+// Copyright © 2025 Apple Inc.
+
+#include "mlx/allocator.h"
+#include "mlx/array.h"
+#include "mlx/backend/cpu/copy.h"
+#include "mlx/backend/cpu/encoder.h"
+#include "mlx/backend/cpu/lapack.h"
+#include "mlx/linalg.h"
+#include "mlx/primitives.h"
+
+namespace mlx::core {
+
+namespace {
+
+// Computes the eigenvalues (and optionally the eigenvectors) of every N x N
+// matrix stored in `a` using LAPACK geev, writing complex results into
+// `values` and `vectors`.
+//
+// - a: input matrices. The buffer is passed to geev as its matrix argument
+//   (which LAPACK overwrites) and is registered as an encoder temporary.
+// - vectors: eigenvector output. Its data is only written when
+//   `compute_eigenvectors` is true, but its shape and size always define N
+//   and the number of matrices in the batch.
+// - values: eigenvalue output, N complex entries per matrix.
+//
+// The dispatched task throws std::runtime_error when geev reports a non-zero
+// `info`. The check happens before any result of that call is consumed.
+template <typename T>
+void eig_impl(
+    array& a,
+    array& vectors,
+    array& values,
+    bool compute_eigenvectors,
+    Stream stream) {
+  using OT = std::complex<T>;
+  auto a_ptr = a.data<T>();
+  auto eig_ptr = values.data<OT>();
+
+  auto& encoder = cpu::get_command_encoder(stream);
+  encoder.set_input_array(a);
+  encoder.set_output_array(values);
+  OT* vec_ptr = nullptr;
+  if (compute_eigenvectors) {
+    encoder.set_output_array(vectors);
+    vec_ptr = vectors.data<OT>();
+  }
+  encoder.dispatch([a_ptr,
+                    vec_ptr,
+                    eig_ptr,
+                    compute_eigenvectors,
+                    N = vectors.shape(-1),
+                    size = vectors.size()]() mutable {
+    // Only the left eigenvectors are requested from geev.
+    // NOTE(review): presumably because the row-major input looks transposed to
+    // column-major LAPACK - confirm.
+    char jobr = 'N';
+    char jobl = compute_eigenvectors ? 'V' : 'N';
+    int n_vecs_r = 1;
+    int n_vecs_l = compute_eigenvectors ? N : 1;
+    int lwork = -1;
+    int info;
+    {
+      // Work query: lwork == -1 asks geev for the optimal workspace size,
+      // which it returns in the first element of the work array.
+      T work;
+      geev<T>(
+          &jobl,
+          &jobr,
+          &N,
+          nullptr,
+          &N,
+          nullptr,
+          nullptr,
+          nullptr,
+          &n_vecs_l,
+          nullptr,
+          &n_vecs_r,
+          &work,
+          &lwork,
+          &info);
+      lwork = static_cast<int>(work);
+    }
+
+    // Scratch buffers reused for every matrix in the batch. geev returns the
+    // real and imaginary eigenvalue parts in two separate real arrays, stored
+    // here back to back in eig_tmp.
+    auto eig_tmp_data = array::Data{allocator::malloc(sizeof(T) * N * 2)};
+    auto vec_tmp_data =
+        array::Data{allocator::malloc(vec_ptr ? sizeof(T) * N * N * 2 : 0)};
+    auto eig_tmp = static_cast<T*>(eig_tmp_data.buffer.raw_ptr());
+    auto vec_tmp = static_cast<T*>(vec_tmp_data.buffer.raw_ptr());
+    auto work_buf = array::Data{allocator::malloc(sizeof(T) * lwork)};
+    for (size_t batch = 0; batch < size / (N * N); ++batch) {
+      geev<T>(
+          &jobl,
+          &jobr,
+          &N,
+          a_ptr,
+          &N,
+          eig_tmp,
+          eig_tmp + N,
+          vec_tmp,
+          &n_vecs_l,
+          nullptr,
+          &n_vecs_r,
+          static_cast<T*>(work_buf.buffer.raw_ptr()),
+          &lwork,
+          &info);
+      // Fail before touching the outputs: after an unsuccessful call the
+      // scratch buffers are not fully defined, and the conjugate-pair logic
+      // below relies on complex eigenvalues arriving in consecutive pairs.
+      if (info != 0) {
+        std::stringstream msg;
+        msg << "[Eig::eval_cpu] Eigenvalue decomposition failed with error code "
+            << info;
+        throw std::runtime_error(msg.str());
+      }
+      for (int e = 0; e < N; ++e) {
+        eig_ptr[e] = {eig_tmp[e], eig_tmp[N + e]};
+      }
+      if (vec_ptr) {
+        for (int v = 0; v < N; ++v) {
+          if (eig_ptr[v].imag() != 0) {
+            // This vector and the next are a pair: geev stores the shared
+            // real part in vector v and the imaginary part in vector v + 1.
+            for (int j = 0; j < N; ++j) {
+              vec_ptr[v * N + j] = {
+                  vec_tmp[v * N + j], -vec_tmp[(v + 1) * N + j]};
+              vec_ptr[(v + 1) * N + j] = {
+                  vec_tmp[v * N + j], vec_tmp[(v + 1) * N + j]};
+            }
+            // Skip the second member of the pair, it is already written.
+            v += 1;
+          } else {
+            for (int j = 0; j < N; ++j) {
+              vec_ptr[v * N + j] = {vec_tmp[v * N + j], 0};
+            }
+          }
+        }
+        vec_ptr += N * N;
+      }
+      a_ptr += N * N;
+      eig_ptr += N;
+    }
+  });
+  encoder.add_temporary(a);
+}
+
+} // namespace
+
+// CPU evaluation of the Eig primitive.
+//
+// inputs[0] is the matrix (or batch of matrices) to decompose. outputs[0]
+// receives the eigenvalues and, when compute_eigenvectors_ is set, outputs[1]
+// receives the eigenvectors. Throws std::runtime_error for any input dtype
+// other than float32.
+void Eig::eval_cpu(
+    const std::vector<array>& inputs,
+    std::vector<array>& outputs) {
+  const auto& a = inputs[0];
+  auto& values = outputs[0];
+
+  // Without eigenvectors a data-less placeholder with the input's shape is
+  // used so that eig_impl can still read N and the batch size from it.
+  auto vectors = compute_eigenvectors_
+      ? outputs[1]
+      : array(a.shape(), complex64, nullptr, {});
+
+  // Work on a copy of the input, eig_impl hands that buffer to LAPACK.
+  const auto copy_type =
+      a.flags().row_contiguous ? CopyType::Vector : CopyType::General;
+  auto a_copy = array(a.shape(), a.dtype(), nullptr, {});
+  copy(a, a_copy, copy_type, stream());
+
+  values.set_data(allocator::malloc(values.nbytes()));
+
+  if (compute_eigenvectors_) {
+    // Set the strides and flags so the eigenvectors
+    // are in the columns of the output
+    auto vec_flags = vectors.flags();
+    auto vec_strides = vectors.strides();
+    const auto rank = a.ndim();
+    std::swap(vec_strides[rank - 1], vec_strides[rank - 2]);
+
+    if (a.size() > 1) {
+      vec_flags.row_contiguous = false;
+      // Only a single (non-batched) matrix ends up column contiguous.
+      vec_flags.col_contiguous = !(rank > 2);
+    }
+    vectors.set_data(
+        allocator::malloc(vectors.nbytes()),
+        vectors.size(),
+        vec_strides,
+        vec_flags);
+  }
+
+  switch (a.dtype()) {
+    case float32:
+      eig_impl<float>(a_copy, vectors, values, compute_eigenvectors_, stream());
+      break;
+    default:
+      throw std::runtime_error("[Eig::eval_cpu] only supports float32.");
+  }
+}
+
+} // namespace mlx::core
--- a/mlx/backend/cpu/eigh.cpp
+++ b/mlx/backend/cpu/eigh.cpp
@ -12,6 +12,133 @@ namespace mlx::core {

 namespace {

+// Primary template: left empty, the usable definitions are the
+// specializations.
+template <typename T, class Enable = void>
+struct EighWork {};
+
+// Workspace and call wrapper around LAPACK syevd for real floating point T.
+//
+// The constructor performs the workspace query and allocates the work
+// buffers once. run() can then be called repeatedly for matrices of the same
+// size. `info` holds the status written by the most recent LAPACK call.
+template <typename T>
+struct EighWork<T, std::enable_if_t<std::is_floating_point_v<T>>> {
+  // Type of the eigenvalues.
+  using R = T;
+
+  char jobz;
+  char uplo;
+  int N;
+  int lwork;
+  int liwork;
+  int info;
+  // buffers[0]: T work array, buffers[1]: int work array.
+  std::vector<array::Data> buffers;
+
+  EighWork(char jobz_, char uplo_, int N_)
+      : jobz(jobz_), uplo(uplo_), N(N_), lwork(-1), liwork(-1) {
+    // With lwork == liwork == -1 syevd only reports the optimal sizes.
+    T optimal_lwork;
+    int optimal_liwork;
+    syevd<T>(
+        &jobz,
+        &uplo,
+        &N,
+        nullptr,
+        &N,
+        nullptr,
+        &optimal_lwork,
+        &lwork,
+        &optimal_liwork,
+        &liwork,
+        &info);
+    lwork = static_cast<int>(optimal_lwork);
+    liwork = optimal_liwork;
+    buffers.emplace_back(allocator::malloc(sizeof(T) * lwork));
+    buffers.emplace_back(allocator::malloc(sizeof(int) * liwork));
+  }
+
+  // Runs syevd on one N x N matrix at `vectors`, writing the eigenvalues to
+  // `values`.
+  void run(T* vectors, T* values) {
+    T* work_ptr = static_cast<T*>(buffers[0].buffer.raw_ptr());
+    int* iwork_ptr = static_cast<int*>(buffers[1].buffer.raw_ptr());
+    syevd<T>(
+        &jobz,
+        &uplo,
+        &N,
+        vectors,
+        &N,
+        values,
+        work_ptr,
+        &lwork,
+        iwork_ptr,
+        &liwork,
+        &info);
+  }
+};
+
+// Workspace and call wrapper around LAPACK heevd for complex64 matrices.
+//
+// Mirrors the real-valued EighWork, with the extra real work array that
+// heevd takes. Eigenvalues are real (type R). `info` holds the status written
+// by the most recent LAPACK call.
+template <>
+struct EighWork<std::complex<float>> {
+  using T = std::complex<float>;
+  using R = float;
+
+  char jobz;
+  char uplo;
+  int N;
+  int lwork;
+  int lrwork;
+  int liwork;
+  int info;
+  // buffers[0]: complex work, buffers[1]: real work, buffers[2]: int work.
+  std::vector<array::Data> buffers;
+
+  EighWork(char jobz_, char uplo_, int N_)
+      : jobz(jobz_), uplo(uplo_), N(N_), lwork(-1), lrwork(-1), liwork(-1) {
+    // With all work sizes set to -1 heevd only reports the optimal sizes.
+    T optimal_lwork;
+    R optimal_lrwork;
+    int optimal_liwork;
+    heevd<T>(
+        &jobz,
+        &uplo,
+        &N,
+        nullptr,
+        &N,
+        nullptr,
+        &optimal_lwork,
+        &lwork,
+        &optimal_lrwork,
+        &lrwork,
+        &optimal_liwork,
+        &liwork,
+        &info);
+    lwork = static_cast<int>(optimal_lwork.real());
+    lrwork = static_cast<int>(optimal_lrwork);
+    liwork = optimal_liwork;
+    buffers.emplace_back(allocator::malloc(sizeof(T) * lwork));
+    buffers.emplace_back(allocator::malloc(sizeof(R) * lrwork));
+    buffers.emplace_back(allocator::malloc(sizeof(int) * liwork));
+  }
+
+  // Runs heevd on one N x N matrix at `vectors`, writing the eigenvalues to
+  // `values`.
+  void run(T* vectors, R* values) {
+    T* work_ptr = static_cast<T*>(buffers[0].buffer.raw_ptr());
+    R* rwork_ptr = static_cast<R*>(buffers[1].buffer.raw_ptr());
+    int* iwork_ptr = static_cast<int*>(buffers[2].buffer.raw_ptr());
+    heevd<T>(
+        &jobz,
+        &uplo,
+        &N,
+        vectors,
+        &N,
+        values,
+        work_ptr,
+        &lwork,
+        rwork_ptr,
+        &lrwork,
+        iwork_ptr,
+        &liwork,
+        &info);
+    if (jobz != 'V') {
+      return;
+    }
+    // We have pre-transposed the vectors but we also must conjugate them
+    // when they are complex.
+    //
+    // This is a plain scalar pass over all N * N entries; it is cheap
+    // compared to heevd so vectorizing it is not worthwhile.
+    for (int row = 0; row < N; ++row) {
+      T* entry = vectors + static_cast<size_t>(row) * N;
+      for (T* row_end = entry + N; entry != row_end; ++entry) {
+        *entry = std::conj(*entry);
+      }
+    }
+  }
+};
+
 template <typename T>
 void eigh_impl(
    array& vectors,
@ -19,8 +146,10 @@ void eigh_impl(
    const std::string& uplo,
    bool compute_eigenvectors,
    Stream stream) {
+  using R = typename EighWork<T>::R;
+
  auto vec_ptr = vectors.data<T>();
-  auto eig_ptr = values.data<T>();
+  auto eig_ptr = values.data<R>();
  char jobz = compute_eigenvectors ? 'V' : 'N';

  auto& encoder = cpu::get_command_encoder(stream);
@ -33,49 +162,17 @@ void eigh_impl(
                    N = vectors.shape(-1),
                    size = vectors.size()]() mutable {
    // Work query
-    int lwork = -1;
-    int liwork = -1;
-    int info;
-    {
-      T work;
-      int iwork;
-      syevd<T>(
-          &jobz,
-          &uplo,
-          &N,
-          nullptr,
-          &N,
-          nullptr,
-          &work,
-          &lwork,
-          &iwork,
-          &liwork,
-          &info);
-      lwork = static_cast<int>(work);
-      liwork = iwork;
-    }
+    EighWork<T> work(jobz, uplo, N);

-    auto work_buf = array::Data{allocator::malloc(sizeof(T) * lwork)};
-    auto iwork_buf = array::Data{allocator::malloc(sizeof(int) * liwork)};
+    // Work loop
    for (size_t i = 0; i < size / (N * N); ++i) {
-      syevd<T>(
-          &jobz,
-          &uplo,
-          &N,
-          vec_ptr,
-          &N,
-          eig_ptr,
-          static_cast<T*>(work_buf.buffer.raw_ptr()),
-          &lwork,
-          static_cast<int*>(iwork_buf.buffer.raw_ptr()),
-          &liwork,
-          &info);
+      work.run(vec_ptr, eig_ptr);
      vec_ptr += N * N;
      eig_ptr += N;
-      if (info != 0) {
+      if (work.info != 0) {
        std::stringstream msg;
        msg << "[Eigh::eval_cpu] Eigenvalue decomposition failed with error code "
-            << info;
+            << work.info;
        throw std::runtime_error(msg.str());
      }
    }
@ -131,6 +228,10 @@ void Eigh::eval_cpu(
      eigh_impl<double>(
          vectors, values, uplo_, compute_eigenvectors_, stream());
      break;
+    case complex64:
+      eigh_impl<std::complex<float>>(
+          vectors, values, uplo_, compute_eigenvectors_, stream());
+      break;
    default:
      throw std::runtime_error(
          "[Eigh::eval_cpu] only supports float32 or float64.");
--- a/mlx/backend/cpu/gemms/no_bf16.cpp
+++ b/mlx/backend/cpu/gemms/no_bf16.cpp
@ -1,27 +0,0 @@
-// Copyright © 2025 Apple Inc.
-
-#include "mlx/backend/cpu/gemm.h"
-
-namespace mlx::core {
-
-template <>
-void matmul<bfloat16_t>(
-    const bfloat16_t*,
-    const bfloat16_t*,
-    bfloat16_t*,
-    bool,
-    bool,
-    size_t,
-    size_t,
-    size_t,
-    float,
-    float,
-    size_t,
-    const Shape&,
-    const Strides&,
-    const Shape&,
-    const Strides&) {
-  throw std::runtime_error("[Matmul::eval_cpu] bfloat16 not supported.");
-}
-
-} // namespace mlx::core
--- a/mlx/backend/cpu/gemms/no_fp16.cpp
+++ b/mlx/backend/cpu/gemms/no_fp16.cpp
@ -1,27 +0,0 @@
-// Copyright © 2025 Apple Inc.
-
-#include "mlx/backend/cpu/gemm.h"
-
-namespace mlx::core {
-
-template <>
-void matmul<float16_t>(
-    const float16_t*,
-    const float16_t*,
-    float16_t*,
-    bool,
-    bool,
-    size_t,
-    size_t,
-    size_t,
-    float,
-    float,
-    size_t,
-    const Shape&,
-    const Strides&,
-    const Shape&,
-    const Strides&) {
-  throw std::runtime_error("[Matmul::eval_cpu] float16 not supported.");
-}
-
-} // namespace mlx::core
--- a/mlx/backend/cpu/gemms/simd_bf16.cpp
+++ b/mlx/backend/cpu/gemms/simd_bf16.cpp
@ -0,0 +1,45 @@
+// Copyright © 2025 Apple Inc.
+
+#include "mlx/backend/common/utils.h"
+#include "mlx/backend/cpu/gemm.h"
+#include "mlx/backend/cpu/gemms/simd_gemm.h"
+
+namespace mlx::core {
+
+// bfloat16 batched matrix multiply implemented with simd_gemm, accumulating
+// in float.
+//
+// M, N and K are taken from the last two dimensions of a_shape / b_shape
+// (both indexed with a_shape's rank). For every batch element the input
+// offsets come from elem_to_loc on the given shapes/strides, while the
+// output is written densely, M * N elements per batch.
+//
+// lda, ldb and ldc are accepted for interface compatibility but are not
+// used: simd_gemm is called without leading dimensions.
+template <>
+void matmul<bfloat16_t>(
+    const bfloat16_t* a,
+    const bfloat16_t* b,
+    bfloat16_t* out,
+    bool a_transposed,
+    bool b_transposed,
+    size_t lda,
+    size_t ldb,
+    size_t ldc,
+    float alpha,
+    float beta,
+    size_t batch_size,
+    const Shape& a_shape,
+    const Strides& a_strides,
+    const Shape& b_shape,
+    const Strides& b_strides) {
+  auto ndim = a_shape.size();
+  size_t M = a_shape[ndim - 2];
+  size_t N = b_shape[ndim - 1];
+  size_t K = a_shape[ndim - 1];
+  // size_t index: matches batch_size and avoids a signed/unsigned comparison.
+  for (size_t i = 0; i < batch_size; ++i) {
+    simd_gemm<bfloat16_t, float>(
+        a + elem_to_loc(M * K * i, a_shape, a_strides),
+        b + elem_to_loc(K * N * i, b_shape, b_strides),
+        out + M * N * i,
+        a_transposed,
+        b_transposed,
+        M,
+        N,
+        K,
+        alpha,
+        beta);
+  }
+}
+
+} // namespace mlx::core
--- a/mlx/backend/cpu/gemms/simd_fp16.cpp
+++ b/mlx/backend/cpu/gemms/simd_fp16.cpp
@ -0,0 +1,45 @@
+// Copyright © 2025 Apple Inc.
+
+#include "mlx/backend/common/utils.h"
+#include "mlx/backend/cpu/gemm.h"
+#include "mlx/backend/cpu/gemms/simd_gemm.h"
+
+namespace mlx::core {
+
+// float16 batched matrix multiply implemented with simd_gemm, accumulating
+// in float.
+//
+// M, N and K are taken from the last two dimensions of a_shape / b_shape
+// (both indexed with a_shape's rank). For every batch element the input
+// offsets come from elem_to_loc on the given shapes/strides, while the
+// output is written densely, M * N elements per batch.
+//
+// lda, ldb and ldc are accepted for interface compatibility but are not
+// used: simd_gemm is called without leading dimensions.
+template <>
+void matmul<float16_t>(
+    const float16_t* a,
+    const float16_t* b,
+    float16_t* out,
+    bool a_transposed,
+    bool b_transposed,
+    size_t lda,
+    size_t ldb,
+    size_t ldc,
+    float alpha,
+    float beta,
+    size_t batch_size,
+    const Shape& a_shape,
+    const Strides& a_strides,
+    const Shape& b_shape,
+    const Strides& b_strides) {
+  auto ndim = a_shape.size();
+  size_t M = a_shape[ndim - 2];
+  size_t N = b_shape[ndim - 1];
+  size_t K = a_shape[ndim - 1];
+  // size_t index: matches batch_size and avoids a signed/unsigned comparison.
+  for (size_t i = 0; i < batch_size; ++i) {
+    simd_gemm<float16_t, float>(
+        a + elem_to_loc(M * K * i, a_shape, a_strides),
+        b + elem_to_loc(K * N * i, b_shape, b_strides),
+        out + M * N * i,
+        a_transposed,
+        b_transposed,
+        M,
+        N,
+        K,
+        alpha,
+        beta);
+  }
+}
+
+} // namespace mlx::core
--- a/mlx/backend/cpu/gemms/simd_gemm.h
+++ b/mlx/backend/cpu/gemms/simd_gemm.h
@ -0,0 +1,139 @@
+// Copyright © 2025 Apple Inc.
+#pragma once
+
+#include "mlx/backend/cpu/simd/simd.h"
+
+namespace mlx::core {
+
+// Returns a / b rounded up, for non-negative a and positive b.
+inline int ceildiv(int a, int b) {
+  const int biased = a + b - 1;
+  return biased / b;
+}
+
+// Copies tile (i, j) of the row-major M x N matrix `in` into the
+// block_size x block_size scratch tile `out`, converting T to AccT.
+//
+// Element (ii, jj) of the tile lands at out[ii * block_size + jj], or at
+// out[jj * block_size + ii] when `transpose` is true. Entries whose source
+// row or column falls outside the matrix are left untouched.
+template <int block_size, typename T, typename AccT>
+void load_block(
+    const T* in,
+    AccT* out,
+    int M,
+    int N,
+    int i,
+    int j,
+    bool transpose) {
+  const int row0 = i * block_size;
+  const int col0 = j * block_size;
+  // Destination strides; swapping them is all a transposed load needs.
+  const int row_stride = transpose ? 1 : block_size;
+  const int col_stride = transpose ? block_size : 1;
+  for (int ii = 0; ii < block_size && row0 + ii < M; ++ii) {
+    // 64-bit offset: row * N exceeds the int range for large matrices.
+    const long long src = static_cast<long long>(row0 + ii) * N + col0;
+    for (int jj = 0; jj < block_size && col0 + jj < N; ++jj) {
+      out[ii * row_stride + jj * col_stride] = in[src + jj];
+    }
+  }
+}
+
+// Blocked matrix multiply: c = alpha * op(a) * op(b) + beta * c, with
+// products accumulated in AccT and the result converted back to T.
+//
+// - a: row-major M x K, or K x M when a_trans is true.
+// - b: row-major K x N, or N x K when b_trans is true.
+// - c: row-major M x N. It is only read when beta != 0.
+//
+// The output is processed in 16 x 16 tiles. For each k block both input
+// tiles are loaded so that the k index is contiguous, which lets each
+// output element be computed as a SIMD dot product. A final partial k block
+// is handled with SIMD for its leading multiple of simd_size and with scalar
+// code for the remainder.
+template <typename T, typename AccT>
+void simd_gemm(
+    const T* a,
+    const T* b,
+    T* c,
+    bool a_trans,
+    bool b_trans,
+    int M,
+    int N,
+    int K,
+    float alpha,
+    float beta) {
+  constexpr int block_size = 16;
+  constexpr int simd_size = simd::max_size<AccT>;
+  static_assert(
+      (block_size % simd_size) == 0,
+      "Block size must be divisible by SIMD size");
+
+  int last_k_block_size = K - block_size * (K / block_size);
+  int last_k_simd_block = (last_k_block_size / simd_size) * simd_size;
+  for (int i = 0; i < ceildiv(M, block_size); i++) {
+    for (int j = 0; j < ceildiv(N, block_size); j++) {
+      AccT c_block[block_size * block_size] = {0.0};
+      AccT a_block[block_size * block_size];
+      AccT b_block[block_size * block_size];
+
+      // Loads the a and b tiles for k block `kb`, laid out as
+      // a_block[row * block_size + k] and b_block[col * block_size + k].
+      auto load_tiles = [&](int kb) {
+        if (a_trans) {
+          load_block<block_size>(a, a_block, K, M, kb, i, true);
+        } else {
+          load_block<block_size>(a, a_block, M, K, i, kb, false);
+        }
+        if (b_trans) {
+          load_block<block_size>(b, b_block, N, K, j, kb, false);
+        } else {
+          load_block<block_size>(b, b_block, K, N, kb, j, true);
+        }
+      };
+
+      int k = 0;
+      for (; k < K / block_size; k++) {
+        load_tiles(k);
+
+        // Multiply and accumulate
+        for (int ii = 0; ii < block_size && i * block_size + ii < M; ++ii) {
+          for (int jj = 0; jj < block_size && j * block_size + jj < N; ++jj) {
+            for (int kk = 0; kk < block_size; kk += simd_size) {
+              auto av =
+                  simd::load<AccT, simd_size>(a_block + ii * block_size + kk);
+              auto bv =
+                  simd::load<AccT, simd_size>(b_block + jj * block_size + kk);
+              c_block[ii * block_size + jj] += simd::sum(av * bv);
+            }
+          }
+        }
+      }
+      if (last_k_block_size) {
+        // Partial k block: only the first last_k_block_size entries of each
+        // tile row are valid.
+        load_tiles(k);
+
+        // Multiply and accumulate
+        for (int ii = 0; ii < block_size && i * block_size + ii < M; ++ii) {
+          for (int jj = 0; jj < block_size && j * block_size + jj < N; ++jj) {
+            int kk = 0;
+            for (; kk < last_k_simd_block; kk += simd_size) {
+              auto av =
+                  simd::load<AccT, simd_size>(a_block + ii * block_size + kk);
+              auto bv =
+                  simd::load<AccT, simd_size>(b_block + jj * block_size + kk);
+              c_block[ii * block_size + jj] += simd::sum(av * bv);
+            }
+            for (; kk < last_k_block_size; ++kk) {
+              c_block[ii * block_size + jj] +=
+                  a_block[ii * block_size + kk] * b_block[jj * block_size + kk];
+            }
+          }
+        }
+      }
+
+      // Store
+      for (int ii = 0; ii < block_size && i * block_size + ii < M; ++ii) {
+        for (int jj = 0; jj < block_size && j * block_size + jj < N; ++jj) {
+          // 64-bit offset: row * N exceeds the int range for large outputs.
+          const long long c_idx =
+              static_cast<long long>(i * block_size + ii) * N +
+              j * block_size + jj;
+          if (beta != 0) {
+            c[c_idx] = static_cast<T>(
+                alpha * c_block[ii * block_size + jj] + beta * c[c_idx]);
+          } else {
+            c[c_idx] = static_cast<T>(alpha * c_block[ii * block_size + jj]);
+          }
+        }
+      }
+    }
+  }
+}
+
+} // namespace mlx::core
--- a/mlx/backend/cpu/indexing.cpp
+++ b/mlx/backend/cpu/indexing.cpp
@ -257,15 +257,11 @@ void gather_axis(
    const array& ind,
    array& out,
    const int axis) {
-  auto strides = ind.strides();
-  strides.erase(strides.begin() + axis);
-  auto shape = ind.shape();
-  shape.erase(shape.begin() + axis);
-  ContiguousIterator ind_it(shape, strides, src.ndim() - 1);
-
-  strides = src.strides();
-  strides.erase(strides.begin() + axis);
-  ContiguousIterator src_it(shape, strides, src.ndim() - 1);
+  auto shape = remove_index(ind.shape(), axis);
+  ContiguousIterator ind_it(
+      shape, remove_index(ind.strides(), axis), src.ndim() - 1);
+  ContiguousIterator src_it(
+      shape, remove_index(src.strides(), axis), src.ndim() - 1);

  auto ind_ptr = ind.data<IdxT>();
  auto src_ptr = src.data<T>();
@ -585,15 +581,11 @@ void Scatter::eval_cpu(const std::vector<array>& inputs, array& out) {

 template <typename T, typename IdxT, typename OpT>
 void scatter_axis(array& out, const array idx, const array& upd, int axis) {
-  auto strides = idx.strides();
-  strides.erase(strides.begin() + axis);
-  auto shape = idx.shape();
-  shape.erase(shape.begin() + axis);
-  ContiguousIterator idx_it(shape, strides, upd.ndim() - 1);
-
-  strides = upd.strides();
-  strides.erase(strides.begin() + axis);
-  ContiguousIterator upd_it(shape, strides, upd.ndim() - 1);
+  auto shape = remove_index(idx.shape(), axis);
+  ContiguousIterator idx_it(
+      shape, remove_index(idx.strides(), axis), upd.ndim() - 1);
+  ContiguousIterator upd_it(
+      shape, remove_index(upd.strides(), axis), upd.ndim() - 1);

  auto idx_ptr = idx.data<IdxT>();
  auto upd_ptr = upd.data<T>();
--- a/mlx/backend/cpu/lapack.h
+++ b/mlx/backend/cpu/lapack.h
@ -2,14 +2,14 @@

 #pragma once

-// Required for Visual Studio.
-// https://github.com/OpenMathLib/OpenBLAS/blob/develop/docs/install.md
-#ifdef _MSC_VER
 #include <complex>
 #define LAPACK_COMPLEX_CUSTOM
 #define lapack_complex_float std::complex<float>
 #define lapack_complex_double std::complex<double>
-#endif
+#define lapack_complex_float_real(z) ((z).real())
+#define lapack_complex_float_imag(z) ((z).imag())
+#define lapack_complex_double_real(z) ((z).real())
+#define lapack_complex_double_imag(z) ((z).imag())

 #ifdef MLX_USE_ACCELERATE
 #include <Accelerate/Accelerate.h>
@ -32,7 +32,7 @@

 #endif

-#define INSTANTIATE_LAPACK_TYPES(FUNC)                       \
+#define INSTANTIATE_LAPACK_REAL(FUNC)                        \
  template <typename T, typename... Args>                    \
  void FUNC(Args... args) {                                  \
    if constexpr (std::is_same_v<T, float>) {                \
@ -42,11 +42,24 @@
    }                                                        \
  }

-INSTANTIATE_LAPACK_TYPES(geqrf)
-INSTANTIATE_LAPACK_TYPES(orgqr)
-INSTANTIATE_LAPACK_TYPES(syevd)
-INSTANTIATE_LAPACK_TYPES(potrf)
-INSTANTIATE_LAPACK_TYPES(gesvdx)
-INSTANTIATE_LAPACK_TYPES(getrf)
-INSTANTIATE_LAPACK_TYPES(getri)
-INSTANTIATE_LAPACK_TYPES(trtri)
+INSTANTIATE_LAPACK_REAL(geqrf)
+INSTANTIATE_LAPACK_REAL(orgqr)
+INSTANTIATE_LAPACK_REAL(syevd)
+INSTANTIATE_LAPACK_REAL(geev)
+INSTANTIATE_LAPACK_REAL(potrf)
+INSTANTIATE_LAPACK_REAL(gesvdx)
+INSTANTIATE_LAPACK_REAL(getrf)
+INSTANTIATE_LAPACK_REAL(getri)
+INSTANTIATE_LAPACK_REAL(trtri)
+
+// Defines a function template FUNC<T>(args...) that forwards its arguments
+// to the LAPACK routine c##FUNC when T is std::complex<float> and to z##FUNC
+// when T is std::complex<double>. For any other T the generated function
+// body is empty.
+#define INSTANTIATE_LAPACK_COMPLEX(FUNC)                            \
+  template <typename T, typename... Args>                           \
+  void FUNC(Args... args) {                                         \
+    if constexpr (std::is_same_v<T, std::complex<float>>) {         \
+      MLX_LAPACK_FUNC(c##FUNC)(std::forward<Args>(args)...);        \
+    } else if constexpr (std::is_same_v<T, std::complex<double>>) { \
+      MLX_LAPACK_FUNC(z##FUNC)(std::forward<Args>(args)...);        \
+    }                                                               \
+  }
+
+// Generates heevd<T>, dispatching to cheevd / zheevd.
+INSTANTIATE_LAPACK_COMPLEX(heevd)
--- a/mlx/backend/cpu/matmul.cpp
+++ b/mlx/backend/cpu/matmul.cpp
@ -132,6 +132,10 @@ void AddMM::eval_cpu(const std::vector<array>& inputs, array& out) {
    throw std::runtime_error(
        "[AddMM::eval_cpu] Currently only supports float32.");
  }
+  if (out.size() == 0) {
+    out.set_data(allocator::malloc(out.nbytes()));
+    return;
+  }

  // Fill output with C
  auto& c = inputs[2];
@ -139,7 +143,9 @@ void AddMM::eval_cpu(const std::vector<array>& inputs, array& out) {
      ? CopyType::Scalar
      : (c.flags().row_contiguous ? CopyType::Vector : CopyType::General);
  copy(c, out, ctype, stream());
-
+  if (inputs[0].shape(-1) == 0) {
+    return;
+  }
  matmul_general(inputs[0], inputs[1], out, stream(), alpha_, beta_);
 }

--- a/mlx/backend/cpu/quantized.cpp
+++ b/mlx/backend/cpu/quantized.cpp
@ -13,9 +13,18 @@ namespace mlx::core {

 namespace {

+// Number of quantized values stored in one pack: 8 for 3 and 5 bits, 4 for
+// 6 bits, and wsize / bits for every other bit width.
+inline constexpr short get_pack_factor(int bits, int wsize = 8) {
+  switch (bits) {
+    case 3:
+    case 5:
+      return 8;
+    case 6:
+      return 4;
+    default:
+      return wsize / bits;
+  }
+}
+
+// Number of bytes occupied by one pack: wsize / 8 when `bits` is a power of
+// two, 5 for 5 bits, and 3 for every other bit width.
+inline constexpr short get_bytes_per_pack(int bits, int wsize = 8) {
+  if ((bits & (bits - 1)) == 0) {
+    return wsize / 8;
+  }
+  return bits == 5 ? 5 : 3;
+}
+
 template <typename T, int bits>
 void extract_bits(const uint8_t* w_in, T* w_out) {
-  assert(bits == 3 || bits == 6);
+  static_assert(bits == 3 || bits == 5 || bits == 6);
  if (bits == 3) {
    w_out[0] = static_cast<T>(w_in[0] & 0x7);
    w_out[1] = static_cast<T>((w_in[0] & 0x38) >> 3);
@ -25,6 +34,16 @@ void extract_bits(const uint8_t* w_in, T* w_out) {
    w_out[5] = static_cast<T>(((w_in[1] & 0x80) >> 7) + ((w_in[2] & 0x3) << 1));
    w_out[6] = static_cast<T>((w_in[2] & 0x1c) >> 2);
    w_out[7] = static_cast<T>((w_in[2] & 0xe0) >> 5);
+  } else if (bits == 5) {
+    w_out[0] = static_cast<T>(w_in[0] & 0x1f);
+    w_out[1] = static_cast<T>(((w_in[0] & 0xe0) >> 5) + ((w_in[1] & 0x3) << 3));
+    w_out[2] = static_cast<T>((w_in[1] & 0x7c) >> 2);
+    w_out[3] = static_cast<T>(((w_in[1] & 0x80) >> 7) + ((w_in[2] & 0xf) << 1));
+    w_out[4] = static_cast<T>(((w_in[2] & 0xf0) >> 4) + ((w_in[3] & 0x1) << 4));
+    w_out[5] = static_cast<T>((w_in[3] & 0x3e) >> 1);
+    w_out[6] = static_cast<T>(((w_in[3] & 0xc0) >> 6) + ((w_in[4] & 0x7) << 2));
+    w_out[7] = static_cast<T>((w_in[4] & 0xf8) >> 3);
+
  } else if (bits == 6) {
    w_out[0] = static_cast<T>(w_in[0] & 0x3f);
    w_out[1] =
@ -46,8 +65,8 @@ void _qmm(
    int N,
    int K) {
  constexpr int bitmask = (1 << bits) - 1;
-  constexpr int pack_factor = bits == 3 ? 8 : bits == 6 ? 4 : 8 / bits;
-  constexpr int bytes_per_pack = (bits == 3 || bits == 6) ? 3 : 1;
+  constexpr int pack_factor = get_pack_factor(bits, 8);
+  constexpr int bytes_per_pack = get_bytes_per_pack(bits);
  constexpr int packs_in_group = group_size / pack_factor;

  for (int m = 0; m < M; m++) {
@ -65,7 +84,7 @@ void _qmm(
        T scale = *scales_local++;
        T bias = *biases_local++;
        for (int ng = 0; ng < packs_in_group; ng++) {
-          if (bits == 3 || bits == 6) {
+          if constexpr (bits == 3 || bits == 5 || bits == 6) {
            T wl[pack_factor];
            extract_bits<T, bits>(w_local, wl);
 #pragma clang loop unroll(full)
@ -104,8 +123,9 @@ void _qmm_t(
    int N,
    int K) {
  constexpr int bitmask = (1 << bits) - 1;
-  constexpr int pack_factor = bits == 3 ? 8 : bits == 6 ? 4 : 8 / bits;
-  constexpr int bytes_per_pack = (bits == 3 || bits == 6) ? 3 : 1;
+
+  constexpr int pack_factor = get_pack_factor(bits, 8);
+  constexpr int bytes_per_pack = get_bytes_per_pack(bits);
  constexpr int packs_in_group = group_size / pack_factor;

  for (int m = 0; m < M; m++) {
@ -121,7 +141,7 @@ void _qmm_t(
        T bias = *biases_local++;

        for (int kw = 0; kw < packs_in_group; kw++) {
-          if (bits == 3 || bits == 6) {
+          if constexpr (bits == 3 || bits == 5 || bits == 6) {
            T wl[pack_factor];
            extract_bits<T, bits>(w_local, wl);
 #pragma clang loop unroll(full)
@ -304,6 +324,10 @@ void _qmm_dispatch_typed(
      _qmm_dispatch_group<T, 4>(
          result, x, w, scales, biases, M, N, K, group_size, transposed_w);
      break;
+    case 5:
+      _qmm_dispatch_group<T, 5>(
+          result, x, w, scales, biases, M, N, K, group_size, transposed_w);
+      break;
    case 6:
      _qmm_dispatch_group<T, 6>(
          result, x, w, scales, biases, M, N, K, group_size, transposed_w);
@ -613,9 +637,8 @@ void quantize(
  float eps = 1e-7;

  bool power_of_2_bits = is_power_of_2(bits);
-  int el_per_int = bits == 3 ? 8 : bits == 6 ? 4 : 32 / bits;
-  // For 3/6 bits we read 3 uint8s at a time instead of 1 uint32
-  int bytes_per_pack = power_of_2_bits ? 1 : 3;
+  int el_per_int = get_pack_factor(bits, 32);
+  int bytes_per_pack = get_bytes_per_pack(bits);
  int int_per_group = group_size * bytes_per_pack / el_per_int;
  size_t n_groups = w_size / group_size;

@ -640,15 +663,21 @@ void quantize(
    }
    size_t out_idx = i * int_per_group;
    for (int j = 0; j < int_per_group / bytes_per_pack; ++j) {
-      uint32_t out_el = 0;
+      uint64_t out_el = 0;
      for (int k = 0; k < el_per_int; ++k) {
        float w_el = w[w_idx + j * el_per_int + k];
        w_el = std::rint((w_el - bias) / scale);
        w_el = std::min(std::max(w_el, 0.0f), n_bins);
-        out_el |= static_cast<uint32_t>(w_el) << (k * bits);
+        out_el |= static_cast<uint64_t>(w_el) << (k * bits);
      }
      if (power_of_2_bits) {
        out[out_idx + j] = out_el;
+      } else if (bits == 5) {
+        out[out_idx + bytes_per_pack * j] = out_el & 0xff;
+        out[out_idx + bytes_per_pack * j + 1] = (out_el & 0xff00) >> 8;
+        out[out_idx + bytes_per_pack * j + 2] = (out_el & 0xff0000) >> 16;
+        out[out_idx + bytes_per_pack * j + 3] = (out_el & 0xff000000) >> 24;
+        out[out_idx + bytes_per_pack * j + 4] = (out_el & 0xff00000000) >> 32;
      } else {
        out[out_idx + bytes_per_pack * j] = out_el & 0xff;
        out[out_idx + bytes_per_pack * j + 1] = (out_el & 0xff00) >> 8;
--- a/mlx/backend/cpu/scan.cpp
+++ b/mlx/backend/cpu/scan.cpp
@ -3,6 +3,7 @@
 #include <cassert>

 #include "mlx/backend/common/utils.h"
+#include "mlx/backend/cpu/binary_ops.h"
 #include "mlx/backend/cpu/copy.h"
 #include "mlx/backend/cpu/encoder.h"
 #include "mlx/backend/cpu/simd/simd.h"
@ -226,6 +227,16 @@ void scan_dispatch(
      scan_op<T, U>(in, out, axis, reverse, inclusive, op, init);
      break;
    }
+    case Scan::LogAddExp: {
+      auto op = [](U a, T b) {
+        return detail::LogAddExp{}(a, static_cast<U>(b));
+      };
+      auto init = (issubdtype(in.dtype(), floating))
+          ? static_cast<U>(-std::numeric_limits<float>::infinity())
+          : std::numeric_limits<U>::min();
+      scan_op<T, U>(in, out, axis, reverse, inclusive, op, init);
+      break;
+    }
  }
 }

@ -319,7 +330,8 @@ void Scan::eval_cpu(const std::vector<array>& inputs, array& out) {
            reduce_type_, in, out, axis_, reverse_, inclusive_);
        break;
      case complex64:
-        throw std::runtime_error("Scan ops do not support complex types yet");
+        scan_dispatch<complex64_t, complex64_t>(
+            reduce_type_, in, out, axis_, reverse_, inclusive_);
        break;
    }
  });
--- a/mlx/backend/cpu/simd/base_simd.h
+++ b/mlx/backend/cpu/simd/base_simd.h
@ -88,12 +88,33 @@ DEFAULT_UNARY(expm1, std::expm1)
 DEFAULT_UNARY(floor, std::floor)
 DEFAULT_UNARY(log, std::log)
 DEFAULT_UNARY(log10, std::log10)
-DEFAULT_UNARY(log1p, std::log1p)
 DEFAULT_UNARY(sinh, std::sinh)
 DEFAULT_UNARY(sqrt, std::sqrt)
 DEFAULT_UNARY(tan, std::tan)
 DEFAULT_UNARY(tanh, std::tanh)

+// Computes log(1 + in) for a single-lane Simd value.
+//
+// For complex T the result is built from its polar form:
+//   real = log|1 + z|, imag = atan2(y, x + 1)   with z = x + iy.
+// When |z| < 0.5 the real part is evaluated as 0.5 * log1p(x * (2 + x) + y^2),
+// which equals 0.5 * log(|1 + z|^2) without first forming 1 + z. For
+// non-complex T this forwards to std::log1p.
+template <typename T>
+Simd<T, 1> log1p(Simd<T, 1> in) {
+  if constexpr (is_complex<T>) {
+    auto x = in.value.real();
+    auto y = in.value.imag();
+    auto zabs = std::abs(in.value);
+    auto theta = std::atan2(y, x + 1);
+    if (zabs < 0.5) {
+      // r == |1 + z|^2 - 1, expanded so that no "1 +" is added explicitly.
+      auto r = x * (2 + x) + y * y;
+      if (r == 0) { // handle underflow
+        return Simd<T, 1>{T{x, theta}};
+      }
+      // decltype is standard C++; the GNU-only typeof extension is avoided so
+      // the header also builds with compilers that do not provide it.
+      return Simd<T, 1>{
+          T{static_cast<decltype(x)>(0.5) * std::log1p(r), theta}};
+    } else {
+      auto z0 = std::hypot(x + 1, y);
+      return Simd<T, 1>{T{std::log(z0), theta}};
+    }
+  } else {
+    return Simd<T, 1>{std::log1p(in.value)};
+  }
+}
+
 template <typename T>
 Simd<T, 1> log2(Simd<T, 1> in) {
  if constexpr (is_complex<T>) {
--- a/mlx/backend/cpu/unary.cpp
+++ b/mlx/backend/cpu/unary.cpp
@ -1,5 +1,8 @@
 // Copyright © 2024 Apple Inc.

+// Required for using M_LN2 in MSVC.
+#define _USE_MATH_DEFINES
+
 #include <cassert>

 #include "mlx/backend/cpu/unary.h"
--- a/mlx/backend/cpu/unary.h
+++ b/mlx/backend/cpu/unary.h
@ -2,32 +2,13 @@

 #pragma once

-#include "mlx/allocator.h"
-#include "mlx/array.h"
-#include "mlx/backend/common/utils.h"
+#include "mlx/backend/common/unary.h"
 #include "mlx/backend/cpu/encoder.h"
 #include "mlx/backend/cpu/simd/simd.h"
 #include "mlx/utils.h"

 namespace mlx::core {

-void set_unary_output_data(const array& in, array& out) {
-  if (in.flags().contiguous) {
-    if (is_donatable(in, out)) {
-      out.copy_shared_buffer(in);
-    } else {
-      auto size = in.data_size();
-      out.set_data(
-          allocator::malloc(size * out.itemsize()),
-          size,
-          in.strides(),
-          in.flags());
-    }
-  } else {
-    out.set_data(allocator::malloc(out.nbytes()));
-  }
-}
-
 template <typename T, typename U = T, typename Op>
 void unary_op(const T* a, U* out, size_t shape, size_t stride) {
  for (size_t i = 0; i < shape; i += 1) {
--- a/mlx/backend/cuda/CMakeLists.txt
+++ b/mlx/backend/cuda/CMakeLists.txt
@ -0,0 +1,120 @@
+# Filename rules in cuda backend:
+#
+# * Use .cu/.cuh if code contains device code, and .cpp/.h if not.
+# * Device-only code should be put in device/ subdir.
+# * Files in device/ subdir should not include files outside.
+target_sources(
+  mlx
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/allocator.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/arg_reduce.cu
+          ${CMAKE_CURRENT_SOURCE_DIR}/binary.cu
+          ${CMAKE_CURRENT_SOURCE_DIR}/binary_two.cu
+          ${CMAKE_CURRENT_SOURCE_DIR}/compiled.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/copy.cu
+          ${CMAKE_CURRENT_SOURCE_DIR}/copy/copy_contiguous.cu
+          ${CMAKE_CURRENT_SOURCE_DIR}/copy/copy_general.cu
+          ${CMAKE_CURRENT_SOURCE_DIR}/copy/copy_general_dynamic.cu
+          ${CMAKE_CURRENT_SOURCE_DIR}/copy/copy_general_input.cu
+          ${CMAKE_CURRENT_SOURCE_DIR}/cuda.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/device.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/eval.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/event.cu
+          ${CMAKE_CURRENT_SOURCE_DIR}/fence.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/jit_module.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/indexing.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/kernel_utils.cu
+          ${CMAKE_CURRENT_SOURCE_DIR}/matmul.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/layer_norm.cu
+          ${CMAKE_CURRENT_SOURCE_DIR}/logsumexp.cu
+          ${CMAKE_CURRENT_SOURCE_DIR}/primitives.cu
+          ${CMAKE_CURRENT_SOURCE_DIR}/random.cu
+          ${CMAKE_CURRENT_SOURCE_DIR}/reduce.cu
+          ${CMAKE_CURRENT_SOURCE_DIR}/reduce/col_reduce.cu
+          ${CMAKE_CURRENT_SOURCE_DIR}/reduce/row_reduce.cu
+          ${CMAKE_CURRENT_SOURCE_DIR}/reduce/segmented_reduce.cu
+          ${CMAKE_CURRENT_SOURCE_DIR}/rms_norm.cu
+          ${CMAKE_CURRENT_SOURCE_DIR}/rope.cu
+          ${CMAKE_CURRENT_SOURCE_DIR}/slicing.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/softmax.cu
+          ${CMAKE_CURRENT_SOURCE_DIR}/sort.cu
+          ${CMAKE_CURRENT_SOURCE_DIR}/ternary.cu
+          ${CMAKE_CURRENT_SOURCE_DIR}/unary.cu
+          ${CMAKE_CURRENT_SOURCE_DIR}/utils.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/worker.cpp)
+
+target_compile_definitions(mlx PRIVATE MLX_USE_CUDA)
+
+# Embed kernel sources in binary for JIT compilation.
+file(
+  GLOB MLX_JIT_SOURCES
+  RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
+  "${CMAKE_CURRENT_SOURCE_DIR}/device/*.h"
+  "${CMAKE_CURRENT_SOURCE_DIR}/device/*.cuh")
+string(JOIN ":" MLX_JIT_SOURCES_ARG ${MLX_JIT_SOURCES})
+add_custom_command(
+  OUTPUT gen/cuda_jit_sources.h
+  COMMAND
+    ${CMAKE_COMMAND} -DMLX_SOURCE_ROOT=${CMAKE_CURRENT_SOURCE_DIR}
+    -DMLX_JIT_SOURCES=${MLX_JIT_SOURCES_ARG} -P
+    "${CMAKE_CURRENT_SOURCE_DIR}/bin2h.cmake"
+  DEPENDS bin2h.cmake ${MLX_JIT_SOURCES})
+add_custom_target(cuda_jit_sources DEPENDS gen/cuda_jit_sources.h)
+add_dependencies(mlx cuda_jit_sources)
+target_include_directories(mlx PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/gen")
+
+# Enable defining device lambda functions.
+target_compile_options(mlx
+                       PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:--extended-lambda>")
+
+# CUDA 12.8 emits warning #20280-D for copy kernels, which is a false positive.
+# Explicitly pass --static-global-template-stub=false to suppress it. Setting
+# the flag to true is also safe, but then the warning is not suppressed.
+if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8.0)
+  target_compile_options(
+    mlx
+    PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:--static-global-template-stub=false>")
+endif()
+
+# Suppress warning when building for compute capability 7 used by V100.
+target_compile_options(
+  mlx PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:--Wno-deprecated-gpu-targets>")
+
+# Compute capability 7 is required for synchronization between CPU/GPU with
+# managed memory. TODO: Add more architectures for potential performance gain.
+set(MLX_CUDA_ARCHITECTURES
+    "70;80"
+    CACHE STRING "CUDA architectures")
+message(STATUS "CUDA architectures: ${MLX_CUDA_ARCHITECTURES}")
+set_target_properties(mlx PROPERTIES CUDA_ARCHITECTURES
+                                     "${MLX_CUDA_ARCHITECTURES}")
+
+# Use fixed version of CCCL.
+FetchContent_Declare(
+  cccl
+  URL "https://github.com/NVIDIA/cccl/releases/download/v2.8.1/cccl-v2.8.1.zip")
+FetchContent_MakeAvailable(cccl)
+target_include_directories(mlx BEFORE PRIVATE "${cccl_SOURCE_DIR}/include")
+
+# Use fixed version of NVTX.
+FetchContent_Declare(
+  nvtx3
+  GIT_REPOSITORY https://github.com/NVIDIA/NVTX.git
+  GIT_TAG v3.1.1
+  GIT_SHALLOW TRUE
+  SOURCE_SUBDIR c EXCLUDE_FROM_ALL)
+FetchContent_MakeAvailable(nvtx3)
+target_link_libraries(mlx PUBLIC $<BUILD_INTERFACE:nvtx3-cpp>)
+
+# Make cuda runtime APIs available in non-cuda files.
+find_package(CUDAToolkit REQUIRED)
+target_include_directories(mlx PRIVATE ${CUDAToolkit_INCLUDE_DIRS})
+
+# Use cublasLt.
+target_link_libraries(mlx PRIVATE CUDA::cublasLt)
+
+# Use NVRTC and driver APIs.
+target_link_libraries(mlx PRIVATE CUDA::nvrtc CUDA::cuda_driver)
+
+# Suppress nvcc warnings on MLX headers.
+target_compile_options(mlx PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-Xcudafe
+                                   --diag_suppress=997>)
--- a/mlx/backend/cuda/allocator.cpp
+++ b/mlx/backend/cuda/allocator.cpp
@ -0,0 +1,215 @@
+// Copyright © 2025 Apple Inc.
+
+#include "mlx/backend/cuda/allocator.h"
+#include "mlx/backend/cuda/utils.h"
+#include "mlx/backend/cuda/worker.h"
+#include "mlx/utils.h"
+
+#include <cuda_runtime.h>
+#include <fmt/format.h>
+#include <unistd.h>
+
+#include <cassert>
+
+namespace mlx::core {
+
+namespace cu {
+
+constexpr int page_size = 16384;
+
+// Sets up the buffer cache and the initial memory limits.
+//
+// The cache is constructed with page_size, a callback that reports a buffer's
+// size, and a callback that releases a buffer (its CUDA memory through
+// cuda_free(), then the CudaBuffer record itself).
+CudaAllocator::CudaAllocator()
+    : buffer_cache_(
+          page_size,
+          [](CudaBuffer* buf) { return buf->size; },
+          [this](CudaBuffer* buf) {
+            cuda_free(buf->data);
+            delete buf;
+          }) {
+  // TODO: Set memory limit for multi-device.
+  size_t free, total;
+  CHECK_CUDA_ERROR(cudaMemGetInfo(&free, &total));
+  // Default limit is 80% of the total memory reported by cudaMemGetInfo; the
+  // cache limit starts out equal to the memory limit.
+  memory_limit_ = total * 0.8;
+  max_pool_size_ = memory_limit_;
+}
+
+// Allocates a managed-memory buffer of at least `size` bytes.
+//
+// The request is rounded up (next power of two below page_size, otherwise the
+// next multiple of page_size) and served from the buffer cache when possible.
+// On a cache miss a new buffer is created with cudaMallocManaged.
+//
+// Returns a null Buffer when CUDA reports out-of-memory; this is the same
+// "no allocation" handle that free(), size() and Buffer::raw_ptr() in this
+// file already accept. Throws std::runtime_error for any other CUDA failure.
+Buffer CudaAllocator::malloc(size_t size) {
+  // Find available buffer from cache.
+  std::unique_lock lock(mutex_);
+  if (size < page_size) {
+    size = next_power_of_2(size);
+  } else {
+    size = page_size * ((size + page_size - 1) / page_size);
+  }
+
+  CudaBuffer* buf = buffer_cache_.reuse_from_cache(size);
+  if (!buf) {
+    // If we have a lot of memory pressure or are over the maximum cache size,
+    // try to reclaim memory from the cache.
+    size_t mem_required = get_active_memory() + get_cache_memory() + size;
+    if (mem_required >= memory_limit_) {
+      buffer_cache_.release_cached_buffers(mem_required - memory_limit_);
+    }
+
+    // The CUDA allocation itself runs without holding mutex_.
+    lock.unlock();
+    buf = new CudaBuffer{nullptr, size};
+    cudaError_t err = cudaMallocManaged(&buf->data, size);
+    if (err != cudaSuccess) {
+      // Nothing was allocated: release the bookkeeping record so it does not
+      // leak, and do not account the request as active memory.
+      delete buf;
+      if (err != cudaErrorMemoryAllocation) {
+        throw std::runtime_error(fmt::format(
+            "cudaMallocManaged failed: {}.", cudaGetErrorString(err)));
+      }
+      // Out of memory is reported through a null handle rather than a buffer
+      // whose data pointer is null.
+      return Buffer{nullptr};
+    }
+    lock.lock();
+  }
+  active_memory_ += size;
+  peak_memory_ = std::max(active_memory_, peak_memory_);
+
+  // Maintain the cache below the requested limit.
+  if (get_cache_memory() > max_pool_size_) {
+    buffer_cache_.release_cached_buffers(get_cache_memory() - max_pool_size_);
+  }
+
+  return Buffer{buf};
+}
+
+// Returns a buffer to the allocator. A null handle is ignored. The buffer is
+// kept in the cache while the cache is below its limit; otherwise its memory
+// is released through cuda_free() and the record is deleted.
+void CudaAllocator::free(Buffer buffer) {
+  auto* cuda_buf = static_cast<CudaBuffer*>(buffer.ptr());
+  if (cuda_buf == nullptr) {
+    return;
+  }
+
+  std::unique_lock guard(mutex_);
+  active_memory_ -= cuda_buf->size;
+
+  const bool cache_has_room = get_cache_memory() < max_pool_size_;
+  if (cache_has_room) {
+    buffer_cache_.recycle_to_cache(cuda_buf);
+    return;
+  }
+
+  // Release the memory outside of the allocator lock.
+  guard.unlock();
+  cuda_free(cuda_buf->data);
+  delete cuda_buf;
+}
+
+// Reports the size recorded for the buffer, or 0 for a null handle.
+size_t CudaAllocator::size(Buffer buffer) const {
+  const auto* cuda_buf = static_cast<CudaBuffer*>(buffer.ptr());
+  return cuda_buf ? cuda_buf->size : 0;
+}
+
+// Adds the calling thread to the set of threads on which cuda_free() runs
+// cudaFree directly.
+void CudaAllocator::register_this_thread() {
+  const auto self_id = std::this_thread::get_id();
+  std::lock_guard guard(worker_mutex_);
+  allowed_threads_.insert(self_id);
+}
+
+// Frees `buf` with cudaFree, but only on a thread that was registered through
+// register_this_thread(). From any other thread the free is queued on the
+// lazily created worker instead of being executed inline.
+void CudaAllocator::cuda_free(void* buf) {
+  // If cuda_free() is called from an unregistered thread, reschedule the call
+  // to worker.
+  {
+    std::lock_guard lock(worker_mutex_);
+    if (allowed_threads_.count(std::this_thread::get_id()) == 0) {
+      if (!worker_) {
+        worker_.reset(new Worker);
+      }
+      // NOTE(review): the queued task re-enters cuda_free(), so the thread
+      // that runs worker tasks is presumably registered as allowed; otherwise
+      // the call would be rescheduled again. Confirm in Worker.
+      worker_->add_task([this, buf]() { this->cuda_free(buf); });
+      worker_->end_batch();
+      worker_->commit();
+      return;
+    }
+  }
+  // worker_mutex_ has been released by the scope above before calling cudaFree.
+  cudaFree(buf);
+}
+
+// Bytes currently handed out by malloc() and not yet returned through free().
+// Read without taking mutex_.
+size_t CudaAllocator::get_active_memory() const {
+  return active_memory_;
+}
+
+// Highest value active_memory_ has reached since the last reset. Read without
+// taking mutex_.
+size_t CudaAllocator::get_peak_memory() const {
+  return peak_memory_;
+}
+
+// Resets the recorded peak to zero.
+void CudaAllocator::reset_peak_memory() {
+  std::lock_guard lock(mutex_);
+  peak_memory_ = 0;
+}
+
+// Current memory limit in bytes. Read without taking mutex_.
+size_t CudaAllocator::get_memory_limit() {
+  return memory_limit_;
+}
+
+// Installs a new memory limit and returns the previous one.
+size_t CudaAllocator::set_memory_limit(size_t limit) {
+  std::lock_guard lock(mutex_);
+  // After the swap `limit` holds the old value.
+  std::swap(limit, memory_limit_);
+  return limit;
+}
+
+// Bytes currently held in the buffer cache, as reported by the cache itself.
+size_t CudaAllocator::get_cache_memory() const {
+  return buffer_cache_.cache_size();
+}
+
+// Installs a new cache size limit and returns the previous one.
+size_t CudaAllocator::set_cache_limit(size_t limit) {
+  std::lock_guard lk(mutex_);
+  // After the swap `limit` holds the old value.
+  std::swap(limit, max_pool_size_);
+  return limit;
+}
+
+// Empties the buffer cache.
+void CudaAllocator::clear_cache() {
+  std::lock_guard lk(mutex_);
+  buffer_cache_.clear();
+}
+
+// Returns the process-wide CudaAllocator, created on first use.
+CudaAllocator& allocator() {
+  // By creating the |allocator_| on the heap, the destructor of CudaAllocator
+  // will not be called on exit and buffers in the cache will be leaked. This
+  // can save some time at program exit.
+  static CudaAllocator* allocator_ = new CudaAllocator;
+  return *allocator_;
+}
+
+} // namespace cu
+
+namespace allocator {
+
+// Backend hook: the generic allocator entry point resolves to the CUDA
+// allocator singleton.
+Allocator& allocator() {
+  return cu::allocator();
+}
+
+// Returns the underlying data pointer stored in the CudaBuffer record, or
+// nullptr when the handle itself is null.
+void* Buffer::raw_ptr() {
+  if (!ptr_) {
+    return nullptr;
+  }
+  // ptr_ holds a CudaBuffer*; the usable memory is its `data` member.
+  return static_cast<cu::CudaBuffer*>(ptr_)->data;
+}
+
+} // namespace allocator
+
+// Free-function memory API. Each function forwards to the member of the same
+// name on the CUDA allocator singleton.
+size_t get_active_memory() {
+  return cu::allocator().get_active_memory();
+}
+size_t get_peak_memory() {
+  return cu::allocator().get_peak_memory();
+}
+void reset_peak_memory() {
+  return cu::allocator().reset_peak_memory();
+}
+// Returns the previous limit.
+size_t set_memory_limit(size_t limit) {
+  return cu::allocator().set_memory_limit(limit);
+}
+size_t get_memory_limit() {
+  return cu::allocator().get_memory_limit();
+}
+size_t get_cache_memory() {
+  return cu::allocator().get_cache_memory();
+}
+// Returns the previous limit.
+size_t set_cache_limit(size_t limit) {
+  return cu::allocator().set_cache_limit(limit);
+}
+void clear_cache() {
+  cu::allocator().clear_cache();
+}
+
+// Not supported in CUDA. The argument is ignored and 0 is always returned.
+size_t set_wired_limit(size_t) {
+  return 0;
+}
+
+} // namespace mlx::core
--- a/mlx/backend/cuda/allocator.h
+++ b/mlx/backend/cuda/allocator.h
@ -0,0 +1,67 @@
+// Copyright © 2025 Apple Inc.
+
+#pragma once
+
+#include "mlx/allocator.h"
+#include "mlx/backend/common/buffer_cache.h"
+
+#include <mutex>
+#include <set>
+#include <thread>
+#include <utility>
+
+namespace mlx::core::cu {
+
+class Worker;
+
+using allocator::Buffer;
+
+// Stores cuda-managed unified memory.
+// This is the record that an allocator::Buffer handle points to in the CUDA
+// backend.
+struct CudaBuffer {
+  void* data; // pointer filled in by cudaMallocManaged
+  size_t size; // allocation size in bytes, after rounding by the allocator
+};
+
+// Allocator for the CUDA backend. Hands out CudaBuffer records, keeps freed
+// buffers in a BufferCache for reuse, and tracks active/peak memory together
+// with a memory limit and a cache size limit.
+class CudaAllocator : public allocator::Allocator {
+ public:
+  Buffer malloc(size_t size) override;
+  void free(Buffer buffer) override;
+  size_t size(Buffer buffer) const override;
+
+  // Register the current thread as safe to free buffers.
+  // In CUDA, freeing a buffer implicitly synchronizes the stream, and for
+  // threads that a gpu stream may be waiting on (for example cpu stream
+  // threads), freeing buffers there would result in a deadlock.
+  void register_this_thread();
+
+  // Call cudaFree in the safe thread.
+  void cuda_free(void* buf);
+
+  // Memory statistics and limits. The setters return the previous value.
+  size_t get_active_memory() const;
+  size_t get_peak_memory() const;
+  void reset_peak_memory();
+  size_t get_memory_limit();
+  size_t set_memory_limit(size_t limit);
+  size_t get_cache_memory() const;
+  size_t set_cache_limit(size_t limit);
+  void clear_cache();
+
+ private:
+  // Only the allocator() accessor may construct the instance.
+  CudaAllocator();
+  friend CudaAllocator& allocator();
+
+  // State used by cuda_free()/register_this_thread(), guarded by
+  // worker_mutex_.
+  std::mutex worker_mutex_;
+  std::unique_ptr<Worker> worker_;
+  std::set<std::thread::id> allowed_threads_;
+
+  // Allocation bookkeeping; writers take mutex_.
+  std::mutex mutex_;
+  size_t memory_limit_;
+  size_t max_pool_size_;
+  BufferCache<CudaBuffer> buffer_cache_;
+  size_t active_memory_{0};
+  size_t peak_memory_{0};
+};
+
+CudaAllocator& allocator();
+
+} // namespace mlx::core::cu
--- a/mlx/backend/cuda/arg_reduce.cu
+++ b/mlx/backend/cuda/arg_reduce.cu
@ -0,0 +1,188 @@
+// Copyright © 2025 Apple Inc.
+#include "mlx/backend/common/utils.h"
+#include "mlx/backend/cuda/device.h"
+#include "mlx/backend/cuda/iterators/strided_iterator.cuh"
+#include "mlx/backend/cuda/kernel_utils.cuh"
+#include "mlx/dtype_utils.h"
+#include "mlx/primitives.h"
+
+#include <cooperative_groups.h>
+#include <nvtx3/nvtx3.hpp>
+#include <cub/block/block_load.cuh>
+#include <cub/block/block_reduce.cuh>
+
+#include <cassert>
+
+namespace mlx::core {
+
+namespace cu {
+
+namespace cg = cooperative_groups;
+
+// A candidate value together with the index it was read from, so that a
+// reduction can carry both through each comparison.
+template <typename T>
+struct IndexValPair {
+  uint32_t index; // position of `val` along the reduced axis
+  T val; // the candidate value
+};
+
+// Reduction functor that tracks the smallest value and its index.
+template <typename T>
+struct ArgMin {
+  // Starting value for the search: Limits<T>::max().
+  constexpr __device__ T init() {
+    return Limits<T>::max();
+  }
+
+  // Combines two candidates. `current` wins when it is strictly smaller, or
+  // when the values are equal and it has the lower index.
+  __device__ IndexValPair<T> operator()(
+      const IndexValPair<T>& best,
+      const IndexValPair<T>& current) {
+    if (best.val > current.val ||
+        (best.val == current.val && best.index > current.index)) {
+      return current;
+    } else {
+      return best;
+    }
+  }
+
+  // Folds N consecutive values into `best`; vals[i] is assigned index
+  // offset + i. The strict comparison keeps the earlier element on ties.
+  template <int N>
+  __device__ IndexValPair<T>
+  reduce_many(IndexValPair<T> best, T (&vals)[N], uint32_t offset) {
+    for (int i = 0; i < N; i++) {
+      if (vals[i] < best.val) {
+        best.val = vals[i];
+        best.index = offset + i;
+      }
+    }
+    return best;
+  }
+};
+
+// Reduction functor that tracks the largest value and its index.
+template <typename T>
+struct ArgMax {
+  // Starting value for the search: Limits<T>::min().
+  // NOTE(review): presumably the lowest representable value for T (not the
+  // smallest positive one) -- confirm against the Limits definition.
+  constexpr __device__ T init() {
+    return Limits<T>::min();
+  }
+
+  // Combines two candidates. `current` wins when it is strictly larger, or
+  // when the values are equal and it has the lower index.
+  __device__ IndexValPair<T> operator()(
+      const IndexValPair<T>& best,
+      const IndexValPair<T>& current) {
+    if (best.val < current.val ||
+        (best.val == current.val && best.index > current.index)) {
+      return current;
+    } else {
+      return best;
+    }
+  }
+
+  // Folds N consecutive values into `best`; vals[i] is assigned index
+  // offset + i. The strict comparison keeps the earlier element on ties.
+  template <int N>
+  __device__ IndexValPair<T>
+  reduce_many(IndexValPair<T> best, T (&vals)[N], uint32_t offset) {
+    for (int i = 0; i < N; i++) {
+      if (vals[i] > best.val) {
+        best.val = vals[i];
+        best.index = offset + i;
+      }
+    }
+    return best;
+  }
+};
+
+// Arg-reduction kernel: each thread block produces one output element.
+//
+// The block's rank in the grid selects the output element. `shape`,
+// `in_strides` and `out_strides` describe the non-reduced dimensions and are
+// used to turn that rank into input/output offsets. The reduced axis is walked
+// with stride `axis_stride` for `axis_size` elements.
+//
+// Op supplies init(), reduce_many() and the pairwise combine (see ArgMin /
+// ArgMax). BLOCK_DIM must match the number of threads the kernel is launched
+// with, since it sizes the cub::BlockReduce.
+template <typename T, typename Op, int BLOCK_DIM, int N_READS = 4>
+__global__ void arg_reduce_general(
+    const T* in,
+    uint32_t* out,
+    size_t size,
+    const __grid_constant__ Shape shape,
+    const __grid_constant__ Strides in_strides,
+    const __grid_constant__ Strides out_strides,
+    int32_t ndim,
+    int64_t axis_stride,
+    int32_t axis_size) {
+  auto block = cg::this_thread_block();
+
+  // One block per output element; surplus blocks exit immediately.
+  int64_t index = cg::this_grid().block_rank();
+  if (index >= size) {
+    return;
+  }
+
+  int64_t in_idx = elem_to_loc(index, shape.data(), in_strides.data(), ndim);
+  int64_t out_idx = elem_to_loc(index, shape.data(), out_strides.data(), ndim);
+
+  Op op;
+  T init = op.init();
+  IndexValPair<T> best{0, init};
+
+  // Each iteration covers BLOCK_DIM * N_READS elements of the axis; every
+  // thread loads N_READS consecutive elements, with positions at or beyond
+  // axis_size filled with `init`.
+  for (int r = 0; r < cuda::ceil_div(axis_size, BLOCK_DIM * N_READS); ++r) {
+    T vals[N_READS];
+    auto tid = r * BLOCK_DIM + block.thread_index().x;
+    cub::LoadDirectBlocked(
+        tid, strided_iterator(in + in_idx, axis_stride), vals, axis_size, init);
+    // tid * N_READS is the axis index of vals[0].
+    best = op.reduce_many(best, vals, tid * N_READS);
+  }
+
+  // Combine the per-thread candidates across the block.
+  typedef cub::BlockReduce<IndexValPair<T>, BLOCK_DIM> BlockReduceT;
+  __shared__ typename BlockReduceT::TempStorage temp;
+
+  best = BlockReduceT(temp).Reduce(best, op);
+
+  // Only thread 0 writes the winning index for this output element.
+  if (block.thread_rank() == 0) {
+    out[out_idx] = best.index;
+  }
+}
+
+} // namespace cu
+
+// Evaluates ArgReduce (argmin/argmax along axis_) on the GPU by launching
+// cu::arg_reduce_general on the primitive's stream.
+void ArgReduce::eval_gpu(const std::vector<array>& inputs, array& out) {
+  nvtx3::scoped_range r("ArgReduce::eval_gpu");
+  assert(inputs.size() == 1);
+  auto& in = inputs[0];
+  out.set_data(allocator::malloc(out.nbytes()));
+  auto& s = stream();
+
+  // Prepare the shapes, strides and axis arguments.
+  // The reduced axis is removed from the input shape/strides. The output
+  // strides lose the same entry only when the output has as many dimensions
+  // as the input; otherwise they are used as they are.
+  Shape shape = remove_index(in.shape(), axis_);
+  Strides in_strides = remove_index(in.strides(), axis_);
+  Strides out_strides = out.ndim() == in.ndim()
+      ? remove_index(out.strides(), axis_)
+      : out.strides();
+  int64_t axis_stride = in.strides()[axis_];
+  int32_t axis_size = in.shape()[axis_];
+  int32_t ndim = shape.size();
+
+  // ArgReduce.
+  auto& encoder = cu::get_command_encoder(s);
+  encoder.set_input_array(in);
+  encoder.set_output_array(out);
+  encoder.launch_kernel([&](cudaStream_t stream) {
+    MLX_SWITCH_REAL_TYPES_CHECKED(in.dtype(), "ArgReduce", CTYPE, {
+      using InType = cuda_type_t<CTYPE>;
+      constexpr uint32_t N_READS = 4;
+      // The block size is chosen from the number of N_READS-wide chunks along
+      // the reduced axis.
+      MLX_SWITCH_BLOCK_DIM(cuda::ceil_div(axis_size, N_READS), BLOCK_DIM, {
+        dim3 num_blocks = get_2d_grid_dims(out.shape(), out.strides());
+        dim3 block_dims{BLOCK_DIM, 1, 1};
+        // Default to the ArgMax instantiation; switch to ArgMin when asked.
+        auto kernel = &cu::arg_reduce_general<
+            InType,
+            cu::ArgMax<InType>,
+            BLOCK_DIM,
+            N_READS>;
+        if (reduce_type_ == ArgReduce::ArgMin) {
+          kernel = &cu::arg_reduce_general<
+              InType,
+              cu::ArgMin<InType>,
+              BLOCK_DIM,
+              N_READS>;
+        }
+        kernel<<<num_blocks, block_dims, 0, stream>>>(
+            in.data<InType>(),
+            out.data<uint32_t>(),
+            out.size(),
+            const_param(shape),
+            const_param(in_strides),
+            const_param(out_strides),
+            ndim,
+            axis_stride,
+            axis_size);
+      });
+    });
+  });
+}
+
+} // namespace mlx::core
--- a/mlx/backend/cuda/bin2h.cmake
+++ b/mlx/backend/cuda/bin2h.cmake
@ -0,0 +1,150 @@
+# Based on: https://github.com/sivachandran/cmake-bin2h
+#
+# Copyright 2020 Sivachandran Paramasivam
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+include(CMakeParseArguments)
+
+# Function to wrap a given string into multiple lines at the given column
+# position.
+#
+# Parameters:
+#
+# * VARIABLE - The name of the CMake variable holding the string.
+# * AT_COLUMN - The column position at which string will be wrapped.
+function(WRAP_STRING)
+  set(oneValueArgs VARIABLE AT_COLUMN)
+  # `options` is not set in this function, so no option keywords are parsed.
+  cmake_parse_arguments(WRAP_STRING "${options}" "${oneValueArgs}" "" ${ARGN})
+
+  string(LENGTH ${${WRAP_STRING_VARIABLE}} stringLength)
+  math(EXPR offset "0")
+
+  # Consume the string in chunks of at most AT_COLUMN characters.
+  while(stringLength GREATER 0)
+    if(stringLength GREATER ${WRAP_STRING_AT_COLUMN})
+      math(EXPR length "${WRAP_STRING_AT_COLUMN}")
+    else()
+      math(EXPR length "${stringLength}")
+    endif()
+
+    string(SUBSTRING ${${WRAP_STRING_VARIABLE}} ${offset} ${length} line)
+    # Every chunk, including the first, is prefixed with a newline and a space.
+    set(lines "${lines}\n ${line}")
+
+    math(EXPR stringLength "${stringLength} - ${length}")
+    math(EXPR offset "${offset} + ${length}")
+  endwhile()
+
+  # Write the wrapped text back into the caller's variable.
+  set(${WRAP_STRING_VARIABLE}
+      "${lines}"
+      PARENT_SCOPE)
+endfunction()
+
+# Function to embed the contents of files as byte arrays in a C/C++ header
+# file (.h). The header file will contain one `constexpr char` array per
+# source file.
+#
+# Parameters:
+#
+# * SOURCE_FILES - The paths of source files whose contents will be embedded in
+#   the header file.
+# * VARIABLE_NAME - The name prefix of the byte arrays. Each array is named
+#   "<VARIABLE_NAME>_<file>", where <file> is the source file name without
+#   extension, converted to a valid C identifier.
+# * HEADER_FILE - The path of the header file.
+# * APPEND - If specified, appends to the header file instead of overwriting it.
+# * HEADER_NAMESPACE - The namespace in which the arrays should be located.
+# * NULL_TERMINATE - If specified, a null byte (zero) will be appended to each
+#   byte array.
+#
+# Usage:
+#
+# bin2h(SOURCE_FILES "Logo.png" HEADER_FILE "Logo.h" VARIABLE_NAME "LOGO_PNG")
+function(BIN2H)
+  set(options APPEND NULL_TERMINATE)
+  set(oneValueArgs VARIABLE_NAME HEADER_FILE HEADER_NAMESPACE)
+  set(multiValueArgs SOURCE_FILES)
+  cmake_parse_arguments(BIN2H "${options}" "${oneValueArgs}"
+                        "${multiValueArgs}" ${ARGN})
+
+  # Accumulates one array definition per source file.
+  set(arrayDefinition "")
+  foreach(SOURCE_FILE IN LISTS BIN2H_SOURCE_FILES)
+    # get filename without extension
+    get_filename_component(FILE_NAME_WE ${SOURCE_FILE} NAME_WE)
+    # convert the filename to a valid C identifier
+    string(MAKE_C_IDENTIFIER "${FILE_NAME_WE}" VALID_FILE_NAME)
+
+    # reads source file contents as hex string
+    file(READ ${SOURCE_FILE} hexString HEX)
+
+    # append null
+    if(BIN2H_NULL_TERMINATE)
+      string(APPEND hexString "00")
+    endif()
+
+    # strip the © in source code (UTF-8 bytes c2 a9 become two spaces). This
+    # runs on the unwrapped hex string so that a pair which would straddle a
+    # wrap boundary is still replaced.
+    string(REGEX REPLACE "c2a9" "2020" hexString "${hexString}")
+
+    # wraps the hex string into multiple lines
+    wrap_string(VARIABLE hexString AT_COLUMN 24)
+
+    # turn every pair of hex digits into a " 0xNN," array element
+    string(REGEX REPLACE "([0-9a-f][0-9a-f])" " 0x\\1," arrayValues
+                         "${hexString}")
+
+    # make a full variable name for the array
+    set(FULL_VARIABLE_NAME "${BIN2H_VARIABLE_NAME}_${VALID_FILE_NAME}")
+
+    # declares the byte array
+    string(APPEND arrayDefinition
+           "constexpr char ${FULL_VARIABLE_NAME}[] = {${arrayValues}\n};\n\n")
+  endforeach()
+
+  # add namespace wrapper if defined; otherwise emit the arrays at global
+  # scope instead of dropping them
+  if(DEFINED BIN2H_HEADER_NAMESPACE)
+    set(namespaceStart "namespace ${BIN2H_HEADER_NAMESPACE} {")
+    set(namespaceEnd "} // namespace ${BIN2H_HEADER_NAMESPACE}")
+    set(declarations "${namespaceStart}\n\n${arrayDefinition}${namespaceEnd}\n")
+  else()
+    set(declarations "${arrayDefinition}")
+  endif()
+
+  set(arrayIncludes "#pragma once")
+  string(PREPEND declarations "${arrayIncludes}\n\n")
+
+  if(BIN2H_APPEND)
+    file(APPEND ${BIN2H_HEADER_FILE} "${declarations}")
+  else()
+    file(WRITE ${BIN2H_HEADER_FILE} "${declarations}")
+  endif()
+endfunction()
+
+# ----------------------------- CLI args -----------------------------
+
+# MLX_JIT_SOURCES arrives as a ":"-separated list of paths relative to
+# MLX_SOURCE_ROOT; turn it into a CMake list of absolute paths.
+string(REPLACE ":" ";" MLX_JIT_SOURCES_LIST ${MLX_JIT_SOURCES})
+foreach(source ${MLX_JIT_SOURCES_LIST})
+  list(APPEND MLX_JIT_SOURCES_ABS "${MLX_SOURCE_ROOT}/${source}")
+endforeach()
+
+# Generate gen/cuda_jit_sources.h with one null-terminated "jit_source_<file>"
+# array per source, inside namespace mlx::core.
+bin2h(
+  SOURCE_FILES
+  ${MLX_JIT_SOURCES_ABS}
+  NULL_TERMINATE
+  VARIABLE_NAME
+  "jit_source"
+  HEADER_NAMESPACE
+  "mlx::core"
+  HEADER_FILE
+  "${CMAKE_CURRENT_BINARY_DIR}/gen/cuda_jit_sources.h")
--- a/mlx/backend/cuda/binary.cu
+++ b/mlx/backend/cuda/binary.cu
@ -0,0 +1,292 @@
+// Copyright © 2025 Apple Inc.
+
+#include "mlx/backend/common/binary.h"
+#include "mlx/backend/cuda/device.h"
+#include "mlx/backend/cuda/device/binary_ops.cuh"
+#include "mlx/backend/cuda/device/cucomplex_math.cuh"
+#include "mlx/backend/cuda/kernel_utils.cuh"
+#include "mlx/dtype_utils.h"
+#include "mlx/primitives.h"
+
+#include <cooperative_groups.h>
+#include <nvtx3/nvtx3.hpp>
+
+namespace mlx::core {
+
+namespace cu {
+
+namespace cg = cooperative_groups;
+
+// Scalar-scalar binary kernel: every output element receives Op(a[0], b[0]).
+// One thread handles one output element; threads past `size` do nothing.
+template <typename Op, typename In, typename Out, typename IdxT>
+__global__ void binary_ss(const In* a, const In* b, Out* out, IdxT size) {
+  IdxT index = cg::this_grid().thread_rank();
+  if (index < size) {
+    out[index] = Op{}(a[0], b[0]);
+  }
+}
+
+// Scalar-vector binary kernel: out[i] = Op(a[0], b[i]).
+// One thread handles one output element; threads past `size` do nothing.
+template <typename Op, typename In, typename Out, typename IdxT>
+__global__ void binary_sv(const In* a, const In* b, Out* out, IdxT size) {
+  IdxT index = cg::this_grid().thread_rank();
+  if (index < size) {
+    out[index] = Op{}(a[0], b[index]);
+  }
+}
+
+// Vector-scalar binary kernel: out[i] = Op(a[i], b[0]).
+// One thread handles one output element; threads past `size` do nothing.
+template <typename Op, typename In, typename Out, typename IdxT>
+__global__ void binary_vs(const In* a, const In* b, Out* out, IdxT size) {
+  IdxT index = cg::this_grid().thread_rank();
+  if (index < size) {
+    out[index] = Op{}(a[index], b[0]);
+  }
+}
+
+// Vector-vector binary kernel: out[i] = Op(a[i], b[i]).
+// One thread handles one output element; threads past `size` do nothing.
+template <typename Op, typename In, typename Out, typename IdxT>
+__global__ void binary_vv(const In* a, const In* b, Out* out, IdxT size) {
+  IdxT index = cg::this_grid().thread_rank();
+  if (index < size) {
+    out[index] = Op{}(a[index], b[index]);
+  }
+}
+
+// General (strided) binary kernel for a compile-time number of dimensions.
+// The output is written at the linear index; the two input offsets are derived
+// from that index with elem_to_loc_nd using `shape` and the per-input strides.
+template <typename Op, typename In, typename Out, typename IdxT, int NDIM>
+__global__ void binary_g_nd(
+    const In* a,
+    const In* b,
+    Out* out,
+    IdxT size,
+    const __grid_constant__ cuda::std::array<int32_t, NDIM> shape,
+    const __grid_constant__ cuda::std::array<int64_t, NDIM> a_strides,
+    const __grid_constant__ cuda::std::array<int64_t, NDIM> b_strides) {
+  IdxT index = cg::this_grid().thread_rank();
+  if (index < size) {
+    auto [a_idx, b_idx] = elem_to_loc_nd<NDIM>(
+        index, shape.data(), a_strides.data(), b_strides.data());
+    out[index] = Op{}(a[a_idx], b[b_idx]);
+  }
+}
+
+// General (strided) binary kernel for a run-time number of dimensions `ndim`.
+// The output is written at the linear index; the two input offsets are derived
+// from that index with elem_to_loc_4d.
+// NOTE(review): the caller visible in this file uses this kernel when
+// ndim > 3; confirm that elem_to_loc_4d handles arbitrary ndim despite its
+// name.
+template <typename Op, typename In, typename Out, typename IdxT>
+__global__ void binary_g(
+    const In* a,
+    const In* b,
+    Out* out,
+    IdxT size,
+    const __grid_constant__ Shape shape,
+    const __grid_constant__ Strides a_strides,
+    const __grid_constant__ Strides b_strides,
+    int ndim) {
+  IdxT index = cg::this_grid().thread_rank();
+  if (index < size) {
+    auto [a_idx, b_idx] = elem_to_loc_4d(
+        index, shape.data(), a_strides.data(), b_strides.data(), ndim);
+    out[index] = Op{}(a[a_idx], b[b_idx]);
+  }
+}
+
+// Compile-time predicate: can binary op `Op` be instantiated for inputs of
+// type `In` producing outputs of type `Out`? Ops are grouped by family, and
+// each family has its own requirement on the In/Out pair. An Op outside every
+// family is unsupported.
+template <typename Op, typename In, typename Out>
+constexpr bool supports_binary_op() {
+  constexpr bool same_io = std::is_same_v<In, Out>;
+  constexpr bool bool_out = std::is_same_v<Out, bool>;
+
+  // Arithmetic family: output type equals input type.
+  constexpr bool arithmetic = std::is_same_v<Op, Add> ||
+      std::is_same_v<Op, Divide> || std::is_same_v<Op, Maximum> ||
+      std::is_same_v<Op, Minimum> || std::is_same_v<Op, Multiply> ||
+      std::is_same_v<Op, Subtract> || std::is_same_v<Op, Power> ||
+      std::is_same_v<Op, Remainder>;
+  if (arithmetic) {
+    return same_io;
+  }
+
+  // Comparison family: any input type, bool output.
+  constexpr bool comparison = std::is_same_v<Op, Equal> ||
+      std::is_same_v<Op, Greater> || std::is_same_v<Op, GreaterEqual> ||
+      std::is_same_v<Op, Less> || std::is_same_v<Op, LessEqual> ||
+      std::is_same_v<Op, NotEqual>;
+  if (comparison) {
+    return bool_out;
+  }
+
+  // Logical family: bool in, bool out.
+  constexpr bool logical =
+      std::is_same_v<Op, LogicalAnd> || std::is_same_v<Op, LogicalOr>;
+  if (logical) {
+    return bool_out && std::is_same_v<In, bool>;
+  }
+
+  // NaN-aware equality: inexact input, bool output.
+  if (std::is_same_v<Op, NaNEqual>) {
+    return bool_out && is_inexact_v<In>;
+  }
+
+  // LogAddExp: inexact types, output type equals input type.
+  if (std::is_same_v<Op, LogAddExp>) {
+    return same_io && is_inexact_v<In>;
+  }
+
+  // ArcTan2: floating types, output type equals input type.
+  if (std::is_same_v<Op, ArcTan2>) {
+    return same_io && is_floating_v<In>;
+  }
+
+  // Bitwise family: integral types (bool included).
+  constexpr bool bitwise = std::is_same_v<Op, BitwiseAnd> ||
+      std::is_same_v<Op, BitwiseOr> || std::is_same_v<Op, BitwiseXor>;
+  if (bitwise) {
+    return same_io && std::is_integral_v<In>;
+  }
+
+  // Shift family: integral types except bool.
+  constexpr bool shift =
+      std::is_same_v<Op, LeftShift> || std::is_same_v<Op, RightShift>;
+  if (shift) {
+    return same_io && std::is_integral_v<In> && !std::is_same_v<In, bool>;
+  }
+
+  return false;
+}
+
+} // namespace cu
+
+// Launches the binary kernel for `Op` on stream `s`, writing into `out`.
+//
+// `out` is expected to have its data set already; this function only encodes
+// and launches the kernel. Does nothing when `out` is empty. The kernel
+// variant is selected from the input/output dtypes and from the layout
+// classification returned by get_binary_op_type(a, b).
+//
+// Throws std::runtime_error when `Op` does not support the dtype combination
+// (see cu::supports_binary_op). `op` is only used in that error message.
+template <typename Op>
+void binary_op_gpu_inplace(
+    const std::vector<array>& inputs,
+    array& out,
+    std::string_view op,
+    const Stream& s) {
+  assert(inputs.size() > 1);
+  const auto& a = inputs[0];
+  const auto& b = inputs[1];
+  if (out.size() == 0) {
+    return;
+  }
+
+  auto& encoder = cu::get_command_encoder(s);
+  encoder.set_input_array(a);
+  encoder.set_input_array(b);
+  encoder.set_output_array(out);
+  encoder.launch_kernel([&](cudaStream_t stream) {
+    // Dispatch on the input dtype (taken from `a`) and the output dtype.
+    MLX_SWITCH_ALL_TYPES(a.dtype(), CTYPE_IN, {
+      MLX_SWITCH_ALL_TYPES(out.dtype(), CTYPE_OUT, {
+        if constexpr (cu::supports_binary_op<Op, CTYPE_IN, CTYPE_OUT>()) {
+          using InType = cuda_type_t<CTYPE_IN>;
+          using OutType = cuda_type_t<CTYPE_OUT>;
+          auto bopt = get_binary_op_type(a, b);
+          if (bopt == BinaryOpType::General) {
+            // Strided inputs: collapse contiguous dims, then pick the
+            // fixed-rank kernel for up to 3 dims or the run-time-rank kernel.
+            auto [shape, strides] = collapse_contiguous_dims(a, b, out);
+            auto& a_strides = strides[0];
+            auto& b_strides = strides[1];
+            // 64-bit indexing is needed as soon as any buffer exceeds
+            // INT32_MAX elements.
+            bool large = a.data_size() > INT32_MAX ||
+                b.data_size() > INT32_MAX || out.data_size() > INT32_MAX;
+            MLX_SWITCH_BOOL(large, LARGE, {
+              using IdxT = std::conditional_t<LARGE, int64_t, int32_t>;
+              int ndim = shape.size();
+              if (ndim <= 3) {
+                MLX_SWITCH_1_2_3(ndim, NDIM, {
+                  auto kernel =
+                      &cu::binary_g_nd<Op, InType, OutType, IdxT, NDIM>;
+                  auto [num_blocks, block_dims] =
+                      get_launch_args(kernel, out, large);
+                  kernel<<<num_blocks, block_dims, 0, stream>>>(
+                      a.data<InType>(),
+                      b.data<InType>(),
+                      out.data<OutType>(),
+                      out.size(),
+                      const_param<NDIM>(shape),
+                      const_param<NDIM>(a_strides),
+                      const_param<NDIM>(b_strides));
+                });
+              } else {
+                auto kernel = cu::binary_g<Op, InType, OutType, IdxT>;
+                auto [num_blocks, block_dims] =
+                    get_launch_args(kernel, out, large);
+                kernel<<<num_blocks, block_dims, 0, stream>>>(
+                    a.data<InType>(),
+                    b.data<InType>(),
+                    out.data<OutType>(),
+                    out.size(),
+                    const_param(shape),
+                    const_param(a_strides),
+                    const_param(b_strides),
+                    ndim);
+              }
+            });
+          } else {
+            // Scalar/vector combinations: inputs are indexed linearly, so the
+            // launch covers out.data_size() elements.
+            MLX_SWITCH_BOOL(out.data_size() > UINT32_MAX, LARGE, {
+              using IdxT = std::conditional_t<LARGE, int64_t, uint32_t>;
+              // Start from scalar-scalar and override for the other layouts.
+              auto kernel = cu::binary_ss<Op, InType, OutType, IdxT>;
+              if (bopt == BinaryOpType::ScalarVector) {
+                kernel = cu::binary_sv<Op, InType, OutType, IdxT>;
+              } else if (bopt == BinaryOpType::VectorScalar) {
+                kernel = cu::binary_vs<Op, InType, OutType, IdxT>;
+              } else if (bopt == BinaryOpType::VectorVector) {
+                kernel = cu::binary_vv<Op, InType, OutType, IdxT>;
+              }
+              auto [num_blocks, block_dims] = get_launch_args(
+                  kernel, out.data_size(), out.shape(), out.strides(), LARGE);
+              kernel<<<num_blocks, block_dims, 0, stream>>>(
+                  a.data<InType>(),
+                  b.data<InType>(),
+                  out.data<OutType>(),
+                  out.data_size());
+            });
+          }
+        } else {
+          throw std::runtime_error(fmt::format(
+              "Can not do binary op {} on inputs of {} with result of {}.",
+              op,
+              dtype_to_string(a.dtype()),
+              dtype_to_string(out.dtype())));
+        }
+      });
+    });
+  });
+}
+
+// Allocating front end for single-output binary ops: prepares the data of
+// `out` through set_binary_op_output_data, using the binary-op type computed
+// for the two operands, then forwards to binary_op_gpu_inplace.
+template <typename Op>
+void binary_op_gpu(
+    const std::vector<array>& inputs,
+    array& out,
+    std::string_view op,
+    const Stream& s) {
+  const array& lhs = inputs[0];
+  const array& rhs = inputs[1];
+  set_binary_op_output_data(lhs, rhs, out, get_binary_op_type(lhs, rhs));
+  binary_op_gpu_inplace<Op>(inputs, out, op, s);
+}
+
+// Defines `func::eval_gpu` for a single-output binary primitive. The generated
+// body opens an NVTX range named "<func>::eval_gpu", takes the stream from the
+// output's primitive, and dispatches to binary_op_gpu with the functor
+// `cu::func`.
+#define BINARY_GPU(func)                                                 \
+  void func::eval_gpu(const std::vector<array>& inputs, array& out) {    \
+    nvtx3::scoped_range r(#func "::eval_gpu");                           \
+    auto& s = out.primitive().stream();                                  \
+    binary_op_gpu<cu::func>(inputs, out, get_primitive_string(this), s); \
+  }
+
+// Primitives whose eval_gpu is generated by BINARY_GPU.
+BINARY_GPU(Add)
+BINARY_GPU(ArcTan2)
+BINARY_GPU(Divide)
+BINARY_GPU(Remainder)
+BINARY_GPU(Greater)
+BINARY_GPU(GreaterEqual)
+BINARY_GPU(Less)
+BINARY_GPU(LessEqual)
+BINARY_GPU(LogicalAnd)
+BINARY_GPU(LogicalOr)
+BINARY_GPU(LogAddExp)
+BINARY_GPU(Maximum)
+BINARY_GPU(Minimum)
+BINARY_GPU(Multiply)
+BINARY_GPU(NotEqual)
+BINARY_GPU(Power)
+BINARY_GPU(Subtract)
+
+// GPU evaluation of Equal. `equal_nan_` picks the functor: cu::NaNEqual when
+// it is set, cu::Equal otherwise.
+void Equal::eval_gpu(const std::vector<array>& inputs, array& out) {
+  nvtx3::scoped_range r("Equal::eval_gpu");
+  auto& stream = out.primitive().stream();
+  auto name = get_primitive_string(this);
+  if (!equal_nan_) {
+    binary_op_gpu<cu::Equal>(inputs, out, name, stream);
+    return;
+  }
+  binary_op_gpu<cu::NaNEqual>(inputs, out, name, stream);
+}
+
+// GPU evaluation of BitwiseBinary: maps `op_` to the matching device functor
+// and runs it through binary_op_gpu. Values of `op_` without a case fall
+// through the switch and do nothing.
+void BitwiseBinary::eval_gpu(const std::vector<array>& inputs, array& out) {
+  nvtx3::scoped_range r("BitwiseBinary::eval_gpu");
+  auto& stream = out.primitive().stream();
+  auto name = get_primitive_string(this);
+  switch (op_) {
+    case BitwiseBinary::And:
+      return binary_op_gpu<cu::BitwiseAnd>(inputs, out, name, stream);
+    case BitwiseBinary::Or:
+      return binary_op_gpu<cu::BitwiseOr>(inputs, out, name, stream);
+    case BitwiseBinary::Xor:
+      return binary_op_gpu<cu::BitwiseXor>(inputs, out, name, stream);
+    case BitwiseBinary::LeftShift:
+      return binary_op_gpu<cu::LeftShift>(inputs, out, name, stream);
+    case BitwiseBinary::RightShift:
+      return binary_op_gpu<cu::RightShift>(inputs, out, name, stream);
+  }
+}
+
+} // namespace mlx::core
--- a/mlx/backend/cuda/binary_two.cu
+++ b/mlx/backend/cuda/binary_two.cu
@ -0,0 +1,248 @@
+// Copyright © 2025 Apple Inc.
+
+#include "mlx/backend/common/binary.h"
+#include "mlx/backend/cuda/device.h"
+#include "mlx/backend/cuda/device/binary_ops.cuh"
+#include "mlx/backend/cuda/device/cucomplex_math.cuh"
+#include "mlx/backend/cuda/kernel_utils.cuh"
+#include "mlx/dtype_utils.h"
+#include "mlx/primitives.h"
+
+#include <cooperative_groups.h>
+#include <nvtx3/nvtx3.hpp>
+
+namespace mlx::core {
+
+namespace cu {
+
+namespace cg = cooperative_groups;
+
+// Scalar/scalar kernel for two-output ops: every in-range thread applies Op
+// to a[0] and b[0] and stores the result pair in element 0 of both outputs.
+template <typename Op, typename In, typename Out, typename IdxT>
+__global__ void
+binary_ss(const In* a, const In* b, Out* out_a, Out* out_b, IdxT size) {
+  IdxT tid = cg::this_grid().thread_rank();
+  if (tid >= size) {
+    return;
+  }
+  auto result = Op{}(a[0], b[0]);
+  out_a[0] = result[0];
+  out_b[0] = result[1];
+}
+
+// Scalar/vector kernel for two-output ops: thread i combines a[0] with b[i]
+// and stores the result pair at index i of both outputs.
+template <typename Op, typename In, typename Out, typename IdxT>
+__global__ void
+binary_sv(const In* a, const In* b, Out* out_a, Out* out_b, IdxT size) {
+  IdxT tid = cg::this_grid().thread_rank();
+  if (tid >= size) {
+    return;
+  }
+  auto result = Op{}(a[0], b[tid]);
+  out_a[tid] = result[0];
+  out_b[tid] = result[1];
+}
+
+// Vector/scalar kernel for two-output ops: thread i combines a[i] with b[0]
+// and stores the result pair at index i of both outputs.
+template <typename Op, typename In, typename Out, typename IdxT>
+__global__ void
+binary_vs(const In* a, const In* b, Out* out_a, Out* out_b, IdxT size) {
+  IdxT tid = cg::this_grid().thread_rank();
+  if (tid >= size) {
+    return;
+  }
+  auto result = Op{}(a[tid], b[0]);
+  out_a[tid] = result[0];
+  out_b[tid] = result[1];
+}
+
+// Vector/vector kernel for two-output ops: thread i combines a[i] with b[i]
+// and stores the result pair at index i of both outputs.
+template <typename Op, typename In, typename Out, typename IdxT>
+__global__ void
+binary_vv(const In* a, const In* b, Out* out_a, Out* out_b, IdxT size) {
+  IdxT tid = cg::this_grid().thread_rank();
+  if (tid >= size) {
+    return;
+  }
+  auto result = Op{}(a[tid], b[tid]);
+  out_a[tid] = result[0];
+  out_b[tid] = result[1];
+}
+
+// Strided two-output kernel with a compile-time number of dimensions. The
+// thread index is mapped to one location per input with elem_to_loc_nd; both
+// outputs are written at the thread index itself.
+template <typename Op, typename In, typename Out, typename IdxT, int NDIM>
+__global__ void binary_g_nd(
+    const In* a,
+    const In* b,
+    Out* out_a,
+    Out* out_b,
+    IdxT size,
+    const __grid_constant__ cuda::std::array<int32_t, NDIM> shape,
+    const __grid_constant__ cuda::std::array<int64_t, NDIM> a_strides,
+    const __grid_constant__ cuda::std::array<int64_t, NDIM> b_strides) {
+  IdxT tid = cg::this_grid().thread_rank();
+  if (tid >= size) {
+    return;
+  }
+  auto [loc_a, loc_b] = elem_to_loc_nd<NDIM>(
+      tid, shape.data(), a_strides.data(), b_strides.data());
+  auto result = Op{}(a[loc_a], b[loc_b]);
+  out_a[tid] = result[0];
+  out_b[tid] = result[1];
+}
+
+// Strided two-output kernel with a run-time number of dimensions. The thread
+// index is mapped to one location per input with elem_to_loc_4d; both outputs
+// are written at the thread index itself.
+template <typename Op, typename In, typename Out, typename IdxT>
+__global__ void binary_g(
+    const In* a,
+    const In* b,
+    Out* out_a,
+    Out* out_b,
+    IdxT size,
+    const __grid_constant__ Shape shape,
+    const __grid_constant__ Strides a_strides,
+    const __grid_constant__ Strides b_strides,
+    int ndim) {
+  IdxT tid = cg::this_grid().thread_rank();
+  if (tid >= size) {
+    return;
+  }
+  auto [loc_a, loc_b] = elem_to_loc_4d(
+      tid, shape.data(), a_strides.data(), b_strides.data(), ndim);
+  auto result = Op{}(a[loc_a], b[loc_b]);
+  out_a[tid] = result[0];
+  out_b[tid] = result[1];
+}
+
+// Compile-time predicate for the two-output dispatch: true only when Op is
+// DivMod, the input and output types are the same, and that type is integral
+// or floating point.
+template <typename Op, typename In, typename Out>
+constexpr bool supports_binary_op() {
+  constexpr bool is_divmod = std::is_same_v<Op, DivMod>;
+  constexpr bool same_type = std::is_same_v<In, Out>;
+  constexpr bool numeric = std::is_integral_v<Out> || is_floating_v<Out>;
+  return is_divmod && same_type && numeric;
+}
+
+} // namespace cu
+
+// Launches the two-output binary kernel for `Op` over inputs[0] and
+// inputs[1], writing into outputs[0] and outputs[1].
+//
+// "In place" means the outputs are used as they are: the caller must have
+// prepared their data before calling (binary_op_gpu does this through
+// set_binary_op_output_data). Preparing them here as well would run that
+// setup a second time on every call.
+//
+// `op` is only used in the message of the std::runtime_error thrown when
+// `Op` is not supported for the input/output dtype pair.
+template <typename Op>
+void binary_op_gpu_inplace(
+    const std::vector<array>& inputs,
+    std::vector<array>& outputs,
+    std::string_view op,
+    const Stream& s) {
+  assert(inputs.size() > 1);
+  const auto& a = inputs[0];
+  const auto& b = inputs[1];
+  auto& out_a = outputs[0];
+  auto& out_b = outputs[1];
+
+  // Nothing to launch for empty outputs.
+  if (out_a.size() == 0) {
+    return;
+  }
+
+  auto& encoder = cu::get_command_encoder(s);
+  encoder.set_input_array(a);
+  encoder.set_input_array(b);
+  encoder.set_output_array(out_a);
+  encoder.set_output_array(out_b);
+  encoder.launch_kernel([&](cudaStream_t stream) {
+    MLX_SWITCH_ALL_TYPES(a.dtype(), CTYPE_IN, {
+      MLX_SWITCH_ALL_TYPES(out_a.dtype(), CTYPE_OUT, {
+        if constexpr (cu::supports_binary_op<Op, CTYPE_IN, CTYPE_OUT>()) {
+          using InType = cuda_type_t<CTYPE_IN>;
+          using OutType = cuda_type_t<CTYPE_OUT>;
+
+          auto bopt = get_binary_op_type(a, b);
+          if (bopt == BinaryOpType::General) {
+            // Strided path: index through collapsed shape and strides.
+            auto [shape, strides] = collapse_contiguous_dims(a, b, out_a);
+            auto& a_strides = strides[0];
+            auto& b_strides = strides[1];
+            // 64-bit indexing as soon as any buffer exceeds INT32_MAX.
+            bool large = a.data_size() > INT32_MAX ||
+                b.data_size() > INT32_MAX || out_a.data_size() > INT32_MAX;
+            MLX_SWITCH_BOOL(large, LARGE, {
+              using IdxT = std::conditional_t<LARGE, int64_t, int32_t>;
+              int ndim = shape.size();
+              if (ndim <= 3) {
+                MLX_SWITCH_1_2_3(ndim, NDIM, {
+                  auto kernel =
+                      &cu::binary_g_nd<Op, InType, OutType, IdxT, NDIM>;
+                  auto [num_blocks, block_dims] =
+                      get_launch_args(kernel, out_a, large);
+                  kernel<<<num_blocks, block_dims, 0, stream>>>(
+                      a.data<InType>(),
+                      b.data<InType>(),
+                      out_a.data<OutType>(),
+                      out_b.data<OutType>(),
+                      out_a.size(),
+                      const_param<NDIM>(shape),
+                      const_param<NDIM>(a_strides),
+                      const_param<NDIM>(b_strides));
+                });
+              } else {
+                auto kernel = cu::binary_g<Op, InType, OutType, IdxT>;
+                auto [num_blocks, block_dims] =
+                    get_launch_args(kernel, out_a, large);
+                kernel<<<num_blocks, block_dims, 0, stream>>>(
+                    a.data<InType>(),
+                    b.data<InType>(),
+                    out_a.data<OutType>(),
+                    out_b.data<OutType>(),
+                    out_a.size(),
+                    const_param(shape),
+                    const_param(a_strides),
+                    const_param(b_strides),
+                    ndim);
+              }
+            });
+          } else {
+            // Contiguous path: scalar/vector combinations share a signature,
+            // so the kernel is picked at run time from `bopt`.
+            MLX_SWITCH_BOOL(out_a.data_size() > UINT32_MAX, LARGE, {
+              using IdxT = std::conditional_t<LARGE, int64_t, uint32_t>;
+              auto kernel = cu::binary_ss<Op, InType, OutType, IdxT>;
+              if (bopt == BinaryOpType::ScalarVector) {
+                kernel = cu::binary_sv<Op, InType, OutType, IdxT>;
+              } else if (bopt == BinaryOpType::VectorScalar) {
+                kernel = cu::binary_vs<Op, InType, OutType, IdxT>;
+              } else if (bopt == BinaryOpType::VectorVector) {
+                kernel = cu::binary_vv<Op, InType, OutType, IdxT>;
+              }
+              auto [num_blocks, block_dims] = get_launch_args(
+                  kernel,
+                  out_a.data_size(),
+                  out_a.shape(),
+                  out_a.strides(),
+                  LARGE);
+              kernel<<<num_blocks, block_dims, 0, stream>>>(
+                  a.data<InType>(),
+                  b.data<InType>(),
+                  out_a.data<OutType>(),
+                  out_b.data<OutType>(),
+                  out_a.data_size());
+            });
+          }
+        } else {
+          throw std::runtime_error(fmt::format(
+              "Can not do binary op {} on inputs of {} with result of {}.",
+              op,
+              dtype_to_string(a.dtype()),
+              dtype_to_string(out_a.dtype())));
+        }
+      });
+    });
+  });
+}
+
+// Allocating front end for two-output binary ops: prepares outputs[0] and
+// outputs[1] through set_binary_op_output_data, using the binary-op type
+// computed for the two operands, then calls binary_op_gpu_inplace.
+template <typename Op>
+void binary_op_gpu(
+    const std::vector<array>& inputs,
+    std::vector<array>& outputs,
+    std::string_view op,
+    const Stream& s) {
+  const array& lhs = inputs[0];
+  const array& rhs = inputs[1];
+  const auto layout = get_binary_op_type(lhs, rhs);
+  for (size_t i = 0; i < 2; ++i) {
+    set_binary_op_output_data(lhs, rhs, outputs[i], layout);
+  }
+  binary_op_gpu_inplace<Op>(inputs, outputs, op, s);
+}
+
+// GPU evaluation of DivMod: runs cu::DivMod through the two-output
+// binary_op_gpu on the stream of the first output's primitive.
+void DivMod::eval_gpu(
+    const std::vector<array>& inputs,
+    std::vector<array>& outputs) {
+  nvtx3::scoped_range r("DivMod::eval_gpu");
+  auto& stream = outputs[0].primitive().stream();
+  auto name = get_primitive_string(this);
+  binary_op_gpu<cu::DivMod>(inputs, outputs, name, stream);
+}
+
+} // namespace mlx::core
--- a/mlx/backend/cuda/compiled.cpp
+++ b/mlx/backend/cuda/compiled.cpp
@ -0,0 +1,230 @@
+// Copyright © 2025 Apple Inc.
+
+#include "mlx/backend/common/compiled.h"
+#include "mlx/backend/cuda/device.h"
+#include "mlx/backend/cuda/jit_module.h"
+#include "mlx/graph_utils.h"
+#include "mlx/primitives.h"
+
+#include <fmt/format.h>
+#include <nvtx3/nvtx3.hpp>
+
+namespace mlx::core {
+
+namespace cu {
+
+// Generates the CUDA source text of a fused elementwise kernel. Generated
+// text is appended to `os`; every other member is a reference to data owned
+// by whoever constructs the builder.
+struct FusedKernelBuilder {
+  std::string os;
+  const std::string& kernel_name;
+  const std::vector<array>& inputs;
+  const std::vector<array>& outputs;
+  const std::vector<array>& tape;
+  const std::function<bool(size_t)>& is_constant;
+
+  // Appends one `__global__` kernel named `kernel_name + name` to `os`.
+  // With `contiguous` set, non-scalar inputs are read at the thread index and
+  // the kernel is templated on IdxT only. Otherwise the kernel is also
+  // templated on NDIM, takes a shape plus one strides array per non-scalar
+  // input, and maps the thread index through elem_to_loc_nd.
+  void build(const char* name, bool contiguous) {
+    NodeNamer namer;
+
+    // Function parameters.
+    std::vector<std::string> params;
+    for (size_t i = 0; i < inputs.size(); ++i) {
+      // Constant inputs are emitted as literals further down, so they get no
+      // kernel parameter.
+      if (is_constant(i)) {
+        continue;
+      }
+      const auto& x = inputs[i];
+      const std::string& xname = namer.get_name(x);
+      params.push_back(
+          fmt::format("const {}* {}", dtype_to_cuda_type(x.dtype()), xname));
+      if (!is_scalar(x) && !contiguous) {
+        params.push_back(fmt::format(
+            "const __grid_constant__ cuda::std::array<int64_t, NDIM> {}_strides",
+            xname));
+      }
+    }
+    for (const auto& x : outputs) {
+      params.push_back(fmt::format(
+          "{}* {}", dtype_to_cuda_type(x.dtype()), namer.get_name(x)));
+    }
+    if (!contiguous) {
+      params.push_back(
+          "const __grid_constant__ cuda::std::array<int32_t, NDIM> shape");
+    }
+    params.push_back("IdxT size");
+
+    // Build function signature.
+    if (contiguous) {
+      os += "template <typename IdxT = uint32_t>\n";
+    } else {
+      os += "template <int NDIM, typename IdxT = uint32_t>\n";
+    }
+    os += fmt::format("__global__ void {}(\n", kernel_name + name);
+    for (size_t i = 0; i < params.size(); ++i) {
+      os += "    ";
+      os += params[i];
+      if (i != params.size() - 1) {
+        os += ",\n";
+      }
+    }
+    os += ") {\n";
+
+    // Index.
+    os +=
+        "  IdxT index = cg::this_grid().thread_rank();\n"
+        "  if (index >= size) {\n"
+        "    return;\n"
+        "  }\n";
+
+    // Read inputs.
+    // Each input becomes a local `tmp_<name>`: a literal for constants,
+    // element 0 for scalars, element `index` for contiguous kernels, and a
+    // strided lookup otherwise.
+    for (size_t i = 0; i < inputs.size(); ++i) {
+      const auto& x = inputs[i];
+      const std::string& xname = namer.get_name(x);
+      std::string type = dtype_to_cuda_type(x.dtype());
+      std::string value;
+      if (is_constant(i)) {
+        std::ostringstream ss;
+        print_constant(ss, x);
+        value = fmt::format("static_cast<{}>({})", type, ss.str());
+      } else if (is_scalar(x)) {
+        value = fmt::format("{}[0]", xname);
+      } else if (contiguous) {
+        value = fmt::format("{}[index]", xname);
+      } else {
+        std::string index = fmt::format(
+            "elem_to_loc_nd<NDIM>(index, shape.data(), {}_strides.data())",
+            xname);
+        value = fmt::format("{}[{}]", xname, index);
+      }
+      os += fmt::format("  {} tmp_{} = {};\n", type, xname, value);
+    }
+
+    // Write tape.
+    // Static casts are emitted as `static_cast<T>(tmp_in)`; any other
+    // primitive is emitted as `<printed primitive>{}(tmp_in0, tmp_in1, ...)`.
+    for (const auto& x : tape) {
+      const std::string& xname = namer.get_name(x);
+      std::string type = dtype_to_cuda_type(x.dtype());
+      std::string value;
+      if (is_static_cast(x.primitive())) {
+        value = fmt::format(
+            "static_cast<{}>(tmp_{})", type, namer.get_name(x.inputs()[0]));
+      } else {
+        std::ostringstream ss;
+        x.primitive().print(ss);
+        value = ss.str();
+        value += "{}(";
+        // NOTE(review): `size() - 1` is unsigned and `.back()` is used below,
+        // so this assumes every tape entry has at least one input - confirm.
+        for (size_t i = 0; i < x.inputs().size() - 1; ++i) {
+          value += fmt::format("tmp_{}, ", namer.get_name(x.inputs()[i]));
+        }
+        value += fmt::format("tmp_{})", namer.get_name(x.inputs().back()));
+      }
+      os += fmt::format("  {} tmp_{} = {};\n", type, xname, value);
+    }
+
+    // Write output.
+    for (const auto& x : outputs) {
+      os += fmt::format("  {0}[index] = tmp_{0};\n", namer.get_name(x));
+    }
+
+    os += "}\n";
+  }
+};
+
+} // namespace cu
+
+// Preamble of every generated fused-kernel source: Compiled::eval_gpu uses it
+// as the initial contents of the builder's output string. It pulls in the
+// device op headers and cooperative groups, and defines `inf` as float
+// infinity (presumably so that constants printed as `inf` compile - confirm
+// against print_constant).
+constexpr const char* g_jit_includes = R"(
+#include "mlx/backend/cuda/device/binary_ops.cuh"
+#include "mlx/backend/cuda/device/ternary_ops.cuh"
+#include "mlx/backend/cuda/device/unary_ops.cuh"
+#include "mlx/backend/cuda/device/utils.cuh"
+
+#include <cooperative_groups.h>
+
+#define inf cuda::std::numeric_limits<float>::infinity()
+)";
+
+// GPU evaluation of a compiled (fused) graph.
+//
+// Obtains the JIT module for `lib_name()`, supplying a callback that builds
+// the source and the list of kernel instantiations, then picks the
+// contiguous or strided variant and the index width for this call, appends
+// the kernel arguments in the order of the generated signature, and launches.
+void Compiled::eval_gpu(
+    const std::vector<array>& inputs,
+    std::vector<array>& outputs) {
+  nvtx3::scoped_range r("Compiled::eval_gpu");
+  auto& s = stream();
+
+  cu::JitModule& mod = cu::get_jit_module(s.device, lib_name(), [&]() {
+    // Build source code.
+    cu::FusedKernelBuilder builder{
+        g_jit_includes, lib_name(), inputs_, outputs_, tape_, is_constant_};
+    builder.os +=
+        "namespace mlx::core::cu {\n\n"
+        "namespace cg = cooperative_groups;\n\n";
+    builder.build("_contiguous", true);
+    builder.os += "\n";
+    builder.build("_strided", false);
+    builder.os += "\n} // namespace mlx::core::cu\n";
+    // Build kernel names.
+    // One instantiation per index type for the contiguous kernel, and one per
+    // (NDIM, index type) pair for the strided kernel, NDIM in 1..MAX_NDIM.
+    std::vector<std::string> kernel_names = {
+        fmt::format("mlx::core::cu::{}_contiguous<uint32_t>", lib_name()),
+        fmt::format("mlx::core::cu::{}_contiguous<int64_t>", lib_name()),
+    };
+    for (int i = 1; i <= MAX_NDIM; ++i) {
+      kernel_names.push_back(fmt::format(
+          "mlx::core::cu::{}_strided<{}, uint32_t>", lib_name(), i));
+      kernel_names.push_back(
+          fmt::format("mlx::core::cu::{}_strided<{}, int64_t>", lib_name(), i));
+    }
+    return std::make_pair(std::move(builder.os), std::move(kernel_names));
+  });
+
+  // Collapse contiguous dims to route to a faster kernel if possible. Also
+  // handle all broadcasting.
+  auto [contiguous, shape, strides_vec] =
+      compiled_collapse_contiguous_dims(inputs, outputs[0], is_constant_);
+
+  // Whether to use large index.
+  bool large = compiled_use_large_index(inputs, outputs, contiguous);
+
+  // Put inputs.
+  // Mirrors the parameter order of the generated kernel: pointer first, then
+  // strides for non-scalar inputs of the strided variant.
+  // NOTE(review): strides_index starts at 1, presumably because
+  // strides_vec[0] belongs to the output - confirm against
+  // compiled_collapse_contiguous_dims.
+  int strides_index = 1;
+  for (size_t i = 0; i < inputs.size(); ++i) {
+    if (is_constant_(i)) {
+      continue;
+    }
+    const auto& x = inputs[i];
+    mod.append_arg(x);
+    if (!contiguous && !is_scalar(x)) {
+      mod.append_arg(strides_vec[strides_index++]);
+    }
+  }
+
+  // Put outputs.
+  compiled_allocate_outputs(inputs, outputs, is_constant_, contiguous);
+  for (auto& x : outputs) {
+    mod.append_arg(x);
+  }
+
+  // Put shape and size.
+  if (!contiguous) {
+    mod.append_arg(shape);
+  }
+  // The size argument's type must match the IdxT of the selected kernel.
+  if (large) {
+    mod.append_arg<int64_t>(outputs[0].data_size());
+  } else {
+    mod.append_arg<uint32_t>(outputs[0].data_size());
+  }
+
+  // Launch kernel.
+  // The name must match one of the instantiations registered above.
+  const char* index_type = large ? "int64_t" : "uint32_t";
+  std::string kernel_name = fmt::format("mlx::core::cu::{}", lib_name());
+  if (contiguous) {
+    kernel_name += fmt::format("_contiguous<{}>", index_type);
+  } else {
+    kernel_name += fmt::format("_strided<{}, {}>", shape.size(), index_type);
+  }
+  auto& encoder = cu::get_command_encoder(s);
+  for (const auto& in : inputs) {
+    encoder.set_input_array(in);
+  }
+  for (const auto& out : outputs) {
+    encoder.set_output_array(out);
+  }
+  encoder.launch_kernel([&](cudaStream_t stream) {
+    mod.launch_kernel(stream, kernel_name, outputs[0], large);
+  });
+}
+
+} // namespace mlx::core
--- a/mlx/backend/cuda/copy.cu
+++ b/mlx/backend/cuda/copy.cu
@ -0,0 +1,87 @@
+// Copyright © 2025 Apple Inc.
+
+#include "mlx/backend/common/utils.h"
+#include "mlx/backend/cuda/copy/copy.cuh"
+
+namespace mlx::core {
+
+// Copies `in` into `out` on stream `s` without allocating `out`.
+//
+// `ctype` selects the implementation:
+//   - Scalar / Vector: contiguous copy (copy_contiguous).
+//   - General: strided input, output written at the linear index
+//     (copy_general_input).
+//   - GeneralGeneral: strided input and output (copy_general), or
+//     copy_general_dynamic when either dynamic offset is given.
+// For the General variants, `shape` and both stride sets are collapsed with
+// collapse_contiguous_dims before dispatch. `offset_in` / `offset_out` are
+// element offsets added to the data pointers. Does nothing when `out` is
+// empty or `ctype` is none of the values above.
+//
+// The dynamic offset arrays are dereferenced by the dynamic kernels, so they
+// are registered with the encoder as inputs in the same way as `in`.
+void copy_gpu_inplace(
+    const array& in,
+    array& out,
+    const Shape& shape,
+    const Strides& strides_in,
+    const Strides& strides_out,
+    int64_t offset_in,
+    int64_t offset_out,
+    CopyType ctype,
+    const Stream& s,
+    const std::optional<array>& dynamic_offset_in,
+    const std::optional<array>& dynamic_offset_out) {
+  if (out.size() == 0) {
+    return;
+  }
+
+  auto& encoder = cu::get_command_encoder(s);
+  encoder.set_input_array(in);
+  encoder.set_output_array(out);
+  if (ctype == CopyType::Scalar || ctype == CopyType::Vector) {
+    copy_contiguous(encoder, ctype, in, out, offset_in, offset_out);
+    return;
+  }
+
+  if (ctype == CopyType::General || ctype == CopyType::GeneralGeneral) {
+    auto [shape_collapsed, strides_vec] = collapse_contiguous_dims(
+        shape, std::vector{strides_in, strides_out}, INT32_MAX);
+    if (ctype == CopyType::General) {
+      copy_general_input(
+          encoder,
+          ctype,
+          in,
+          out,
+          offset_in,
+          offset_out,
+          shape_collapsed,
+          strides_vec[0]);
+    } else if (dynamic_offset_in || dynamic_offset_out) {
+      // A missing offset is replaced by a zero scalar. Both offsets are
+      // kernel inputs, so register them before launching.
+      const array dyn_offset_in =
+          dynamic_offset_in ? *dynamic_offset_in : array(0, int64);
+      const array dyn_offset_out =
+          dynamic_offset_out ? *dynamic_offset_out : array(0, int64);
+      encoder.set_input_array(dyn_offset_in);
+      encoder.set_input_array(dyn_offset_out);
+      copy_general_dynamic(
+          encoder,
+          ctype,
+          in,
+          out,
+          offset_in,
+          offset_out,
+          shape_collapsed,
+          strides_vec[0],
+          strides_vec[1],
+          dyn_offset_in,
+          dyn_offset_out);
+    } else {
+      copy_general(
+          encoder,
+          ctype,
+          in,
+          out,
+          offset_in,
+          offset_out,
+          shape_collapsed,
+          strides_vec[0],
+          strides_vec[1]);
+    }
+    return;
+  }
+}
+
+// Allocates `out` and fills it by running the contiguous Scalar copy from
+// `in` on stream `s`. An empty `out` is left untouched.
+void fill_gpu(const array& in, array& out, const Stream& s) {
+  if (out.size() != 0) {
+    out.set_data(allocator::malloc(out.nbytes()));
+    auto& enc = cu::get_command_encoder(s);
+    enc.set_input_array(in);
+    enc.set_output_array(out);
+    copy_contiguous(enc, CopyType::Scalar, in, out, 0, 0);
+  }
+}
+
+} // namespace mlx::core
--- a/mlx/backend/cuda/copy/copy.cuh
+++ b/mlx/backend/cuda/copy/copy.cuh
@ -0,0 +1,64 @@
+// Copyright © 2025 Apple Inc.
+
+#pragma once
+
+#include "mlx/backend/cuda/device.h"
+#include "mlx/backend/cuda/device/cast_op.cuh"
+#include "mlx/backend/cuda/kernel_utils.cuh"
+#include "mlx/backend/gpu/copy.h"
+#include "mlx/dtype_utils.h"
+
+namespace mlx::core {
+
+// Dispatches on the dtypes of `in` and `out`. Inside the body passed as the
+// trailing argument(s), the aliases named by `InType` and `OutType` are bound
+// to cuda_type_t of the selected input and output C types.
+#define MLX_SWITCH_COPY_TYPES(in, out, InType, OutType, ...) \
+  MLX_SWITCH_ALL_TYPES(in.dtype(), CTYPE_IN, {               \
+    MLX_SWITCH_ALL_TYPES(out.dtype(), CTYPE_OUT, {           \
+      using InType = cuda_type_t<CTYPE_IN>;                  \
+      using OutType = cuda_type_t<CTYPE_OUT>;                \
+      __VA_ARGS__;                                           \
+    });                                                      \
+  })
+
+// Contiguous copy with dtype cast. `offset_in` and `offset_out` are element
+// offsets added to the input and output data pointers.
+void copy_contiguous(
+    cu::CommandEncoder& encoder,
+    CopyType ctype,
+    const array& in,
+    array& out,
+    int64_t offset_in,
+    int64_t offset_out);
+
+// Strided copy with dtype cast where both the input and the output location
+// are computed from `shape` with `strides_in` / `strides_out`. `offset_in`
+// and `offset_out` are element offsets added to the data pointers.
+void copy_general(
+    cu::CommandEncoder& encoder,
+    CopyType ctype,
+    const array& in,
+    array& out,
+    int64_t offset_in,
+    int64_t offset_out,
+    const Shape& shape,
+    const Strides& strides_in,
+    const Strides& strides_out);
+
+// Like copy_general, with two additional element offsets that are read on
+// the device from `dynamic_offset_in` and `dynamic_offset_out` (accessed as
+// int64_t) and added to the input and output locations.
+void copy_general_dynamic(
+    cu::CommandEncoder& encoder,
+    CopyType ctype,
+    const array& in,
+    array& out,
+    int64_t offset_in,
+    int64_t offset_out,
+    const Shape& shape,
+    const Strides& strides_in,
+    const Strides& strides_out,
+    const array& dynamic_offset_in,
+    const array& dynamic_offset_out);
+
+// Strided-input copy with dtype cast: the input location is computed from
+// `shape` and `strides_in`, while the output is written at the linear index.
+// `offset_in` and `offset_out` are element offsets added to the data
+// pointers.
+void copy_general_input(
+    cu::CommandEncoder& encoder,
+    CopyType ctype,
+    const array& in,
+    array& out,
+    int64_t offset_in,
+    int64_t offset_out,
+    const Shape& shape,
+    const Strides& strides_in);
+
+} // namespace mlx::core
--- a/mlx/backend/cuda/copy/copy_contiguous.cu
+++ b/mlx/backend/cuda/copy/copy_contiguous.cu
@ -0,0 +1,57 @@
+// Copyright © 2025 Apple Inc.
+
+#include "mlx/backend/cuda/copy/copy.cuh"
+
+#include <cooperative_groups.h>
+
+namespace mlx::core {
+
+namespace cu {
+
+namespace cg = cooperative_groups;
+
+// Broadcast copy: every in-range thread stores the cast of in[0] into its
+// own output element.
+template <typename In, typename Out, typename IdxT>
+__global__ void copy_s(const In* in, Out* out, IdxT size) {
+  IdxT tid = cg::this_grid().thread_rank();
+  if (tid >= size) {
+    return;
+  }
+  out[tid] = CastOp<In, Out>{}(in[0]);
+}
+
+// Element-wise copy: every in-range thread stores the cast of its own input
+// element into the output element with the same index.
+template <typename In, typename Out, typename IdxT>
+__global__ void copy_v(const In* in, Out* out, IdxT size) {
+  IdxT tid = cg::this_grid().thread_rank();
+  if (tid >= size) {
+    return;
+  }
+  out[tid] = CastOp<In, Out>{}(in[tid]);
+}
+
+} // namespace cu
+
+// Launches the contiguous copy kernel over out.data_size() elements:
+// copy_v when `ctype` is CopyType::Vector, copy_s for every other value.
+// Indices are 64-bit only when out.data_size() exceeds UINT32_MAX.
+void copy_contiguous(
+    cu::CommandEncoder& encoder,
+    CopyType ctype,
+    const array& in,
+    array& out,
+    int64_t in_offset,
+    int64_t out_offset) {
+  encoder.launch_kernel([&](cudaStream_t stream) {
+    MLX_SWITCH_COPY_TYPES(in, out, InType, OutType, {
+      MLX_SWITCH_BOOL(out.data_size() > UINT32_MAX, LARGE, {
+        using IdxT = std::conditional_t<LARGE, int64_t, uint32_t>;
+        // Start from the element-wise kernel and switch to the broadcast
+        // kernel unless a Vector copy was requested.
+        auto kernel = cu::copy_v<InType, OutType, IdxT>;
+        if (ctype != CopyType::Vector) {
+          kernel = cu::copy_s<InType, OutType, IdxT>;
+        }
+        auto [grid, block] = get_launch_args(
+            kernel, out.data_size(), out.shape(), out.strides(), LARGE);
+        const InType* src = in.data<InType>() + in_offset;
+        OutType* dst = out.data<OutType>() + out_offset;
+        kernel<<<grid, block, 0, stream>>>(src, dst, out.data_size());
+      });
+    });
+  });
+}
+
+} // namespace mlx::core
--- a/mlx/backend/cuda/copy/copy_general.cu
+++ b/mlx/backend/cuda/copy/copy_general.cu
@ -0,0 +1,100 @@
+// Copyright © 2025 Apple Inc.
+
+#include "mlx/backend/cuda/copy/copy.cuh"
+
+#include <cooperative_groups.h>
+
+namespace mlx::core {
+
+namespace cu {
+
+namespace cg = cooperative_groups;
+
+// Strided-to-strided copy with a compile-time number of dimensions: the
+// thread index is mapped to an input and an output location with
+// elem_to_loc_nd and the cast input element is stored at the output location.
+template <typename In, typename Out, typename IdxT, int NDIM>
+__global__ void copy_gg_nd(
+    const In* in,
+    Out* out,
+    IdxT size,
+    const __grid_constant__ cuda::std::array<int32_t, NDIM> shape,
+    const __grid_constant__ cuda::std::array<int64_t, NDIM> strides_in,
+    const __grid_constant__ cuda::std::array<int64_t, NDIM> strides_out) {
+  IdxT tid = cg::this_grid().thread_rank();
+  if (tid >= size) {
+    return;
+  }
+  auto [src, dst] = elem_to_loc_nd<NDIM>(
+      tid, shape.data(), strides_in.data(), strides_out.data());
+  out[dst] = CastOp<In, Out>{}(in[src]);
+}
+
+// Strided-to-strided copy with a run-time number of dimensions: the thread
+// index is mapped to an input and an output location with elem_to_loc_4d and
+// the cast input element is stored at the output location.
+template <typename In, typename Out, typename IdxT>
+__global__ void copy_gg(
+    const In* in,
+    Out* out,
+    IdxT size,
+    const __grid_constant__ Shape shape,
+    const __grid_constant__ Strides strides_in,
+    const __grid_constant__ Strides strides_out,
+    int ndim) {
+  IdxT tid = cg::this_grid().thread_rank();
+  if (tid >= size) {
+    return;
+  }
+  auto [src, dst] = elem_to_loc_4d(
+      tid, shape.data(), strides_in.data(), strides_out.data(), ndim);
+  out[dst] = CastOp<In, Out>{}(in[src]);
+}
+
+} // namespace cu
+
+// Launches the strided-to-strided copy. The element count is the product of
+// `shape`; copy_gg_nd is used for up to three dimensions and copy_gg above
+// that. Indices are 64-bit when either array's data_size() exceeds INT32_MAX.
+void copy_general(
+    cu::CommandEncoder& encoder,
+    CopyType ctype,
+    const array& in,
+    array& out,
+    int64_t offset_in,
+    int64_t offset_out,
+    const Shape& shape,
+    const Strides& strides_in,
+    const Strides& strides_out) {
+  encoder.launch_kernel([&](cudaStream_t stream) {
+    MLX_SWITCH_COPY_TYPES(in, out, InType, OutType, {
+      const InType* src = in.data<InType>() + offset_in;
+      OutType* dst = out.data<OutType>() + offset_out;
+      bool large = in.data_size() > INT32_MAX || out.data_size() > INT32_MAX;
+      MLX_SWITCH_BOOL(large, LARGE, {
+        using IdxT = std::conditional_t<LARGE, int64_t, int32_t>;
+        int ndim = shape.size();
+        // Number of elements to copy, derived from the shape passed in.
+        size_t count = 1;
+        for (size_t d = 0; d < shape.size(); ++d) {
+          count *= shape[d];
+        }
+        if (ndim > 3) {
+          auto kernel = cu::copy_gg<InType, OutType, IdxT>;
+          auto [grid, block] =
+              get_launch_args(kernel, count, shape, out.strides(), large);
+          kernel<<<grid, block, 0, stream>>>(
+              src,
+              dst,
+              count,
+              const_param(shape),
+              const_param(strides_in),
+              const_param(strides_out),
+              ndim);
+        } else {
+          MLX_SWITCH_1_2_3(ndim, NDIM, {
+            auto kernel = cu::copy_gg_nd<InType, OutType, IdxT, NDIM>;
+            auto [grid, block] =
+                get_launch_args(kernel, count, shape, out.strides(), large);
+            kernel<<<grid, block, 0, stream>>>(
+                src,
+                dst,
+                count,
+                const_param<NDIM>(shape),
+                const_param<NDIM>(strides_in),
+                const_param<NDIM>(strides_out));
+          });
+        }
+      });
+    });
+  });
+}
+
+} // namespace mlx::core
--- a/mlx/backend/cuda/copy/copy_general_dynamic.cu
+++ b/mlx/backend/cuda/copy/copy_general_dynamic.cu
@ -0,0 +1,105 @@
+// Copyright © 2025 Apple Inc.
+
+#include "mlx/backend/cuda/copy/copy.cuh"
+
+#include <cooperative_groups.h>
+
+namespace mlx::core {
+
+namespace cu {
+
+namespace cg = cooperative_groups;
+
+// Strided-to-strided copy with a compile-time number of dimensions and extra
+// element offsets read through `offset_in` / `offset_out`, which are added to
+// the input and output locations computed by elem_to_loc_nd.
+template <typename In, typename Out, typename IdxT, int NDIM>
+__global__ void copy_gg_dynamic_nd(
+    const In* in,
+    Out* out,
+    IdxT size,
+    const __grid_constant__ cuda::std::array<int32_t, NDIM> shape,
+    const __grid_constant__ cuda::std::array<int64_t, NDIM> strides_in,
+    const __grid_constant__ cuda::std::array<int64_t, NDIM> strides_out,
+    const int64_t* offset_in,
+    const int64_t* offset_out) {
+  IdxT tid = cg::this_grid().thread_rank();
+  if (tid >= size) {
+    return;
+  }
+  auto [src, dst] = elem_to_loc_nd<NDIM>(
+      tid, shape.data(), strides_in.data(), strides_out.data());
+  out[dst + *offset_out] = CastOp<In, Out>{}(in[src + *offset_in]);
+}
+
+// Strided-to-strided copy with a run-time number of dimensions and extra
+// element offsets read through `offset_in` / `offset_out`, which are added to
+// the input and output locations computed by elem_to_loc_4d.
+template <typename In, typename Out, typename IdxT>
+__global__ void copy_gg_dynamic(
+    const In* in,
+    Out* out,
+    IdxT size,
+    const __grid_constant__ Shape shape,
+    const __grid_constant__ Strides strides_in,
+    const __grid_constant__ Strides strides_out,
+    int ndim,
+    const int64_t* offset_in,
+    const int64_t* offset_out) {
+  IdxT tid = cg::this_grid().thread_rank();
+  if (tid >= size) {
+    return;
+  }
+  auto [src, dst] = elem_to_loc_4d(
+      tid, shape.data(), strides_in.data(), strides_out.data(), ndim);
+  out[dst + *offset_out] = CastOp<In, Out>{}(in[src + *offset_in]);
+}
+
+} // namespace cu
+
+// Launches the strided-to-strided copy with dynamic offsets over out.size()
+// elements: copy_gg_dynamic_nd for up to three dimensions, copy_gg_dynamic
+// above that. Indices are 64-bit when either array's data_size() exceeds
+// INT32_MAX. Both dynamic offset arrays are passed to the kernel as int64_t
+// pointers.
+void copy_general_dynamic(
+    cu::CommandEncoder& encoder,
+    CopyType ctype,
+    const array& in,
+    array& out,
+    int64_t offset_in,
+    int64_t offset_out,
+    const Shape& shape,
+    const Strides& strides_in,
+    const Strides& strides_out,
+    const array& dynamic_offset_in,
+    const array& dynamic_offset_out) {
+  encoder.launch_kernel([&](cudaStream_t stream) {
+    MLX_SWITCH_COPY_TYPES(in, out, InType, OutType, {
+      const InType* src = in.data<InType>() + offset_in;
+      OutType* dst = out.data<OutType>() + offset_out;
+      bool large = in.data_size() > INT32_MAX || out.data_size() > INT32_MAX;
+      MLX_SWITCH_BOOL(large, LARGE, {
+        using IdxT = std::conditional_t<LARGE, int64_t, int32_t>;
+        int ndim = shape.size();
+        if (ndim > 3) {
+          auto kernel = cu::copy_gg_dynamic<InType, OutType, IdxT>;
+          auto [grid, block] = get_launch_args(kernel, out, large);
+          kernel<<<grid, block, 0, stream>>>(
+              src,
+              dst,
+              out.size(),
+              const_param(shape),
+              const_param(strides_in),
+              const_param(strides_out),
+              ndim,
+              dynamic_offset_in.data<int64_t>(),
+              dynamic_offset_out.data<int64_t>());
+        } else {
+          MLX_SWITCH_1_2_3(ndim, NDIM, {
+            auto kernel = cu::copy_gg_dynamic_nd<InType, OutType, IdxT, NDIM>;
+            auto [grid, block] = get_launch_args(kernel, out, large);
+            kernel<<<grid, block, 0, stream>>>(
+                src,
+                dst,
+                out.size(),
+                const_param<NDIM>(shape),
+                const_param<NDIM>(strides_in),
+                const_param<NDIM>(strides_out),
+                dynamic_offset_in.data<int64_t>(),
+                dynamic_offset_out.data<int64_t>());
+          });
+        }
+      });
+    });
+  });
+}
+
+} // namespace mlx::core
--- a/mlx/backend/cuda/copy/copy_general_input.cu
+++ b/mlx/backend/cuda/copy/copy_general_input.cu
@ -0,0 +1,88 @@
+// Copyright © 2025 Apple Inc.
+
+#include "mlx/backend/cuda/copy/copy.cuh"
+
+#include <cooperative_groups.h>
+
+namespace mlx::core {
+
+namespace cu {
+
+namespace cg = cooperative_groups;
+
+// Copies a strided input of compile-time rank NDIM into |out|, one element per
+// thread, converting every value with CastOp<In, Out>.
+template <typename In, typename Out, typename IdxT, int NDIM>
+__global__ void copy_g_nd(
+    const In* in,
+    Out* out,
+    IdxT size,
+    const __grid_constant__ cuda::std::array<int32_t, NDIM> shape,
+    const __grid_constant__ cuda::std::array<int64_t, NDIM> strides_in) {
+  // The thread's rank within the grid doubles as the output element index.
+  IdxT out_idx = cg::this_grid().thread_rank();
+  if (out_idx >= size) {
+    return;
+  }
+  // Translate the output index into the matching input location.
+  IdxT in_loc = elem_to_loc_nd<NDIM>(out_idx, shape.data(), strides_in.data());
+  out[out_idx] = CastOp<In, Out>{}(in[in_loc]);
+}
+
+// Strided-input copy for the case where the rank is only known at run time and
+// is passed in |ndim|. Each thread converts one element with CastOp<In, Out>.
+template <typename In, typename Out, typename IdxT>
+__global__ void copy_g(
+    const In* in,
+    Out* out,
+    IdxT size,
+    const __grid_constant__ Shape shape,
+    const __grid_constant__ Strides strides_in,
+    int ndim) {
+  // The thread's rank within the grid doubles as the output element index.
+  IdxT out_idx = cg::this_grid().thread_rank();
+  if (out_idx >= size) {
+    return;
+  }
+  // Translate the output index into the matching input location.
+  IdxT in_loc = elem_to_loc_4d(out_idx, shape.data(), strides_in.data(), ndim);
+  out[out_idx] = CastOp<In, Out>{}(in[in_loc]);
+}
+
+} // namespace cu
+
+// Launches a copy from a strided |in| into |out|. The shape and input strides
+// come from the explicit |shape| / |strides_in| arguments, and |offset_in| /
+// |offset_out| are element offsets added to the typed data pointers.
+// |ctype| is not consulted in this function.
+void copy_general_input(
+    cu::CommandEncoder& encoder,
+    CopyType ctype,
+    const array& in,
+    array& out,
+    int64_t offset_in,
+    int64_t offset_out,
+    const Shape& shape,
+    const Strides& strides_in) {
+  encoder.launch_kernel([&](cudaStream_t stream) {
+    MLX_SWITCH_COPY_TYPES(in, out, InType, OutType, {
+      const InType* in_ptr = in.data<InType>() + offset_in;
+      OutType* out_ptr = out.data<OutType>() + offset_out;
+      // Use 64-bit indices only when either array's data_size() does not fit
+      // in an int32.
+      bool large = in.data_size() > INT32_MAX || out.data_size() > INT32_MAX;
+      MLX_SWITCH_BOOL(large, LARGE, {
+        using IdxT = std::conditional_t<LARGE, int64_t, int32_t>;
+        int ndim = shape.size();
+        if (ndim <= 3) {
+          // Small ranks get a kernel specialized on the rank.
+          MLX_SWITCH_1_2_3(ndim, NDIM, {
+            auto kernel = cu::copy_g_nd<InType, OutType, IdxT, NDIM>;
+            auto [num_blocks, block_dims] = get_launch_args(kernel, out, large);
+            kernel<<<num_blocks, block_dims, 0, stream>>>(
+                in_ptr,
+                out_ptr,
+                out.size(),
+                const_param<NDIM>(shape),
+                const_param<NDIM>(strides_in));
+          });
+        } else { // ndim >= 4
+          // Larger ranks share one kernel that receives the rank at run time.
+          auto kernel = cu::copy_g<InType, OutType, IdxT>;
+          auto [num_blocks, block_dims] = get_launch_args(kernel, out, large);
+          kernel<<<num_blocks, block_dims, 0, stream>>>(
+              in_ptr,
+              out_ptr,
+              out.size(),
+              const_param(shape),
+              const_param(strides_in),
+              ndim);
+        }
+      });
+    });
+  });
+}
+
+} // namespace mlx::core
--- a/mlx/backend/cuda/cuda.cpp
+++ b/mlx/backend/cuda/cuda.cpp
@ -0,0 +1,11 @@
+// Copyright © 2025 Apple Inc.
+
+#include "mlx/backend/cuda/cuda.h"
+
+namespace mlx::core::cu {
+
+// Reports whether the CUDA backend is available. This implementation returns
+// true unconditionally; it performs no runtime device query.
+bool is_available() {
+  return true;
+}
+
+} // namespace mlx::core::cu
--- a/mlx/backend/cuda/cuda.h
+++ b/mlx/backend/cuda/cuda.h
@ -0,0 +1,10 @@
+// Copyright © 2025 Apple Inc.
+
+#pragma once
+
+namespace mlx::core::cu {
+
+/* Check if the CUDA backend is available. */
+bool is_available();
+
+} // namespace mlx::core::cu
--- a/mlx/backend/cuda/device.cpp
+++ b/mlx/backend/cuda/device.cpp
@ -0,0 +1,140 @@
+// Copyright © 2025 Apple Inc.
+
+#include "mlx/backend/cuda/device.h"
+#include "mlx/backend/cuda/worker.h"
+#include "mlx/backend/metal/metal.h"
+
+#include <fmt/format.h>
+#include <nvtx3/nvtx3.hpp>
+#include <future>
+
+namespace mlx::core {
+
+namespace cu {
+
+// Binds the stream to |device|; the CudaStream member is constructed from the
+// same device and the encoder is left unset until get_encoder() is called.
+DeviceStream::DeviceStream(Device& device) : device_(device), stream_(device) {}
+
+// Blocks the calling host thread until the work queued on this stream is done.
+// The status returned by the wait is checked rather than ignored, so a failure
+// reported by cudaStreamSynchronize is surfaced to the caller.
+void DeviceStream::synchronize() {
+  CHECK_CUDA_ERROR(cudaStreamSynchronize(stream_));
+}
+
+// Returns the CUDA stream on which the next kernel should be launched.
+// Currently there is a single underlying stream, so it is always that one.
+cudaStream_t DeviceStream::schedule_cuda_stream() {
+  // TODO: Return a stream that maximizes parallelism.
+  return stream_;
+}
+
+// Returns the CUDA stream most recently handed out; with a single underlying
+// stream this is the same stream schedule_cuda_stream() returns.
+cudaStream_t DeviceStream::last_cuda_stream() {
+  return stream_;
+}
+
+// Returns this stream's command encoder, creating it on first use.
+CommandEncoder& DeviceStream::get_encoder() {
+  if (encoder_ == nullptr) {
+    // Lazily build the encoder the first time somebody asks for it.
+    encoder_ = std::make_unique<CommandEncoder>(*this);
+  }
+  CommandEncoder& encoder = *encoder_;
+  return encoder;
+}
+
+// Queries the compute capability of CUDA device |device|, verifies that it
+// supports concurrent managed access, and creates the cublasLt handle.
+// Throws std::runtime_error if the device does not meet the requirement or if
+// the cublasLt handle cannot be created.
+Device::Device(int device) : device_(device) {
+  CHECK_CUDA_ERROR(cudaDeviceGetAttribute(
+      &compute_capability_major_, cudaDevAttrComputeCapabilityMajor, device_));
+  CHECK_CUDA_ERROR(cudaDeviceGetAttribute(
+      &compute_capability_minor_, cudaDevAttrComputeCapabilityMinor, device_));
+  // Validate the requirements of device.
+  int attr = 0;
+  CHECK_CUDA_ERROR(cudaDeviceGetAttribute(
+      &attr, cudaDevAttrConcurrentManagedAccess, device_));
+  if (attr != 1) {
+    throw std::runtime_error(fmt::format(
+        "Device {} does not support synchronization in managed memory.",
+        device_));
+  }
+  // The cublasLt handle is used by matmul.
+  make_current();
+  // Do not continue with an invalid handle: a failed creation would otherwise
+  // only show up later as an obscure matmul error.
+  cublasStatus_t status = cublasLtCreate(&lt_);
+  if (status != CUBLAS_STATUS_SUCCESS) {
+    throw std::runtime_error(fmt::format(
+        "Failed to create cublasLt handle for device {} (status {}).",
+        device_,
+        static_cast<int>(status)));
+  }
+}
+
+// Releases the cublasLt handle created in the constructor.
+Device::~Device() {
+  cublasLtDestroy(lt_);
+}
+
+// Makes this device the current CUDA device, skipping the API call when the
+// cached value says it already is.
+void Device::make_current() {
+  // We need to set/get current CUDA device very frequently, cache it to reduce
+  // actual calls of CUDA APIs. This function assumes single-thread in host.
+  // NOTE(review): the cache starts at 0, i.e. it presumes device 0 is current
+  // before the first call and that nothing else calls cudaSetDevice - confirm.
+  static int current = 0;
+  if (current != device_) {
+    CHECK_CUDA_ERROR(cudaSetDevice(device_));
+    current = device_;
+  }
+}
+
+// Returns the DeviceStream registered under |s.index|, creating it on first
+// request.
+DeviceStream& Device::get_stream(Stream s) {
+  // try_emplace constructs a new DeviceStream only when the index is absent;
+  // otherwise it yields the entry that is already stored.
+  return streams_.try_emplace(s.index, *this).first->second;
+}
+
+// Creates an encoder for stream |s|, remembering both the stream and the
+// device it belongs to.
+CommandEncoder::CommandEncoder(DeviceStream& s)
+    : device_(s.device()), stream_(s) {}
+
+// Queues |task| on the worker as a completion handler.
+void CommandEncoder::add_completed_handler(std::function<void()> task) {
+  worker_.add_task(std::move(task));
+}
+
+// Finishes the current round of encoding: hands temporaries over to a
+// completion handler, then either runs handlers right away (no GPU work was
+// launched) or closes the batch and commits once enough batches accumulated.
+void CommandEncoder::end_encoding() {
+  if (!temporaries_.empty()) {
+    // The empty handler exists only to own the buffers; they are released
+    // when the handler itself is destroyed.
+    add_completed_handler([temporaries = std::move(temporaries_)]() {});
+  }
+
+  // There is no kernel running, run completion handlers immediately.
+  if (!has_gpu_work_) {
+    worker_.consume_in_this_thread();
+    return;
+  }
+  has_gpu_work_ = false;
+
+  // Put completion handlers in a batch.
+  worker_.end_batch();
+
+  // Signaling kernel completion is expensive, delay until enough batches.
+  // TODO: This number is arbitrarily picked, profile for a better strategy.
+  if (worker_.uncommited_batches() > 8) {
+    commit();
+  }
+}
+
+// Commits the worker's pending batches against the stream's last CUDA stream.
+void CommandEncoder::commit() {
+  worker_.commit(stream_.last_cuda_stream());
+}
+
+// Waits for the stream's kernels and then for a freshly queued completion
+// handler to run.
+// NOTE(review): presumably handlers run in submission order, so this also
+// waits for every handler queued earlier - confirm against Worker.
+void CommandEncoder::synchronize() {
+  stream().synchronize();
+  // The promise lives in a shared_ptr so the handler can be stored in a
+  // copyable std::function.
+  auto p = std::make_shared<std::promise<void>>();
+  std::future<void> f = p->get_future();
+  add_completed_handler([p = std::move(p)]() { p->set_value(); });
+  worker_.end_batch();
+  commit();
+  // Block until the handler above has fulfilled the promise.
+  f.wait();
+}
+
+// Returns the cu::Device for |device.index| from a function-local registry,
+// constructing it the first time that index is requested.
+Device& device(mlx::core::Device device) {
+  static std::unordered_map<int, Device> devices;
+  // try_emplace builds a Device from the index only when no entry exists yet.
+  return devices.try_emplace(device.index, device.index).first->second;
+}
+
+// Looks up the DeviceStream for |s| on the device that |s| belongs to.
+DeviceStream& get_stream(Stream s) {
+  return device(s.device).get_stream(s);
+}
+
+// Returns the command encoder of the DeviceStream that backs |s|.
+CommandEncoder& get_command_encoder(Stream s) {
+  return get_stream(s).get_encoder();
+}
+
+} // namespace cu
+
+} // namespace mlx::core
--- a/mlx/backend/cuda/device.h
+++ b/mlx/backend/cuda/device.h
@ -0,0 +1,148 @@
+// Copyright © 2025 Apple Inc.
+
+#pragma once
+
+#include "mlx/array.h"
+#include "mlx/backend/cuda/worker.h"
+#include "mlx/stream.h"
+
+#include <cublasLt.h>
+#include <thrust/execution_policy.h>
+
+#include <unordered_map>
+
+namespace mlx::core::cu {
+
+class Device;
+class CommandEncoder;
+
+// Pairs a CudaStream on a given Device with the CommandEncoder that records
+// work for it. Instances are non-copyable.
+class DeviceStream {
+ public:
+  explicit DeviceStream(Device& device);
+
+  DeviceStream(const DeviceStream&) = delete;
+  DeviceStream& operator=(const DeviceStream&) = delete;
+
+  // Wait until kernels in the stream complete.
+  void synchronize();
+
+  // Return a cuda stream for launching kernels.
+  cudaStream_t schedule_cuda_stream();
+
+  // Return the last cuda stream used.
+  cudaStream_t last_cuda_stream();
+
+  // Return the encoder bound to this stream.
+  CommandEncoder& get_encoder();
+
+  // Return the device this stream was created for.
+  Device& device() {
+    return device_;
+  }
+
+ private:
+  Device& device_;
+  CudaStream stream_;
+  // Held through a pointer because CommandEncoder is only forward-declared at
+  // this point in the header.
+  std::unique_ptr<CommandEncoder> encoder_;
+};
+
+// Per-CUDA-device state: the device ordinal, its compute capability, a
+// cublasLt handle, and the DeviceStreams created for it. Non-copyable.
+class Device {
+ public:
+  explicit Device(int device);
+  ~Device();
+
+  Device(const Device&) = delete;
+  Device& operator=(const Device&) = delete;
+
+  // Make this device the current cuda device, required by some cuda calls.
+  void make_current();
+
+  // Return the DeviceStream associated with |s|.
+  DeviceStream& get_stream(Stream s);
+
+  // The CUDA device ordinal passed to the constructor.
+  int cuda_device() const {
+    return device_;
+  }
+  int compute_capability_major() const {
+    return compute_capability_major_;
+  }
+  int compute_capability_minor() const {
+    return compute_capability_minor_;
+  }
+  // The cublasLt handle owned by this device.
+  cublasLtHandle_t lt_handle() const {
+    return lt_;
+  }
+
+ private:
+  int device_;
+  int compute_capability_major_;
+  int compute_capability_minor_;
+  cublasLtHandle_t lt_;
+  // Streams keyed by an int index.
+  std::unordered_map<int, DeviceStream> streams_;
+};
+
+// Records kernel launches for one DeviceStream and tracks the completion
+// handlers and temporary buffers that go with them. Non-copyable.
+class CommandEncoder {
+ public:
+  explicit CommandEncoder(DeviceStream& stream);
+
+  CommandEncoder(const CommandEncoder&) = delete;
+  CommandEncoder& operator=(const CommandEncoder&) = delete;
+
+  // Both are currently no-ops.
+  void set_input_array(const array& arr) {}
+  void set_output_array(const array& arr) {}
+
+  // Hold a reference to |arr|'s data so the buffer stays alive while it is
+  // stored in temporaries_.
+  void add_temporary(const array& arr) {
+    temporaries_.push_back(arr.data_shared_ptr());
+  }
+
+  void add_completed_handler(std::function<void()> task);
+  void end_encoding();
+  void commit();
+
+  // Schedule a cuda stream for |fun| to launch kernels, and check error
+  // afterwards.
+  template <typename F>
+  void launch_kernel(F&& fun) {
+    launch_kernel(stream_.schedule_cuda_stream(), std::forward<F>(fun));
+  }
+
+  // Same as above but launches on the caller-provided |stream|.
+  template <typename F>
+  void launch_kernel(cudaStream_t stream, F&& fun) {
+    device_.make_current();
+    fun(stream);
+    // Inspect the last CUDA error right after |fun| returns.
+    check_cuda_error("kernel launch", cudaGetLastError());
+    has_gpu_work_ = true;
+  }
+
+  Device& device() {
+    return device_;
+  }
+
+  DeviceStream& stream() {
+    return stream_;
+  }
+
+  // True once launch_kernel() has run and end_encoding() has not yet reset it.
+  bool has_gpu_work() const {
+    return has_gpu_work_;
+  }
+
+  // Wait until kernels and completion handlers are finished
+  void synchronize();
+
+ private:
+  Device& device_;
+  DeviceStream& stream_;
+  Worker worker_;
+  bool has_gpu_work_{false};
+  std::vector<std::shared_ptr<array::Data>> temporaries_;
+};
+
+Device& device(mlx::core::Device device);
+DeviceStream& get_stream(Stream s);
+CommandEncoder& get_command_encoder(Stream s);
+
+// Return an execution policy that does not sync for result.
+// Note that not all thrust APIs support async policy, confirm before using.
+// The policy is bound to |stream|, so thrust work is issued on that stream.
+inline auto thrust_policy(cudaStream_t stream) {
+  // TODO: Connect thrust's custom allocator with mlx's allocator.
+  return thrust::cuda::par_nosync.on(stream);
+}
+
+} // namespace mlx::core::cu
--- a/mlx/backend/cuda/device/arange.cuh
+++ b/mlx/backend/cuda/device/arange.cuh
@ -0,0 +1,15 @@
+// Copyright © 2025 Apple Inc.
+
+namespace mlx::core::cu {
+
+// Functor producing the i-th element of an arithmetic sequence:
+// start + i * step.
+template <typename T>
+struct Arange {
+  const T start;
+  const T step;
+
+  // |i| is an unsigned 32-bit position; the product is formed with the usual
+  // arithmetic conversions between uint32_t and T.
+  __device__ T operator()(uint32_t i) const {
+    return start + i * step;
+  }
+};
+
+} // namespace mlx::core::cu
--- a/mlx/backend/cuda/device/atomic_ops.cuh
+++ b/mlx/backend/cuda/device/atomic_ops.cuh
@ -0,0 +1,72 @@
+// Copyright © 2025 Apple Inc.
+
+#pragma once
+
+#include "mlx/backend/cuda/device/cucomplex_math.cuh"
+#include "mlx/backend/cuda/device/fp16_math.cuh"
+
+#include <cuda/atomic>
+
+namespace mlx::core::cu {
+
+// Atomically adds |val| to *out at device scope via cuda::atomic_ref.
+template <typename T>
+inline __device__ void atomic_add(T* out, T val) {
+  cuda::atomic_ref<T, cuda::thread_scope_device> ref(*out);
+  ref += val;
+}
+
+// Atomically replaces *out with *out * val using a compare-and-swap retry
+// loop at device scope.
+template <typename T>
+inline __device__ void atomic_prod(T* out, T val) {
+  cuda::atomic_ref<T, cuda::thread_scope_device> target(*out);
+  T expected = target.load();
+  // A failed exchange refreshes |expected| with the value currently stored,
+  // so the product is recomputed from fresh data on every attempt.
+  for (;;) {
+    if (target.compare_exchange_strong(expected, expected * val)) {
+      break;
+    }
+  }
+}
+
+// Atomically stores max(*out, val) into *out at device scope.
+template <typename T>
+inline __device__ void atomic_max(T* out, T val) {
+  cuda::atomic_ref<T, cuda::thread_scope_device> ref(*out);
+  ref.fetch_max(val);
+}
+
+// Atomically stores min(*out, val) into *out at device scope.
+template <typename T>
+inline __device__ void atomic_min(T* out, T val) {
+  cuda::atomic_ref<T, cuda::thread_scope_device> ref(*out);
+  ref.fetch_min(val);
+}
+
+// cuda::atomic_ref offers no atomic add for the types handled by the overloads
+// below, so this helper emulates one with a compare-and-swap retry loop.
+template <typename T>
+inline __device__ void atomic_add_general(T* out, T val) {
+  cuda::atomic_ref<T, cuda::thread_scope_device> target(*out);
+  T expected = target.load();
+  // A failed exchange refreshes |expected| with the value currently stored,
+  // so the sum is recomputed from fresh data on every attempt.
+  for (;;) {
+    if (target.compare_exchange_strong(expected, expected + val)) {
+      break;
+    }
+  }
+}
+
+// __half overload: forwards to the built-in atomicAdd.
+inline __device__ void atomic_add(__half* out, __half val) {
+  atomicAdd(out, val);
+}
+
+// cuComplex overload: uses the built-in atomicAdd when compiling for
+// __CUDA_ARCH__ >= 900 and the compare-and-swap fallback otherwise.
+inline __device__ void atomic_add(cuComplex* out, cuComplex val) {
+#if __CUDA_ARCH__ < 900
+  atomic_add_general(out, val);
+#else
+  atomicAdd(out, val);
+#endif
+}
+
+// __nv_bfloat16 overload. For __CUDA_ARCH__ >= 800 the built-in atomicAdd is
+// used. Below that, the compare-and-swap fallback is used when
+// CCCL_VERSION >= 2008000; with an older CCCL the function only asserts.
+inline __device__ void atomic_add(__nv_bfloat16* out, __nv_bfloat16 val) {
+#if __CUDA_ARCH__ < 800
+#if CCCL_VERSION >= 2008000
+  atomic_add_general(out, val);
+#else
+  // The variable name doubles as the message shown when the assert fires.
+  bool cccl_version_too_old_for_bfloat16_atomic_add = false;
+  assert(cccl_version_too_old_for_bfloat16_atomic_add);
+#endif
+#else
+  atomicAdd(out, val);
+#endif
+}
+
+} // namespace mlx::core::cu
--- a/mlx/backend/cuda/device/binary_ops.cuh
+++ b/mlx/backend/cuda/device/binary_ops.cuh
@ -0,0 +1,307 @@
+// Copyright © 2025 Apple Inc.
+
+#include "mlx/backend/cuda/device/cucomplex_math.cuh"
+#include "mlx/backend/cuda/device/fp16_math.cuh"
+#include "mlx/backend/cuda/device/utils.cuh"
+
+#include <cuComplex.h>
+#include <cuda/std/array>
+
+namespace mlx::core::cu {
+
+// Element-wise x + y.
+struct Add {
+  template <typename T>
+  __device__ T operator()(T x, T y) {
+    return x + y;
+  }
+};
+
+// Integer-valued quotient of x and y: native `/` for integral types, truncf of
+// the quotient otherwise.
+// NOTE(review): both paths round toward zero rather than toward negative
+// infinity, despite the name - confirm this is the intended semantics.
+struct FloorDivide {
+  template <typename T>
+  __device__ T operator()(T x, T y) {
+    if constexpr (cuda::std::is_integral_v<T>) {
+      return x / y;
+    } else {
+      return truncf(x / y);
+    }
+  }
+};
+
+// Element-wise x / y using the type's own division operator.
+struct Divide {
+  template <typename T>
+  __device__ T operator()(T x, T y) {
+    return x / y;
+  }
+};
+
+// Remainder of x / y. For signed integers and real floating-point types a
+// non-zero result is shifted by y whenever its sign differs from y's sign;
+// unsigned integers and cuComplex use the type's `%` operator directly.
+struct Remainder {
+  template <typename T>
+  __device__ T operator()(T x, T y) {
+    if constexpr (cuda::std::is_same_v<T, cuComplex>) {
+      return x % y;
+    } else if constexpr (
+        cuda::std::is_integral_v<T> && !cuda::std::is_signed_v<T>) {
+      return x % y;
+    } else if constexpr (cuda::std::is_integral_v<T>) {
+      auto rem = x % y;
+      // Shift the result when its sign disagrees with the divisor's.
+      const bool signs_differ = (rem < 0) != (y < 0);
+      if (rem != 0 && signs_differ) {
+        rem += y;
+      }
+      return rem;
+    } else {
+      T rem = fmod(x, y);
+      // Shift the result when its sign disagrees with the divisor's.
+      if (rem != 0 && ((rem < 0) != (y < 0))) {
+        rem = rem + y;
+      }
+      return rem;
+    }
+  }
+};
+
+// Element-wise x == y.
+struct Equal {
+  template <typename T>
+  __device__ bool operator()(T x, T y) {
+    return x == y;
+  }
+};
+
+// Equality that treats NaN as equal to NaN. For cuComplex the real and
+// imaginary parts are matched independently: each pair must either compare
+// equal or both be NaN.
+struct NaNEqual {
+  template <typename T>
+  __device__ bool operator()(T x, T y) {
+    // Use the cuda::std trait like the other functors in this file.
+    if constexpr (cuda::std::is_same_v<T, cuComplex>) {
+      return x == y ||
+          (isnan(cuCrealf(x)) && isnan(cuCrealf(y)) && isnan(cuCimagf(x)) &&
+           isnan(cuCimagf(y))) ||
+          (cuCrealf(x) == cuCrealf(y) && isnan(cuCimagf(x)) &&
+           isnan(cuCimagf(y))) ||
+          (isnan(cuCrealf(x)) && isnan(cuCrealf(y)) &&
+           cuCimagf(x) == cuCimagf(y));
+    } else {
+      return x == y || (isnan(x) && isnan(y));
+    }
+  }
+};
+
+// Element-wise x > y.
+struct Greater {
+  template <typename T>
+  __device__ bool operator()(T x, T y) {
+    return x > y;
+  }
+};
+
+// Element-wise x >= y.
+struct GreaterEqual {
+  template <typename T>
+  __device__ bool operator()(T x, T y) {
+    return x >= y;
+  }
+};
+
+// Element-wise x < y.
+struct Less {
+  template <typename T>
+  __device__ bool operator()(T x, T y) {
+    return x < y;
+  }
+};
+
+// Element-wise x <= y.
+struct LessEqual {
+  template <typename T>
+  __device__ bool operator()(T x, T y) {
+    return x <= y;
+  }
+};
+
+// Computes log(exp(x) + exp(y)) as max + log1p(exp(min - max)), which avoids
+// exponentiating the larger argument. NaN inputs yield NaN.
+struct LogAddExp {
+  template <typename T>
+  __device__ T operator()(T x, T y) {
+    if (isnan(x) || isnan(y)) {
+      return cuda::std::numeric_limits<T>::quiet_NaN();
+    }
+    T maxval = max(x, y);
+    T minval = min(x, y);
+    // With an infinite operand on the relevant side the result is simply the
+    // larger value; this also avoids evaluating inf - inf below.
+    return (minval == -cuda::std::numeric_limits<T>::infinity() ||
+            maxval == cuda::std::numeric_limits<T>::infinity())
+        ? maxval
+        : T(float(maxval) + log1p(expf(minval - maxval)));
+  };
+
+  // Complex overload: same formula, with exp(min - max) expanded into its
+  // real and imaginary parts before the complex log1p.
+  __device__ cuComplex operator()(cuComplex x, cuComplex y) {
+    if (isnan(cuCrealf(x)) || isnan(cuCimagf(x)) || isnan(cuCrealf(y)) ||
+        isnan(cuCimagf(y))) {
+      return {
+          cuda::std::numeric_limits<float>::quiet_NaN(),
+          cuda::std::numeric_limits<float>::quiet_NaN()};
+    }
+    float inf = cuda::std::numeric_limits<float>::infinity();
+    // max/min are chosen with the cuComplex comparison operators.
+    auto maxval = x > y ? x : y;
+    auto minval = x < y ? x : y;
+    if (cuCrealf(minval) == -inf || cuCrealf(maxval) == inf)
+      return maxval;
+    float m = exp(cuCrealf(minval) - cuCrealf(maxval));
+    cuComplex dexp{
+        m * cos(cuCimagf(minval) - cuCimagf(maxval)),
+        m * sin(cuCimagf(minval) - cuCimagf(maxval)),
+    };
+    return maxval + log1p(dexp);
+  }
+};
+
+// Element-wise maximum. For non-integral types a NaN in x is returned as is;
+// a NaN in y is returned because `x > y` is then false.
+struct Maximum {
+  template <typename T>
+  __device__ T operator()(T x, T y) {
+    if constexpr (cuda::std::is_integral_v<T>) {
+      return max(x, y);
+    } else if constexpr (cuda::std::is_same_v<T, cuComplex>) {
+      // A complex value counts as NaN if either component is NaN.
+      if (isnan(cuCrealf(x)) || isnan(cuCimagf(x))) {
+        return x;
+      }
+      return x > y ? x : y;
+    } else {
+      if (isnan(x)) {
+        return x;
+      }
+      return x > y ? x : y;
+    }
+  }
+};
+
+// Element-wise minimum. For non-integral types a NaN in x is returned as is;
+// a NaN in y is returned because `x < y` is then false.
+struct Minimum {
+  template <typename T>
+  __device__ T operator()(T x, T y) {
+    if constexpr (cuda::std::is_integral_v<T>) {
+      return min(x, y);
+    } else if constexpr (cuda::std::is_same_v<T, cuComplex>) {
+      // A complex value counts as NaN if either component is NaN.
+      if (isnan(cuCrealf(x)) || isnan(cuCimagf(x))) {
+        return x;
+      }
+      return x < y ? x : y;
+    } else {
+      if (isnan(x)) {
+        return x;
+      }
+      return x < y ? x : y;
+    }
+  }
+};
+
+// Element-wise x * y.
+struct Multiply {
+  template <typename T>
+  __device__ T operator()(T x, T y) {
+    return x * y;
+  }
+};
+
+// Element-wise inequality. cuComplex values differ when either their real or
+// their imaginary parts differ.
+struct NotEqual {
+  template <typename T>
+  __device__ bool operator()(T x, T y) {
+    // Use the cuda::std trait like the other functors in this file.
+    if constexpr (cuda::std::is_same_v<T, cuComplex>) {
+      return cuCrealf(x) != cuCrealf(y) || cuCimagf(x) != cuCimagf(y);
+    } else {
+      return x != y;
+    }
+  }
+};
+
+// Raises |base| to |exp|.
+//  - Integral types: exponentiation by squaring. A negative exponent of a
+//    signed type yields 0.
+//  - cuComplex: evaluated in polar form; 0 raised to anything is 0 unless the
+//    exponent contains NaN, in which case the result is NaN.
+//  - Everything else: powf.
+struct Power {
+  template <typename T>
+  __device__ T operator()(T base, T exp) {
+    if constexpr (cuda::std::is_integral_v<T>) {
+      // An integer raised to a negative power has no integer result, and the
+      // loop below would never finish: right-shifting a negative exponent
+      // never reaches zero. Bail out instead of hanging the kernel.
+      if constexpr (cuda::std::is_signed_v<T>) {
+        if (exp < 0) {
+          return 0;
+        }
+      }
+      T res = 1;
+      // Consume the exponent bit by bit, squaring the base at each step.
+      while (exp) {
+        if (exp & 1) {
+          res *= base;
+        }
+        exp >>= 1;
+        base *= base;
+      }
+      return res;
+    } else if constexpr (cuda::std::is_same_v<T, cuComplex>) {
+      if (base.y == 0 && base.x == 0) {
+        if (isnan(exp.x) || isnan(exp.y)) {
+          auto nan = cuda::std::numeric_limits<float>::quiet_NaN();
+          return make_cuFloatComplex(nan, nan);
+        }
+        return make_cuFloatComplex(0.0, 0.0);
+      }
+      // base = r * e^(i*theta); base^exp = e^(exp * (ln r + i*theta)).
+      auto x_theta = atan2f(base.y, base.x);
+      auto x_ln_r = 0.5 * logf(base.x * base.x + base.y * base.y);
+      auto mag = expf(exp.x * x_ln_r - exp.y * x_theta);
+      auto phase = exp.y * x_ln_r + exp.x * x_theta;
+      return make_cuFloatComplex(mag * cosf(phase), mag * sinf(phase));
+    } else {
+      return powf(base, exp);
+    }
+  }
+};
+
+// Element-wise x - y.
+struct Subtract {
+  template <typename T>
+  __device__ T operator()(T x, T y) {
+    return x - y;
+  }
+};
+
+// Element-wise logical AND (x && y), converted back to T.
+struct LogicalAnd {
+  template <typename T>
+  __device__ T operator()(T x, T y) {
+    return x && y;
+  }
+};
+
+// Element-wise logical OR (x || y), converted back to T.
+struct LogicalOr {
+  template <typename T>
+  __device__ T operator()(T x, T y) {
+    return x || y;
+  }
+};
+
+// Element-wise bitwise AND.
+struct BitwiseAnd {
+  template <typename T>
+  __device__ T operator()(T x, T y) {
+    return x & y;
+  }
+};
+
+// Element-wise bitwise OR.
+struct BitwiseOr {
+  template <typename T>
+  __device__ T operator()(T x, T y) {
+    return x | y;
+  }
+};
+
+// Element-wise bitwise XOR.
+struct BitwiseXor {
+  template <typename T>
+  __device__ T operator()(T x, T y) {
+    return x ^ y;
+  }
+};
+
+// Element-wise left shift of x by y bits.
+struct LeftShift {
+  template <typename T>
+  __device__ T operator()(T x, T y) {
+    return x << y;
+  }
+};
+
+// Element-wise right shift of x by y bits.
+struct RightShift {
+  template <typename T>
+  __device__ T operator()(T x, T y) {
+    return x >> y;
+  }
+};
+
+// Element-wise atan2f(y, x); note the (y, x) argument order.
+struct ArcTan2 {
+  template <typename T>
+  __device__ T operator()(T y, T x) {
+    return atan2f(y, x);
+  }
+};
+
+// Returns {FloorDivide(x, y), Remainder(x, y)} as a two-element array.
+struct DivMod {
+  template <typename T>
+  __device__ cuda::std::array<T, 2> operator()(T x, T y) {
+    return {FloorDivide{}(x, y), Remainder{}(x, y)};
+  };
+};
+
+} // namespace mlx::core::cu
--- a/mlx/backend/cuda/device/cast_op.cuh
+++ b/mlx/backend/cuda/device/cast_op.cuh
@ -0,0 +1,71 @@
+// Copyright © 2025 Apple Inc.
+
+#pragma once
+
+#include <cuComplex.h>
+#include <thrust/iterator/transform_iterator.h>
+
+namespace mlx::core::cu {
+
+// An op that does static_cast, with custom conversions for some types.
+// Primary template: a plain static_cast. |is_castable| reports whether SrcT is
+// convertible to DstT. The unnamed third parameter is the SFINAE hook used by
+// the specializations below.
+template <typename SrcT, typename DstT, typename = void>
+struct CastOp {
+  static constexpr bool is_castable = cuda::std::is_convertible_v<SrcT, DstT>;
+
+  __device__ DstT operator()(SrcT x) {
+    return static_cast<DstT>(x);
+  }
+};
+
+// Converting a complex number to real number discards the imaginary part.
+// Selected for cuComplex -> DstT when DstT is not cuComplex; castability is
+// judged by whether float converts to DstT.
+template <typename DstT>
+struct CastOp<
+    cuComplex,
+    DstT,
+    cuda::std::enable_if_t<!cuda::std::is_same_v<cuComplex, DstT>>> {
+  static constexpr bool is_castable = cuda::std::is_convertible_v<float, DstT>;
+
+  __device__ DstT operator()(cuComplex x) {
+    static_assert(!cuda::std::is_same_v<cuComplex, DstT>);
+    return static_cast<DstT>(cuCrealf(x));
+  }
+};
+
+// Allow converting a real number to complex number.
+// Selected for SrcT -> cuComplex when SrcT is not cuComplex; the value becomes
+// the real part and the imaginary part is set to 0.
+template <typename SrcT>
+struct CastOp<
+    SrcT,
+    cuComplex,
+    cuda::std::enable_if_t<!cuda::std::is_same_v<SrcT, cuComplex>>> {
+  static constexpr bool is_castable = cuda::std::is_convertible_v<SrcT, float>;
+
+  __device__ cuComplex operator()(SrcT x) {
+    static_assert(!cuda::std::is_same_v<SrcT, cuComplex>);
+    return cuComplex{static_cast<float>(x), 0};
+  }
+};
+
+// Identity cast: selected when source and destination types are the same, so
+// the value is returned untouched.
+template <typename SrcT, typename DstT>
+struct CastOp<
+    SrcT,
+    DstT,
+    cuda::std::enable_if_t<cuda::std::is_same_v<SrcT, DstT>>> {
+  static constexpr bool is_castable = true;
+
+  __device__ SrcT operator()(SrcT x) {
+    return x;
+  }
+};
+
+// Return an iterator that casts each value to DstT using CastOp.
+// When the iterator's value type already is DstT the iterator is returned
+// unchanged; otherwise it is wrapped in a thrust transform iterator.
+template <typename DstT, typename Iterator>
+__host__ __device__ auto make_cast_iterator(Iterator it) {
+  using SrcT = typename cuda::std::iterator_traits<Iterator>::value_type;
+  // Use the cuda::std trait like the rest of this header.
+  if constexpr (cuda::std::is_same_v<SrcT, DstT>) {
+    return it;
+  } else {
+    return thrust::make_transform_iterator(it, CastOp<SrcT, DstT>{});
+  }
+}
+
+} // namespace mlx::core::cu
--- a/mlx/backend/cuda/device/config.h
+++ b/mlx/backend/cuda/device/config.h
@ -0,0 +1,12 @@
+// Copyright © 2025 Apple Inc.
+
+// This file is used by both CUDA kernel code and host-only C++ code.
+
+#pragma once
+
+// The maximum dimensions of shape/strides passed as kernel parameters.
+#define MAX_NDIM 10
+
+// All existing NVIDIA hardware has a fixed 32 warp size. Though a built-in
+// warpSize variable exists, using it would prevent compile-time optimizations.
+#define WARP_SIZE 32
--- a/mlx/backend/cuda/device/cucomplex_math.cuh
+++ b/mlx/backend/cuda/device/cucomplex_math.cuh
@ -0,0 +1,240 @@
+// Copyright © 2025 Apple Inc.
+// Copyright © 2017-2024 The Simons Foundation, Inc.
+//
+// FINUFFT is licensed under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance with the
+// License.  You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Forked from
+// https://github.com/flatironinstitute/finufft/blob/main/include/cufinufft/contrib/helper_math.h
+
+#pragma once
+
+#include <cuComplex.h>
+
+// This header provides some helper functions for cuComplex types.
+// It mainly wraps existing CUDA implementations to provide operator overloads
+// e.g. cuAdd, cuSub, cuMul, cuDiv, cuCreal, cuCimag, cuCabs, cuCarg, cuConj are
+// all provided by CUDA
+
+// Arithmetic operators for cuDoubleComplex; each one forwards to the matching
+// CUDA helper (cuCadd, cuCsub, cuCmul, cuCdiv).
+__forceinline__ __host__ __device__ cuDoubleComplex
+operator+(const cuDoubleComplex& a, const cuDoubleComplex& b) {
+  return cuCadd(a, b);
+}
+
+__forceinline__ __host__ __device__ cuDoubleComplex
+operator-(const cuDoubleComplex& a, const cuDoubleComplex& b) {
+  return cuCsub(a, b);
+}
+
+__forceinline__ __host__ __device__ cuDoubleComplex
+operator*(const cuDoubleComplex& a, const cuDoubleComplex& b) {
+  return cuCmul(a, b);
+}
+
+__forceinline__ __host__ __device__ cuDoubleComplex
+operator/(const cuDoubleComplex& a, const cuDoubleComplex& b) {
+  return cuCdiv(a, b);
+}
+
+// Component-wise floored modulo: the real and imaginary parts are reduced
+// independently as a - floor(a / b) * b.
+// Uses the double-precision floor(); floorf() would narrow the quotient to
+// float and lose precision for double operands.
+__forceinline__ __host__ __device__ cuDoubleComplex
+operator%(const cuDoubleComplex& a, const cuDoubleComplex& b) {
+  double r = cuCreal(a) - (floor(cuCreal(a) / cuCreal(b)) * cuCreal(b));
+  double i = cuCimag(a) - (floor(cuCimag(a) / cuCimag(b)) * cuCimag(b));
+  return make_cuDoubleComplex(r, i);
+}
+
+// Comparison operators for cuDoubleComplex. Equality is component-wise;
+// ordering compares magnitudes. Two different values with the same magnitude
+// therefore satisfy none of >, <, >=, <=.
+__forceinline__ __host__ __device__ bool operator==(
+    const cuDoubleComplex& a,
+    const cuDoubleComplex& b) {
+  return cuCreal(a) == cuCreal(b) && cuCimag(a) == cuCimag(b);
+}
+
+__forceinline__ __host__ __device__ bool operator!=(
+    const cuDoubleComplex& a,
+    const cuDoubleComplex& b) {
+  return !(a == b);
+}
+
+// True when |a| > |b|.
+__forceinline__ __host__ __device__ bool operator>(
+    const cuDoubleComplex& a,
+    const cuDoubleComplex& b) {
+  double mag_a = sqrt(cuCreal(a) * cuCreal(a) + cuCimag(a) * cuCimag(a));
+  double mag_b = sqrt(cuCreal(b) * cuCreal(b) + cuCimag(b) * cuCimag(b));
+  return mag_a > mag_b;
+}
+
+__forceinline__ __host__ __device__ bool operator>=(
+    const cuDoubleComplex& a,
+    const cuDoubleComplex& b) {
+  return a > b || a == b;
+}
+
+// Defined in terms of operator> with the operands swapped.
+__forceinline__ __host__ __device__ bool operator<(
+    const cuDoubleComplex& a,
+    const cuDoubleComplex& b) {
+  return b > a;
+}
+
+__forceinline__ __host__ __device__ bool operator<=(
+    const cuDoubleComplex& a,
+    const cuDoubleComplex& b) {
+  return b > a || a == b;
+}
+
+// Mixed cuDoubleComplex / double arithmetic. The scalar is treated as a
+// complex number with zero imaginary part.
+__forceinline__ __host__ __device__ cuDoubleComplex
+operator+(const cuDoubleComplex& a, double b) {
+  return make_cuDoubleComplex(cuCreal(a) + b, cuCimag(a));
+}
+
+__forceinline__ __host__ __device__ cuDoubleComplex
+operator+(double a, const cuDoubleComplex& b) {
+  return make_cuDoubleComplex(a + cuCreal(b), cuCimag(b));
+}
+
+__forceinline__ __host__ __device__ cuDoubleComplex
+operator-(const cuDoubleComplex& a, double b) {
+  return make_cuDoubleComplex(cuCreal(a) - b, cuCimag(a));
+}
+
+// Scalar minus complex negates the imaginary part.
+__forceinline__ __host__ __device__ cuDoubleComplex
+operator-(double a, const cuDoubleComplex& b) {
+  return make_cuDoubleComplex(a - cuCreal(b), -cuCimag(b));
+}
+
+__forceinline__ __host__ __device__ cuDoubleComplex
+operator*(const cuDoubleComplex& a, double b) {
+  return make_cuDoubleComplex(cuCreal(a) * b, cuCimag(a) * b);
+}
+
+__forceinline__ __host__ __device__ cuDoubleComplex
+operator*(double a, const cuDoubleComplex& b) {
+  return make_cuDoubleComplex(a * cuCreal(b), a * cuCimag(b));
+}
+
+__forceinline__ __host__ __device__ cuDoubleComplex
+operator/(const cuDoubleComplex& a, double b) {
+  return make_cuDoubleComplex(cuCreal(a) / b, cuCimag(a) / b);
+}
+
+// a / b computed as a * conj(b) / |b|^2.
+__forceinline__ __host__ __device__ cuDoubleComplex
+operator/(double a, const cuDoubleComplex& b) {
+  double denom = cuCreal(b) * cuCreal(b) + cuCimag(b) * cuCimag(b);
+  return make_cuDoubleComplex(
+      (a * cuCreal(b)) / denom, (-a * cuCimag(b)) / denom);
+}
+
+// Arithmetic operators for cuFloatComplex; each one forwards to the matching
+// CUDA helper (cuCaddf, cuCsubf, cuCmulf, cuCdivf).
+__forceinline__ __host__ __device__ cuFloatComplex
+operator+(const cuFloatComplex& a, const cuFloatComplex& b) {
+  return cuCaddf(a, b);
+}
+
+__forceinline__ __host__ __device__ cuFloatComplex
+operator-(const cuFloatComplex& a, const cuFloatComplex& b) {
+  return cuCsubf(a, b);
+}
+
+__forceinline__ __host__ __device__ cuFloatComplex
+operator*(const cuFloatComplex& a, const cuFloatComplex& b) {
+  return cuCmulf(a, b);
+}
+
+__forceinline__ __host__ __device__ cuFloatComplex
+operator/(const cuFloatComplex& a, const cuFloatComplex& b) {
+  return cuCdivf(a, b);
+}
+
+// Component-wise floored modulo: the real and imaginary parts are reduced
+// independently as a - floorf(a / b) * b.
+__forceinline__ __host__ __device__ cuFloatComplex
+operator%(const cuFloatComplex& a, const cuFloatComplex& b) {
+  float r = cuCrealf(a) - (floorf(cuCrealf(a) / cuCrealf(b)) * cuCrealf(b));
+  float i = cuCimagf(a) - (floorf(cuCimagf(a) / cuCimagf(b)) * cuCimagf(b));
+  return make_cuFloatComplex(r, i);
+}
+
+// Comparison operators for cuFloatComplex. Equality is component-wise;
+// ordering compares magnitudes. Two different values with the same magnitude
+// therefore satisfy none of >, <, >=, <=.
+__forceinline__ __host__ __device__ bool operator==(
+    const cuFloatComplex& a,
+    const cuFloatComplex& b) {
+  return cuCrealf(a) == cuCrealf(b) && cuCimagf(a) == cuCimagf(b);
+}
+
+__forceinline__ __host__ __device__ bool operator!=(
+    const cuFloatComplex& a,
+    const cuFloatComplex& b) {
+  return !(a == b);
+}
+
+// True when |a| > |b|.
+__forceinline__ __host__ __device__ bool operator>(
+    const cuFloatComplex& a,
+    const cuFloatComplex& b) {
+  float mag_a = sqrt(cuCrealf(a) * cuCrealf(a) + cuCimagf(a) * cuCimagf(a));
+  float mag_b = sqrt(cuCrealf(b) * cuCrealf(b) + cuCimagf(b) * cuCimagf(b));
+  return mag_a > mag_b;
+}
+
+__forceinline__ __host__ __device__ bool operator>=(
+    const cuFloatComplex& a,
+    const cuFloatComplex& b) {
+  return a > b || a == b;
+}
+
+// Defined in terms of operator> with the operands swapped.
+__forceinline__ __host__ __device__ bool operator<(
+    const cuFloatComplex& a,
+    const cuFloatComplex& b) {
+  return b > a;
+}
+
+__forceinline__ __host__ __device__ bool operator<=(
+    const cuFloatComplex& a,
+    const cuFloatComplex& b) {
+  return b > a || a == b;
+}
+
+// Mixed cuFloatComplex / float arithmetic. The scalar is treated as a complex
+// number with zero imaginary part.
+__forceinline__ __host__ __device__ cuFloatComplex
+operator+(const cuFloatComplex& a, float b) {
+  return make_cuFloatComplex(cuCrealf(a) + b, cuCimagf(a));
+}
+
+__forceinline__ __host__ __device__ cuFloatComplex
+operator+(float a, const cuFloatComplex& b) {
+  return make_cuFloatComplex(a + cuCrealf(b), cuCimagf(b));
+}
+
+__forceinline__ __host__ __device__ cuFloatComplex
+operator-(const cuFloatComplex& a, float b) {
+  return make_cuFloatComplex(cuCrealf(a) - b, cuCimagf(a));
+}
+
+// Scalar minus complex negates the imaginary part.
+__forceinline__ __host__ __device__ cuFloatComplex
+operator-(float a, const cuFloatComplex& b) {
+  return make_cuFloatComplex(a - cuCrealf(b), -cuCimagf(b));
+}
+
+__forceinline__ __host__ __device__ cuFloatComplex
+operator*(const cuFloatComplex& a, float b) {
+  return make_cuFloatComplex(cuCrealf(a) * b, cuCimagf(a) * b);
+}
+
+__forceinline__ __host__ __device__ cuFloatComplex
+operator*(float a, const cuFloatComplex& b) {
+  return make_cuFloatComplex(a * cuCrealf(b), a * cuCimagf(b));
+}
+
+__forceinline__ __host__ __device__ cuFloatComplex
+operator/(const cuFloatComplex& a, float b) {
+  return make_cuFloatComplex(cuCrealf(a) / b, cuCimagf(a) / b);
+}
+
+// a / b computed as a * conj(b) / |b|^2.
+__forceinline__ __host__ __device__ cuFloatComplex
+operator/(float a, const cuFloatComplex& b) {
+  float denom = cuCrealf(b) * cuCrealf(b) + cuCimagf(b) * cuCimagf(b);
+  return make_cuFloatComplex(
+      (a * cuCrealf(b)) / denom, (-a * cuCimagf(b)) / denom);
+}
--- a/mlx/backend/cuda/device/fp16_math.cuh
+++ b/mlx/backend/cuda/device/fp16_math.cuh
@ -0,0 +1,194 @@
+// Copyright © 2025 Apple Inc.
+
+#pragma once
+
+#include <cuda_bf16.h>
+#include <cuda_fp16.h>
+#include <cuda/std/limits>
+#include <cuda/std/type_traits>
+
+namespace mlx::core::cu {
+
+///////////////////////////////////////////////////////////////////////////////
+// Unary ops for half types.
+///////////////////////////////////////////////////////////////////////////////
+
+// MLX_DEFINE_UNARY_OP(NAME, HALF_OP) defines a device function template
+// NAME(x) inside this namespace. Half-precision inputs are dispatched to the
+// intrinsic HALF_OP(x); every other type is forwarded to the global ::NAME(x).
+// The return type is deduced from whichever branch is selected.
+//
+// When CUDART_VERSION < 12000 and __CUDA_ARCH__ < 800, only __half is routed
+// to HALF_OP; otherwise __nv_bfloat16 is routed to HALF_OP as well.
+#if CUDART_VERSION < 12000 && __CUDA_ARCH__ < 800
+#define MLX_DEFINE_UNARY_OP(NAME, HALF_OP)           \
+  template <typename T>                              \
+  __forceinline__ __device__ auto NAME(T x) {        \
+    if constexpr (cuda::std::is_same_v<T, __half>) { \
+      return HALF_OP(x);                             \
+    } else {                                         \
+      return ::NAME(x);                              \
+    }                                                \
+  }
+#else
+#define MLX_DEFINE_UNARY_OP(NAME, HALF_OP)                         \
+  template <typename T>                                            \
+  __forceinline__ __device__ auto NAME(T x) {                      \
+    if constexpr (cuda::std::is_same_v<T, __half>) {               \
+      return HALF_OP(x);                                           \
+    } else if constexpr (cuda::std::is_same_v<T, __nv_bfloat16>) { \
+      return HALF_OP(x);                                           \
+    } else {                                                       \
+      return ::NAME(x);                                            \
+    }                                                              \
+  }
+#endif
+
+// MLX_DEFINE_UNARY_OP_FALLBCK(NAME) defines a device function template
+// NAME(x) for ops that are evaluated through float for half types: __half
+// and __nv_bfloat16 inputs are converted to float before calling ::NAME, and
+// all other types are passed to ::NAME unchanged. The return type is deduced,
+// so the half branches return whatever ::NAME returns for a float argument.
+// NOTE(review): "FALLBCK" looks like a misspelling of "FALLBACK"; the name is
+// only used inside this header.
+#define MLX_DEFINE_UNARY_OP_FALLBCK(NAME)                          \
+  template <typename T>                                            \
+  __forceinline__ __device__ auto NAME(T x) {                      \
+    if constexpr (cuda::std::is_same_v<T, __half>) {               \
+      return ::NAME(__half2float(x));                              \
+    } else if constexpr (cuda::std::is_same_v<T, __nv_bfloat16>) { \
+      return ::NAME(__bfloat162float(x));                          \
+    } else {                                                       \
+      return ::NAME(x);                                            \
+    }                                                              \
+  }
+
+// Ops that have a half-precision intrinsic: half inputs call the intrinsic
+// named in the second argument.
+MLX_DEFINE_UNARY_OP(abs, __habs)
+MLX_DEFINE_UNARY_OP(ceil, hceil)
+MLX_DEFINE_UNARY_OP(cos, hcos)
+MLX_DEFINE_UNARY_OP(exp, hexp)
+MLX_DEFINE_UNARY_OP(floor, hfloor)
+MLX_DEFINE_UNARY_OP(isnan, __hisnan)
+MLX_DEFINE_UNARY_OP(log, hlog)
+MLX_DEFINE_UNARY_OP(log2, hlog2)
+MLX_DEFINE_UNARY_OP(log10, hlog10)
+MLX_DEFINE_UNARY_OP(rint, hrint)
+MLX_DEFINE_UNARY_OP(rsqrt, hrsqrt)
+MLX_DEFINE_UNARY_OP(sin, hsin)
+MLX_DEFINE_UNARY_OP(sqrt, hsqrt)
+// Ops that go through the float fallback for half inputs.
+MLX_DEFINE_UNARY_OP_FALLBCK(acos)
+MLX_DEFINE_UNARY_OP_FALLBCK(acosh)
+MLX_DEFINE_UNARY_OP_FALLBCK(asin)
+MLX_DEFINE_UNARY_OP_FALLBCK(asinh)
+MLX_DEFINE_UNARY_OP_FALLBCK(atan)
+MLX_DEFINE_UNARY_OP_FALLBCK(atanh)
+MLX_DEFINE_UNARY_OP_FALLBCK(cosh)
+MLX_DEFINE_UNARY_OP_FALLBCK(log1p)
+MLX_DEFINE_UNARY_OP_FALLBCK(sinh)
+MLX_DEFINE_UNARY_OP_FALLBCK(tan)
+// tanh uses the htanh intrinsic only when the architecture check below
+// passes; otherwise it takes the float fallback like the ops above.
+// NOTE(review): confirm that 1280 is the intended __CUDA_ARCH__ threshold.
+#if __CUDA_ARCH__ >= 1280
+MLX_DEFINE_UNARY_OP(tanh, htanh)
+#else
+MLX_DEFINE_UNARY_OP_FALLBCK(tanh)
+#endif
+
+// The helper macros are not visible past this point.
+#undef MLX_DEFINE_UNARY_OP
+#undef MLX_DEFINE_UNARY_OP_FALLBCK
+
+///////////////////////////////////////////////////////////////////////////////
+// Binary ops for half types.
+///////////////////////////////////////////////////////////////////////////////
+
+// MLX_DEFINE_BINARY_OP(NAME, HALF_OP) defines a device function template
+// NAME(x, y) taking two arguments of the same type T. Half-precision inputs
+// are dispatched to the intrinsic HALF_OP(x, y); every other type is
+// forwarded to the global ::NAME(x, y).
+//
+// When CUDART_VERSION < 12000 and __CUDA_ARCH__ < 800, only __half is routed
+// to HALF_OP; otherwise __nv_bfloat16 is routed to HALF_OP as well.
+#if CUDART_VERSION < 12000 && __CUDA_ARCH__ < 800
+#define MLX_DEFINE_BINARY_OP(NAME, HALF_OP)          \
+  template <typename T>                              \
+  __forceinline__ __device__ auto NAME(T x, T y) {   \
+    if constexpr (cuda::std::is_same_v<T, __half>) { \
+      return HALF_OP(x, y);                          \
+    } else {                                         \
+      return ::NAME(x, y);                           \
+    }                                                \
+  }
+#else
+#define MLX_DEFINE_BINARY_OP(NAME, HALF_OP)                        \
+  template <typename T>                                            \
+  __forceinline__ __device__ auto NAME(T x, T y) {                 \
+    if constexpr (cuda::std::is_same_v<T, __half>) {               \
+      return HALF_OP(x, y);                                        \
+    } else if constexpr (cuda::std::is_same_v<T, __nv_bfloat16>) { \
+      return HALF_OP(x, y);                                        \
+    } else {                                                       \
+      return ::NAME(x, y);                                         \
+    }                                                              \
+  }
+#endif
+
+// max/min wrappers; half inputs use the __hmax/__hmin intrinsics.
+MLX_DEFINE_BINARY_OP(max, __hmax)
+MLX_DEFINE_BINARY_OP(min, __hmin)
+
+// The helper macro is not visible past this point.
+#undef MLX_DEFINE_BINARY_OP
+
+// Floating-point remainder that also accepts half types.
+//
+// __half inputs are converted to float, passed to ::fmod, and the result is
+// converted back to __half. __nv_bfloat16 gets the same treatment, but that
+// branch is only compiled when CUDART_VERSION >= 12000 or
+// __CUDA_ARCH__ >= 800; otherwise bfloat16 falls into the generic branch.
+// All other types are forwarded to ::fmod directly.
+template <typename T>
+__forceinline__ __device__ T fmod(T x, T y) {
+  if constexpr (cuda::std::is_same_v<T, __half>) {
+    return __float2half(::fmod(__half2float(x), __half2float(y)));
+#if CUDART_VERSION >= 12000 || __CUDA_ARCH__ >= 800
+  } else if constexpr (cuda::std::is_same_v<T, __nv_bfloat16>) {
+    return __float2bfloat16(::fmod(__bfloat162float(x), __bfloat162float(y)));
+#endif
+  } else {
+    return ::fmod(x, y);
+  }
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// Additional C++ operator overloads between half types and native types.
+///////////////////////////////////////////////////////////////////////////////
+
+// True when T is an integral type that is not the same type as U.
+template <typename T, typename U>
+constexpr bool is_integral_except =
+    !cuda::std::is_same_v<T, U> && cuda::std::is_integral_v<T>;
+
+// True when T is an arithmetic type that is not the same type as U.
+template <typename T, typename U>
+constexpr bool is_arithmetic_except =
+    !cuda::std::is_same_v<T, U> && cuda::std::is_arithmetic_v<T>;
+
+// MLX_DEFINE_HALF_OP(HALF, HALF2FLOAT, FLOAT2HALF, OP) defines the binary
+// arithmetic operator OP between the half type HALF and any integral type T
+// other than HALF, in both operand orders. Both operands are converted to
+// float (HALF2FLOAT for the half operand, static_cast for the integral one),
+// combined with OP in float, and the result is converted back with
+// FLOAT2HALF. Operand order is preserved in both overloads.
+#define MLX_DEFINE_HALF_OP(HALF, HALF2FLOAT, FLOAT2HALF, OP)          \
+  template <                                                          \
+      typename T,                                                     \
+      typename = cuda::std::enable_if_t<is_integral_except<T, HALF>>> \
+  __forceinline__ __device__ HALF operator OP(HALF x, T y) {          \
+    return FLOAT2HALF(HALF2FLOAT(x) OP static_cast<float>(y));        \
+  }                                                                   \
+  template <                                                          \
+      typename T,                                                     \
+      typename = cuda::std::enable_if_t<is_integral_except<T, HALF>>> \
+  __forceinline__ __device__ HALF operator OP(T x, HALF y) {          \
+    return FLOAT2HALF(static_cast<float>(x) OP HALF2FLOAT(y));        \
+  }
+
+// MLX_DEFINE_HALF_CMP(HALF, HALF2FLOAT, OP) defines the comparison operator
+// OP between the half type HALF and any arithmetic type T other than HALF,
+// in both operand orders. Both operands are converted to float and compared
+// there.
+//
+// In each overload the half operand is converted with HALF2FLOAT and the
+// other operand with static_cast<float>, and the operands stay in source
+// order: `t OP h` must evaluate float(t) OP float(h), not the reverse,
+// otherwise the ordering comparisons (<, >, <=, >=) return the wrong result.
+#define MLX_DEFINE_HALF_CMP(HALF, HALF2FLOAT, OP)                       \
+  template <                                                            \
+      typename T,                                                       \
+      typename = cuda::std::enable_if_t<is_arithmetic_except<T, HALF>>> \
+  __forceinline__ __device__ bool operator OP(HALF x, T y) {            \
+    return HALF2FLOAT(x) OP static_cast<float>(y);                      \
+  }                                                                     \
+  template <                                                            \
+      typename T,                                                       \
+      typename = cuda::std::enable_if_t<is_arithmetic_except<T, HALF>>> \
+  __forceinline__ __device__ bool operator OP(T x, HALF y) {            \
+    return static_cast<float>(x) OP HALF2FLOAT(y);                      \
+  }
+
+// Arithmetic operators between __half / __nv_bfloat16 and integral types.
+MLX_DEFINE_HALF_OP(__half, __half2float, __float2half, +)
+MLX_DEFINE_HALF_OP(__half, __half2float, __float2half, -)
+MLX_DEFINE_HALF_OP(__half, __half2float, __float2half, *)
+MLX_DEFINE_HALF_OP(__half, __half2float, __float2half, /)
+MLX_DEFINE_HALF_OP(__nv_bfloat16, __bfloat162float, __float2bfloat16, +)
+MLX_DEFINE_HALF_OP(__nv_bfloat16, __bfloat162float, __float2bfloat16, -)
+MLX_DEFINE_HALF_OP(__nv_bfloat16, __bfloat162float, __float2bfloat16, *)
+MLX_DEFINE_HALF_OP(__nv_bfloat16, __bfloat162float, __float2bfloat16, /)
+// Comparison operators between __half / __nv_bfloat16 and arithmetic types.
+MLX_DEFINE_HALF_CMP(__half, __half2float, <)
+MLX_DEFINE_HALF_CMP(__half, __half2float, >)
+MLX_DEFINE_HALF_CMP(__half, __half2float, <=)
+MLX_DEFINE_HALF_CMP(__half, __half2float, >=)
+MLX_DEFINE_HALF_CMP(__half, __half2float, ==)
+MLX_DEFINE_HALF_CMP(__half, __half2float, !=)
+MLX_DEFINE_HALF_CMP(__nv_bfloat16, __bfloat162float, <)
+MLX_DEFINE_HALF_CMP(__nv_bfloat16, __bfloat162float, >)
+MLX_DEFINE_HALF_CMP(__nv_bfloat16, __bfloat162float, <=)
+MLX_DEFINE_HALF_CMP(__nv_bfloat16, __bfloat162float, >=)
+MLX_DEFINE_HALF_CMP(__nv_bfloat16, __bfloat162float, ==)
+MLX_DEFINE_HALF_CMP(__nv_bfloat16, __bfloat162float, !=)
+
+// The helper macros are not visible past this point.
+#undef MLX_DEFINE_HALF_OP
+#undef MLX_DEFINE_HALF_CMP
+
+} // namespace mlx::core::cu
--- a/mlx/backend/cuda/device/gather.cuh
+++ b/mlx/backend/cuda/device/gather.cuh
@ -0,0 +1,53 @@
+// Copyright © 2025 Apple Inc.
+
+#include "mlx/backend/cuda/device/indexing.cuh"
+#include "mlx/backend/cuda/device/utils.cuh"
+
+#include <cooperative_groups.h>
+
+namespace mlx::core::cu {
+
+namespace cg = cooperative_groups;
+
+// General gather kernel: each thread whose grid rank is below `size` writes
+// one element of `out`.
+//
+// The flat output index is split by `slice_size` into a position inside the
+// slice and a position in the index arrays. The slice position is mapped to
+// a source offset through `slice_sizes` / `src_strides`; then, for each of
+// the NIDX index arrays, the (possibly negative) index value is resolved
+// against the size of its axis and added to the offset, scaled by that
+// axis' source stride.
+template <typename T, typename IdxT, int NIDX, int IDX_NDIM, typename LocT>
+__global__ void gather(
+    const T* src,
+    T* out,
+    LocT size,
+    const __grid_constant__ Shape src_shape,
+    const __grid_constant__ Strides src_strides,
+    int32_t src_ndim,
+    const __grid_constant__ Shape slice_sizes,
+    uint32_t slice_size,
+    const __grid_constant__ cuda::std::array<int32_t, NIDX> axes,
+    const __grid_constant__ cuda::std::array<IdxT*, NIDX> indices,
+    const __grid_constant__ cuda::std::array<int32_t, NIDX * IDX_NDIM>
+        indices_shape,
+    const __grid_constant__ cuda::std::array<int64_t, NIDX * IDX_NDIM>
+        indices_strides) {
+  LocT tid = cg::this_grid().thread_rank();
+  if (tid >= size) {
+    // Surplus thread beyond the number of output elements.
+    return;
+  }
+
+  // Position inside the slice, and which slice (index tuple) this is.
+  LocT slice_pos = tid % slice_size;
+  LocT index_pos = tid / slice_size;
+
+  // Offset contributed by the position inside the slice.
+  LocT offset =
+      elem_to_loc(slice_pos, slice_sizes.data(), src_strides.data(), src_ndim);
+
+  // Offset contributed by each indexed axis.
+#pragma unroll
+  for (int n = 0; n < NIDX; ++n) {
+    LocT where = elem_to_loc_nd<IDX_NDIM>(
+        index_pos,
+        indices_shape.data() + n * IDX_NDIM,
+        indices_strides.data() + n * IDX_NDIM);
+    int32_t ax = axes[n];
+    LocT resolved = absolute_index(indices[n][where], src_shape[ax]);
+    offset += resolved * src_strides[ax];
+  }
+
+  out[tid] = src[offset];
+}
+
+} // namespace mlx::core::cu
--- a/mlx/backend/cuda/device/gather_axis.cuh
+++ b/mlx/backend/cuda/device/gather_axis.cuh
@ -0,0 +1,65 @@
+// Copyright © 2025 Apple Inc.
+
+#include "mlx/backend/cuda/device/indexing.cuh"
+#include "mlx/backend/cuda/device/utils.cuh"
+
+#include <cooperative_groups.h>
+
+namespace mlx::core::cu {
+
+namespace cg = cooperative_groups;
+
+// Gather along a single axis: each thread whose grid rank is below
+// idx_size_pre * idx_size_axis * idx_size_post writes one element of `out`.
+//
+// The flat thread rank is decomposed with index_to_dims into a position in
+// the trailing ("post") extent, a position along the indexed axis, and a
+// position in the leading ("pre") extent. SrcC / IdxC select the
+// closed-form offset arithmetic for `src` / `indices`; when false the
+// offset is computed from the strides with elem_to_loc_nd.
+template <
+    typename T,
+    typename IdxT,
+    int NDIM,
+    bool SrcC,
+    bool IdxC,
+    typename LocT>
+__global__ void gather_axis(
+    const T* src,
+    const IdxT* indices,
+    T* out,
+    LocT idx_size_pre,
+    LocT idx_size_axis,
+    LocT idx_size_post,
+    const __grid_constant__ cuda::std::array<int32_t, NDIM> shape,
+    const __grid_constant__ cuda::std::array<int64_t, NDIM> src_strides,
+    const __grid_constant__ cuda::std::array<int64_t, NDIM> idx_strides,
+    int32_t axis,
+    int32_t axis_size,
+    int64_t src_stride_axis,
+    int64_t idx_stride_axis) {
+  LocT tid = cg::this_grid().thread_rank();
+  if (tid >= idx_size_pre * idx_size_axis * idx_size_post) {
+    // Surplus thread beyond the number of index elements.
+    return;
+  }
+
+  auto [post_pos, axis_pos, pre_pos] =
+      index_to_dims(tid, idx_size_axis, idx_size_pre);
+
+  LocT pre_base = pre_pos * idx_size_post;
+
+  // Locate the index value to read.
+  LocT idx_loc = axis_pos * idx_stride_axis;
+  if constexpr (!IdxC) {
+    idx_loc += elem_to_loc_nd<NDIM>(
+        pre_base + post_pos, shape.data(), idx_strides.data());
+  } else {
+    idx_loc += pre_base * idx_size_axis + post_pos;
+  }
+
+  // Resolve a possibly negative index against the axis size.
+  auto resolved = absolute_index(indices[idx_loc], axis_size);
+
+  // Locate the source element selected by that index.
+  LocT src_loc = resolved * src_stride_axis;
+  if constexpr (!SrcC) {
+    src_loc += elem_to_loc_nd<NDIM>(
+        pre_base + post_pos, shape.data(), src_strides.data());
+  } else {
+    src_loc += pre_base * axis_size + post_pos;
+  }
+
+  LocT dst = axis_pos * idx_size_post + pre_base * idx_size_axis + post_pos;
+
+  out[dst] = src[src_loc];
+}
+
+} // namespace mlx::core::cu
--- a/mlx/backend/cuda/device/indexing.cuh
+++ b/mlx/backend/cuda/device/indexing.cuh
@ -0,0 +1,30 @@
+// Copyright © 2025 Apple Inc.
+
+#include <cuda/std/tuple>
+#include <cuda/std/type_traits>
+
+namespace mlx::core::cu {
+
+// Split a flat index into (x, y, z) coordinates of a 3d grid, inverting
+//   index = x * dim1 * dim2 + y * dim2 + z
+// where dim1 is the extent of y and dim2 is the extent of z.
+template <typename T>
+inline __host__ __device__ cuda::std::tuple<T, T, T>
+index_to_dims(T index, T dim1, T dim2) {
+  // Number of elements covered by one step of x.
+  const auto plane = dim1 * dim2;
+  const T x = index / plane;
+  const T y = (index % plane) / dim2;
+  const T z = index % dim2;
+  return cuda::std::tuple<T, T, T>(x, y, z);
+}
+
+// Resolve an index that may be negative into a non-wrapped position.
+// Unsigned index types are returned unchanged (and keep their type); for all
+// other index types a negative idx has `size` added to it and the result is
+// returned as int32_t. No bounds check is performed.
+template <typename IdxT>
+inline __host__ __device__ auto absolute_index(IdxT idx, int32_t size) {
+  if constexpr (!cuda::std::is_unsigned_v<IdxT>) {
+    if (idx < 0) {
+      return static_cast<int32_t>(idx + size);
+    }
+    return static_cast<int32_t>(idx);
+  } else {
+    return idx;
+  }
+}
+
+} // namespace mlx::core::cu
--- a/mlx/backend/cuda/device/scatter.cuh
+++ b/mlx/backend/cuda/device/scatter.cuh
@ -0,0 +1,68 @@
+// Copyright © 2025 Apple Inc.
+
+#include "mlx/backend/cuda/device/indexing.cuh"
+#include "mlx/backend/cuda/device/scatter_ops.cuh"
+#include "mlx/backend/cuda/device/utils.cuh"
+
+#include <cooperative_groups.h>
+
+namespace mlx::core::cu {
+
+namespace cg = cooperative_groups;
+
+// General scatter kernel: each thread whose grid rank is below `size`
+// applies Op to one element of `upd` and one location in `out`.
+//
+// The flat update index is split by `upd_post_idx_size` into a position
+// inside the update slice and a position in the index arrays. The slice
+// position is mapped to an output offset using the update shape past its
+// first IDX_NDIM dimensions together with `out_strides`; then, for each of
+// the NIDX index arrays, the (possibly negative) index value is resolved
+// against the size of its output axis and added, scaled by that axis'
+// output stride.
+template <
+    typename T,
+    typename IdxT,
+    typename Op,
+    int NIDX,
+    int IDX_NDIM,
+    typename LocT>
+__global__ void scatter(
+    const T* upd,
+    T* out,
+    LocT size,
+    const __grid_constant__ Shape upd_shape,
+    const __grid_constant__ Strides upd_strides,
+    int32_t upd_ndim,
+    LocT upd_post_idx_size,
+    const __grid_constant__ Shape out_shape,
+    const __grid_constant__ Strides out_strides,
+    int32_t out_ndim,
+    const __grid_constant__ cuda::std::array<int32_t, NIDX> axes,
+    const __grid_constant__ cuda::std::array<IdxT*, NIDX> indices,
+    const __grid_constant__ cuda::std::array<int32_t, NIDX * IDX_NDIM>
+        indices_shape,
+    const __grid_constant__ cuda::std::array<int64_t, NIDX * IDX_NDIM>
+        indices_strides) {
+  LocT tid = cg::this_grid().thread_rank();
+  if (tid >= size) {
+    // Surplus thread beyond the number of update elements.
+    return;
+  }
+
+  // Position inside the update slice, and which index tuple this is.
+  LocT slice_pos = tid % upd_post_idx_size;
+  LocT index_pos = tid / upd_post_idx_size;
+
+  // Output offset contributed by the position inside the slice.
+  LocT dst = elem_to_loc(
+      slice_pos, upd_shape.data() + IDX_NDIM, out_strides.data(), out_ndim);
+
+  // Output offset contributed by each indexed axis.
+#pragma unroll
+  for (int n = 0; n < NIDX; ++n) {
+    LocT where = elem_to_loc_nd<IDX_NDIM>(
+        index_pos,
+        indices_shape.data() + n * IDX_NDIM,
+        indices_strides.data() + n * IDX_NDIM);
+    int32_t ax = axes[n];
+    LocT resolved = absolute_index(indices[n][where], out_shape[ax]);
+    dst += resolved * out_strides[ax];
+  }
+
+  // Location of the update value to read, honoring the update strides.
+  LocT upd_loc = elem_to_loc(
+      slice_pos + index_pos * upd_post_idx_size,
+      upd_shape.data(),
+      upd_strides.data(),
+      upd_ndim);
+
+  Op{}(out + dst, upd[upd_loc]);
+}
+
+} // namespace mlx::core::cu
--- a/mlx/backend/cuda/device/scatter_axis.cuh
+++ b/mlx/backend/cuda/device/scatter_axis.cuh
@ -0,0 +1,67 @@
+// Copyright © 2025 Apple Inc.
+
+#include "mlx/backend/cuda/device/indexing.cuh"
+#include "mlx/backend/cuda/device/scatter_ops.cuh"
+#include "mlx/backend/cuda/device/utils.cuh"
+
+#include <cooperative_groups.h>
+
+namespace mlx::core::cu {
+
+namespace cg = cooperative_groups;
+
+// Scatter along a single axis: each thread whose grid rank is below
+// idx_size_pre * idx_size_axis * idx_size_post applies Op to one element of
+// `upd` and one location in `out`.
+//
+// The flat thread rank is decomposed with index_to_dims into a position in
+// the trailing ("post") extent, a position along the indexed axis, and a
+// position in the leading ("pre") extent. UpdC / IdxC select the
+// closed-form offset arithmetic for `upd` / `indices`; when false the
+// offset is computed from the strides with elem_to_loc_nd.
+template <
+    typename T,
+    typename IdxT,
+    typename Op,
+    int NDIM,
+    bool UpdC,
+    bool IdxC,
+    typename LocT>
+__global__ void scatter_axis(
+    const T* upd,
+    const IdxT* indices,
+    T* out,
+    LocT idx_size_pre,
+    LocT idx_size_axis,
+    LocT idx_size_post,
+    const __grid_constant__ cuda::std::array<int32_t, NDIM> shape,
+    const __grid_constant__ cuda::std::array<int64_t, NDIM> upd_strides,
+    const __grid_constant__ cuda::std::array<int64_t, NDIM> idx_strides,
+    int32_t axis,
+    int32_t axis_size,
+    int64_t upd_stride_axis,
+    int64_t idx_stride_axis) {
+  LocT tid = cg::this_grid().thread_rank();
+  if (tid >= idx_size_pre * idx_size_axis * idx_size_post) {
+    // Surplus thread beyond the number of index elements.
+    return;
+  }
+
+  auto [post_pos, axis_pos, pre_pos] =
+      index_to_dims(tid, idx_size_axis, idx_size_pre);
+
+  LocT pre_base = pre_pos * idx_size_post;
+
+  // Locate the index value to read.
+  LocT idx_loc = axis_pos * idx_stride_axis;
+  if constexpr (!IdxC) {
+    idx_loc += elem_to_loc_nd<NDIM>(
+        pre_base + post_pos, shape.data(), idx_strides.data());
+  } else {
+    idx_loc += pre_base * idx_size_axis + post_pos;
+  }
+
+  // Resolve a possibly negative index against the axis size.
+  auto resolved = absolute_index(indices[idx_loc], axis_size);
+
+  // Locate the update value to read.
+  LocT upd_loc = axis_pos * upd_stride_axis;
+  if constexpr (!UpdC) {
+    upd_loc += elem_to_loc_nd<NDIM>(
+        pre_base + post_pos, shape.data(), upd_strides.data());
+  } else {
+    upd_loc += pre_base * idx_size_axis + post_pos;
+  }
+
+  LocT dst = resolved * idx_size_post + pre_base * axis_size + post_pos;
+
+  Op{}(out + dst, upd[upd_loc]);
+}
+
+} // namespace mlx::core::cu
--- a/mlx/backend/cuda/device/scatter_ops.cuh
+++ b/mlx/backend/cuda/device/scatter_ops.cuh
@ -0,0 +1,44 @@
+// Copyright © 2025 Apple Inc.
+
+#pragma once
+
+#include "mlx/backend/cuda/device/atomic_ops.cuh"
+
+namespace mlx::core::cu {
+
+// Scatter op that overwrites the destination with the update value. This is
+// a plain (non-atomic) store.
+struct ScatterAssign {
+  template <typename T>
+  __device__ void operator()(T* out, T val) const {
+    *out = val;
+  }
+};
+
+// Scatter op that accumulates the update value into the destination by
+// calling atomic_add(out, val).
+struct ScatterSum {
+  template <typename T>
+  __device__ void operator()(T* out, T val) const {
+    atomic_add(out, val);
+  }
+};
+
+// Scatter op that multiplies the destination by the update value by calling
+// atomic_prod(out, val).
+struct ScatterProd {
+  template <typename T>
+  __device__ void operator()(T* out, T val) const {
+    atomic_prod(out, val);
+  }
+};
+
+// Scatter op that combines the destination with the update value by calling
+// atomic_max(out, val).
+struct ScatterMax {
+  template <typename T>
+  __device__ void operator()(T* out, T val) const {
+    atomic_max(out, val);
+  }
+};
+
+// Scatter op that combines the destination with the update value by calling
+// atomic_min(out, val).
+struct ScatterMin {
+  template <typename T>
+  __device__ void operator()(T* out, T val) const {
+    atomic_min(out, val);
+  }
+};
+
+} // namespace mlx::core::cu
--- a/mlx/backend/cuda/device/ternary_ops.cuh
+++ b/mlx/backend/cuda/device/ternary_ops.cuh
@ -0,0 +1,13 @@
+// Copyright © 2025 Apple Inc.
+#pragma once
+
+namespace mlx::core::cu {
+
+// Ternary select functor: returns x when condition is true, otherwise y.
+struct Select {
+  template <typename T>
+  __device__ T operator()(bool condition, T x, T y) {
+    return condition ? x : y;
+  }
+};
+
+} // namespace mlx::core::cu
--- a/mlx/backend/cuda/device/unary_ops.cuh
+++ b/mlx/backend/cuda/device/unary_ops.cuh
@ -0,0 +1,368 @@
+// Copyright © 2025 Apple Inc.
+
+#pragma once
+
+#include "mlx/backend/cuda/device/fp16_math.cuh"
+#include "mlx/backend/cuda/device/utils.cuh"
+
+#include <math_constants.h>
+
+namespace mlx::core::cu {
+
+// Absolute value functor. Complex inputs yield their magnitude in the real
+// part with a zero imaginary part, unsigned inputs are returned unchanged,
+// and every other type is forwarded to abs().
+struct Abs {
+  template <typename T>
+  __device__ T operator()(T x) {
+    if constexpr (cuda::std::is_same_v<T, cuComplex>) {
+      const float re = cuCrealf(x);
+      const float im = cuCimagf(x);
+      return {sqrt(re * re + im * im), 0};
+    } else if constexpr (cuda::std::is_unsigned_v<T>) {
+      return x;
+    } else {
+      return abs(x);
+    }
+  }
+};
+
+// Inverse cosine functor: forwards x to acos().
+struct ArcCos {
+  template <typename T>
+  __device__ T operator()(T x) {
+    return acos(x);
+  }
+};
+
+// Inverse hyperbolic cosine functor: forwards x to acosh().
+struct ArcCosh {
+  template <typename T>
+  __device__ T operator()(T x) {
+    return acosh(x);
+  }
+};
+
+// Inverse sine functor: forwards x to asin().
+struct ArcSin {
+  template <typename T>
+  __device__ T operator()(T x) {
+    return asin(x);
+  }
+};
+
+// Inverse hyperbolic sine functor: forwards x to asinh().
+struct ArcSinh {
+  template <typename T>
+  __device__ T operator()(T x) {
+    return asinh(x);
+  }
+};
+
+// Inverse tangent functor: forwards x to atan().
+struct ArcTan {
+  template <typename T>
+  __device__ T operator()(T x) {
+    return atan(x);
+  }
+};
+
+// Inverse hyperbolic tangent functor: forwards x to atanh().
+struct ArcTanh {
+  template <typename T>
+  __device__ T operator()(T x) {
+    return atanh(x);
+  }
+};
+
+// Bitwise NOT functor: returns ~x converted back to T.
+struct BitwiseInvert {
+  template <typename T>
+  __device__ T operator()(T x) {
+    return ~x;
+  }
+};
+
+// Ceiling functor. Integral inputs are already whole numbers and are
+// returned unchanged; every other type is forwarded to ceil().
+struct Ceil {
+  template <typename T>
+  __device__ T operator()(T x) {
+    if constexpr (!cuda::std::is_integral_v<T>) {
+      return ceil(x);
+    } else {
+      return x;
+    }
+  }
+};
+
+// Complex conjugate functor: keeps the real part and negates the imaginary
+// part. Only defined for cuComplex.
+struct Conjugate {
+  __device__ cuComplex operator()(cuComplex x) {
+    return {cuCrealf(x), -cuCimagf(x)};
+  }
+};
+
+// Cosine functor. For complex x = a + bi it evaluates
+//   cos(x) = cos(a) cosh(b) - i sin(a) sinh(b);
+// every other type is forwarded to cos().
+struct Cos {
+  template <typename T>
+  __device__ T operator()(T x) {
+    if constexpr (!cuda::std::is_same_v<T, cuComplex>) {
+      return cos(x);
+    } else {
+      const float re = cuCrealf(x);
+      const float im = cuCimagf(x);
+      return {cos(re) * cosh(im), -sin(re) * sinh(im)};
+    }
+  }
+};
+
+// Hyperbolic cosine functor. For complex x = a + bi it evaluates
+//   cosh(x) = cosh(a) cos(b) + i sinh(a) sin(b);
+// every other type is forwarded to cosh().
+struct Cosh {
+  template <typename T>
+  __device__ T operator()(T x) {
+    if constexpr (!cuda::std::is_same_v<T, cuComplex>) {
+      return cosh(x);
+    } else {
+      const float re = cuCrealf(x);
+      const float im = cuCimagf(x);
+      return {cosh(re) * cos(im), sinh(re) * sin(im)};
+    }
+  }
+};
+
+// Error function functor. __half and __nv_bfloat16 inputs are widened to
+// float before calling erf() and the result is converted back to T on
+// return; every other type is passed to erf() directly.
+struct Erf {
+  template <typename T>
+  __device__ T operator()(T x) {
+    if constexpr (cuda::std::is_same_v<T, __nv_bfloat16>) {
+      const float wide = __bfloat162float(x);
+      return erf(wide);
+    } else if constexpr (cuda::std::is_same_v<T, __half>) {
+      const float wide = __half2float(x);
+      return erf(wide);
+    } else {
+      return erf(x);
+    }
+  }
+};
+
+// Inverse error function functor. __half and __nv_bfloat16 inputs are
+// widened to float before calling erfinv() and the result is converted back
+// to T on return; every other type is passed to erfinv() directly.
+struct ErfInv {
+  template <typename T>
+  __device__ T operator()(T x) {
+    if constexpr (cuda::std::is_same_v<T, __nv_bfloat16>) {
+      const float wide = __bfloat162float(x);
+      return erfinv(wide);
+    } else if constexpr (cuda::std::is_same_v<T, __half>) {
+      const float wide = __half2float(x);
+      return erfinv(wide);
+    } else {
+      return erfinv(x);
+    }
+  }
+};
+
+// Exponential functor. For complex x = a + bi it evaluates Euler's formula
+//   exp(x) = exp(a) * (cos(b) + i sin(b));
+// every other type is forwarded to exp().
+struct Exp {
+  template <typename T>
+  __device__ T operator()(T x) {
+    if constexpr (cuda::std::is_same_v<T, cuComplex>) {
+      auto m = exp(cuCrealf(x));
+      // The imaginary part uses sin, not sinh: the angle b is a rotation.
+      return {m * cos(cuCimagf(x)), m * sin(cuCimagf(x))};
+    } else {
+      return exp(x);
+    }
+  }
+};
+
+// exp(x) - 1 functor. __half and __nv_bfloat16 inputs are widened to float
+// before calling expm1() and the result is converted back to T on return;
+// every other type is passed to expm1() directly.
+struct Expm1 {
+  template <typename T>
+  __device__ T operator()(T x) {
+    if constexpr (cuda::std::is_same_v<T, __nv_bfloat16>) {
+      const float wide = __bfloat162float(x);
+      return expm1(wide);
+    } else if constexpr (cuda::std::is_same_v<T, __half>) {
+      const float wide = __half2float(x);
+      return expm1(wide);
+    } else {
+      return expm1(x);
+    }
+  }
+};
+
+// Floor functor. Integral inputs are already whole numbers and are returned
+// unchanged; every other type is forwarded to floor().
+struct Floor {
+  template <typename T>
+  __device__ T operator()(T x) {
+    if constexpr (!cuda::std::is_integral_v<T>) {
+      return floor(x);
+    } else {
+      return x;
+    }
+  }
+};
+
+// Returns the imaginary part of a complex value as a float.
+struct Imag {
+  __device__ float operator()(cuComplex x) {
+    return cuCimagf(x);
+  }
+};
+
+// Natural logarithm functor. For complex x it returns
+//   log|x| + i * atan2(Im(x), Re(x)),
+// taking |x| from the real part of Abs{}(x); every other type is forwarded
+// to log().
+struct Log {
+  template <typename T>
+  __device__ T operator()(T x) {
+    if constexpr (!cuda::std::is_same_v<T, cuComplex>) {
+      return log(x);
+    } else {
+      const float magnitude = cuCrealf(Abs{}(x));
+      auto real_part = log(magnitude);
+      auto angle = atan2f(cuCimagf(x), cuCrealf(x));
+      return {real_part, angle};
+    }
+  }
+};
+
+// Base-2 logarithm functor. For complex x the natural log from Log{} is
+// divided component-wise by CUDART_LN2_F; every other type is forwarded to
+// log2().
+struct Log2 {
+  template <typename T>
+  __device__ T operator()(T x) {
+    if constexpr (!cuda::std::is_same_v<T, cuComplex>) {
+      return log2(x);
+    } else {
+      const T ln = Log{}(x);
+      return {cuCrealf(ln) / CUDART_LN2_F, cuCimagf(ln) / CUDART_LN2_F};
+    }
+  }
+};
+
+// Base-10 logarithm functor. For complex x the natural log from Log{} is
+// divided component-wise by CUDART_LNT_F; every other type is forwarded to
+// log10().
+struct Log10 {
+  template <typename T>
+  __device__ T operator()(T x) {
+    if constexpr (cuda::std::is_same_v<T, cuComplex>) {
+      auto y = Log{}(x);
+      return {cuCrealf(y) / CUDART_LNT_F, cuCimagf(y) / CUDART_LNT_F};
+    } else {
+      return log10(x);
+    }
+  }
+};
+
+// log(1 + x) functor: forwards x to log1p().
+struct Log1p {
+  template <typename T>
+  __device__ T operator()(T x) {
+    return log1p(x);
+  }
+};
+
+// Logical negation functor; takes and returns bool.
+struct LogicalNot {
+  __device__ bool operator()(bool x) {
+    return !x;
+  }
+};
+
+// Negation functor. Complex inputs are negated by subtracting from zero
+// (0 - x); every other type uses unary minus.
+struct Negative {
+  template <typename T>
+  __device__ T operator()(T x) {
+    if constexpr (!cuda::std::is_same_v<T, cuComplex>) {
+      return -x;
+    } else {
+      return 0 - x;
+    }
+  }
+};
+
+// Returns the real part of a complex value as a float.
+struct Real {
+  __device__ float operator()(cuComplex x) {
+    return cuCrealf(x);
+  }
+};
+
+// Rounding functor built on rint(). Complex inputs have their real and
+// imaginary parts rounded independently; every other type is forwarded to
+// rint().
+struct Round {
+  template <typename T>
+  __device__ T operator()(T x) {
+    if constexpr (!cuda::std::is_same_v<T, cuComplex>) {
+      return rint(x);
+    } else {
+      auto re = rint(cuCrealf(x));
+      auto im = rint(cuCimagf(x));
+      return {re, im};
+    }
+  }
+};
+
+// Reciprocal square root functor: forwards x to rsqrt().
+struct Rsqrt {
+  template <typename T>
+  __device__ T operator()(T x) {
+    return rsqrt(x);
+  }
+};
+
+// Logistic sigmoid functor. The value is computed for |x| as
+// y = 1 / (1 + exp(-|x|)), so exp() only ever sees a non-positive argument;
+// negative inputs are then mapped with sigmoid(x) = 1 - sigmoid(-x).
+struct Sigmoid {
+  template <typename T>
+  __device__ T operator()(T x) {
+    const auto magnitude = abs(x);
+    T y = 1 / (1 + exp(-magnitude));
+    if (x < 0) {
+      return 1 - y;
+    }
+    return y;
+  }
+};
+
+// Sign functor.
+//  - Unsigned types: 1 for non-zero input, 0 for zero.
+//  - Complex: zero is returned unchanged, otherwise x divided by Abs()(x).
+//  - __nv_bfloat16: (x > 0) - (x < 0), converted through float.
+//  - Everything else: (x > 0) - (x < 0).
+struct Sign {
+  template <typename T>
+  __device__ T operator()(T x) {
+    if constexpr (cuda::std::is_unsigned_v<T>) {
+      return x != 0;
+    } else if constexpr (cuda::std::is_same_v<T, cuComplex>) {
+      const bool is_zero = cuCrealf(x) == 0 && cuCimagf(x) == 0;
+      if (is_zero) {
+        // Avoid dividing zero by a zero magnitude.
+        return x;
+      }
+      return x / Abs()(x);
+    } else if constexpr (cuda::std::is_same_v<T, __nv_bfloat16>) {
+      return static_cast<float>((x > T(0.f)) - (x < T(0.f)));
+    } else {
+      return (x > T(0)) - (x < T(0));
+    }
+  }
+};
+
+// Sine functor. For complex x = a + bi it evaluates
+//   sin(x) = sin(a) cosh(b) + i cos(a) sinh(b);
+// every other type is forwarded to sin().
+struct Sin {
+  template <typename T>
+  __device__ T operator()(T x) {
+    if constexpr (!cuda::std::is_same_v<T, cuComplex>) {
+      return sin(x);
+    } else {
+      const float re = cuCrealf(x);
+      const float im = cuCimagf(x);
+      return {sin(re) * cosh(im), cos(re) * sinh(im)};
+    }
+  }
+};
+
+// Hyperbolic sine functor. For complex x = a + bi it evaluates
+//   sinh(x) = sinh(a) cos(b) + i cosh(a) sin(b);
+// every other type is forwarded to sinh().
+struct Sinh {
+  template <typename T>
+  __device__ T operator()(T x) {
+    if constexpr (!cuda::std::is_same_v<T, cuComplex>) {
+      return sinh(x);
+    } else {
+      const float re = cuCrealf(x);
+      const float im = cuCimagf(x);
+      return {sinh(re) * cos(im), cosh(re) * sin(im)};
+    }
+  }
+};
+
+// Squaring functor: returns x * x using T's own multiplication operator.
+struct Square {
+  template <typename T>
+  __device__ T operator()(T x) {
+    return x * x;
+  }
+};
+
+// Square root functor: forwards x to sqrt().
+struct Sqrt {
+  template <typename T>
+  __device__ T operator()(T x) {
+    return sqrt(x);
+  }
+};
+
+// Tangent functor. For complex x = a + bi, with p = tan(a), q = tanh(b) and
+// t = p * q, it evaluates
+//   tan(x) = (p + i q) / (1 - i t)
+//          = ((p - q t) + i (q + p t)) / (1 + t^2);
+// every other type is forwarded to tan().
+struct Tan {
+  template <typename T>
+  __device__ T operator()(T x) {
+    if constexpr (cuda::std::is_same_v<T, cuComplex>) {
+      float tan_a = tan(cuCrealf(x));
+      float tanh_b = tanh(cuCimagf(x));
+      float t1 = tan_a * tanh_b;
+      // Single-precision literal: a double literal here would promote the
+      // addition to double-precision arithmetic in device code.
+      float denom = 1.0f + t1 * t1;
+      return {(tan_a - tanh_b * t1) / denom, (tanh_b + tan_a * t1) / denom};
+    } else {
+      return tan(x);
+    }
+  }
+};
+
+// Hyperbolic tangent functor. For complex x = a + bi, with p = tanh(a),
+// q = tan(b) and t = p * q, it evaluates
+//   tanh(x) = (p + i q) / (1 + i t)
+//           = ((p + q t) + i (q - p t)) / (1 + t^2);
+// every other type is forwarded to tanh().
+struct Tanh {
+  template <typename T>
+  __device__ T operator()(T x) {
+    if constexpr (cuda::std::is_same_v<T, cuComplex>) {
+      float tanh_a = tanh(cuCrealf(x));
+      float tan_b = tan(cuCimagf(x));
+      float t1 = tanh_a * tan_b;
+      // Single-precision literal: a double literal here would promote the
+      // addition to double-precision arithmetic in device code.
+      float denom = 1.0f + t1 * t1;
+      return {(tanh_a + tan_b * t1) / denom, (tan_b - tanh_a * t1) / denom};
+    } else {
+      return tanh(x);
+    }
+  }
+};
+
+} // namespace mlx::core::cu
--- a/mlx/backend/cuda/device/utils.cuh
+++ b/mlx/backend/cuda/device/utils.cuh
@ -0,0 +1,358 @@
+// Copyright © 2025 Apple Inc.
+
+// This file must not include any host-only code; utilities that work under
+// both host and device can be put here.
+//
+// See more about the requirements at:
+// https://docs.nvidia.com/cuda/nvrtc/#language
+
+#pragma once
+
+#include "mlx/backend/cuda/device/config.h"
+
+#include <cuComplex.h>
+#include <cuda_bf16.h>
+#include <cuda_fp16.h>
+#include <cuda/std/array>
+#include <cuda/std/limits>
+#include <cuda/std/tuple>
+
+namespace mlx::core::cu {
+
+///////////////////////////////////////////////////////////////////////////////
+// CUDA kernel utils
+///////////////////////////////////////////////////////////////////////////////
+
+// To pass shape/strides to kernels via constant memory, their size must be
+// known at compile time.
+using Shape = cuda::std::array<int32_t, MAX_NDIM>;
+using Strides = cuda::std::array<int64_t, MAX_NDIM>;
+
+///////////////////////////////////////////////////////////////////////////////
+// Type limits utils
+///////////////////////////////////////////////////////////////////////////////
+
+// Generic limits: all four bounds come straight from
+// cuda::std::numeric_limits<T>, so the "finite" variants equal max()/min().
+template <typename T, typename = void>
+struct Limits {
+  static constexpr __host__ __device__ T max() {
+    return cuda::std::numeric_limits<T>::max();
+  }
+  static constexpr __host__ __device__ T min() {
+    return cuda::std::numeric_limits<T>::min();
+  }
+  static constexpr __host__ __device__ T finite_max() {
+    return Limits::max();
+  }
+  static constexpr __host__ __device__ T finite_min() {
+    return Limits::min();
+  }
+};
+
+// float/double limits: max()/min() are +/- infinity, while the finite_*
+// variants are the largest and lowest representable finite values.
+template <typename T>
+struct Limits<
+    T,
+    cuda::std::enable_if_t<
+        cuda::std::is_same_v<T, float> || cuda::std::is_same_v<T, double>>> {
+  static constexpr __host__ __device__ T finite_max() {
+    return cuda::std::numeric_limits<T>::max();
+  }
+  static constexpr __host__ __device__ T finite_min() {
+    return cuda::std::numeric_limits<T>::lowest();
+  }
+  static constexpr __host__ __device__ T max() {
+    return cuda::std::numeric_limits<T>::infinity();
+  }
+  static constexpr __host__ __device__ T min() {
+    return -Limits::max();
+  }
+};
+
+// Limits for the half-precision types (__half and __nv_bfloat16).
+//
+// CUDA 11 does not have host side arithmetic operators for half types, so
+// the negated values below are computed in float when compiling host code
+// with a CUDA runtime older than 12.0 and then converted to T on return.
+template <typename T>
+struct Limits<
+    T,
+    cuda::std::enable_if_t<
+        cuda::std::is_same_v<T, __half> ||
+        cuda::std::is_same_v<T, __nv_bfloat16>>> {
+  // Positive infinity.
+  static constexpr __host__ __device__ T max() {
+    return cuda::std::numeric_limits<T>::infinity();
+  }
+  // Negative infinity.
+  static constexpr __host__ __device__ T min() {
+#if defined(__CUDA_ARCH__) || CUDART_VERSION >= 12000
+    return -cuda::std::numeric_limits<T>::infinity();
+#else
+    return -cuda::std::numeric_limits<float>::infinity();
+#endif
+  }
+  // Largest finite value of T.
+  static constexpr __host__ __device__ T finite_max() {
+    return cuda::std::numeric_limits<T>::max();
+  }
+  // Lowest finite value. NOTE(review): the fallback branch returns the lowest
+  // *float* converted to T, which is presumably outside T's finite range --
+  // confirm this is acceptable for CUDA 11 host builds.
+  static constexpr __host__ __device__ T finite_min() {
+#if defined(__CUDA_ARCH__) || CUDART_VERSION >= 12000
+    return cuda::std::numeric_limits<T>::lowest();
+#else
+    return cuda::std::numeric_limits<float>::lowest();
+#endif
+  }
+};
+
+// bool limits: true is the maximum and false the minimum. No finite_*
+// members are provided by this specialization.
+template <>
+struct Limits<bool> {
+  static constexpr __host__ __device__ bool max() {
+    return true;
+  }
+  static constexpr __host__ __device__ bool min() {
+    return false;
+  }
+};
+
+// Complex limits: both the real and imaginary components take the
+// corresponding Limits<float> bound.
+template <>
+struct Limits<cuComplex> {
+  static constexpr __host__ __device__ cuComplex max() {
+    const float bound = Limits<float>::max();
+    return {bound, bound};
+  }
+  static constexpr __host__ __device__ cuComplex min() {
+    const float bound = Limits<float>::min();
+    return {bound, bound};
+  }
+};
+
+///////////////////////////////////////////////////////////////////////////////
+// Indexing utils
+///////////////////////////////////////////////////////////////////////////////
+
+// Convert a linear element index into a strided memory offset by peeling off
+// one coordinate per axis, starting from the innermost axis. The walk stops
+// early once the remaining index is zero.
+template <typename IdxT = int64_t>
+inline __host__ __device__ IdxT
+elem_to_loc(IdxT elem, const int* shape, const int64_t* strides, int ndim) {
+  IdxT loc = 0;
+  int axis = ndim - 1;
+  while (axis >= 0 && elem > 0) {
+    const int extent = shape[axis];
+    const auto coord = elem % extent;
+    loc += coord * IdxT(strides[axis]);
+    elem /= extent;
+    --axis;
+  }
+  return loc;
+}
+
+// Variant for a rank known at compile time: the loop has a fixed trip count
+// of NDIM and is marked for unrolling.
+template <int NDIM, typename IdxT = int64_t>
+inline __host__ __device__ IdxT
+elem_to_loc_nd(IdxT elem, const int* shape, const int64_t* strides) {
+  IdxT loc = 0;
+#pragma unroll
+  for (int step = 0; step < NDIM; ++step) {
+    // Visit axes from innermost to outermost.
+    const int axis = NDIM - 1 - step;
+    const int extent = shape[axis];
+    const auto coord = elem % extent;
+    loc += coord * IdxT(strides[axis]);
+    elem /= extent;
+  }
+  return loc;
+}
+
+// Two-array variant: decomposes `elem` once and applies each coordinate to
+// both stride sets, returning the pair of offsets.
+template <int NDIM, typename IdxT = int64_t>
+inline __host__ __device__ cuda::std::tuple<IdxT, IdxT> elem_to_loc_nd(
+    IdxT elem,
+    const int* shape,
+    const int64_t* a_strides,
+    const int64_t* b_strides) {
+  IdxT loc_a = 0;
+  IdxT loc_b = 0;
+#pragma unroll
+  for (int step = 0; step < NDIM; ++step) {
+    // Visit axes from innermost to outermost.
+    const int axis = NDIM - 1 - step;
+    const int extent = shape[axis];
+    int coord = elem % extent;
+    loc_a += coord * IdxT(a_strides[axis]);
+    loc_b += coord * IdxT(b_strides[axis]);
+    elem /= extent;
+  }
+  return cuda::std::make_tuple(loc_a, loc_b);
+}
+
+// Three-array variant: decomposes `elem` once and applies each coordinate to
+// all three stride sets, returning the triple of offsets.
+template <int NDIM, typename IdxT = int64_t>
+inline __host__ __device__ cuda::std::tuple<IdxT, IdxT, IdxT> elem_to_loc_nd(
+    IdxT elem,
+    const int* shape,
+    const int64_t* a_strides,
+    const int64_t* b_strides,
+    const int64_t* c_strides) {
+  IdxT loc_a = 0;
+  IdxT loc_b = 0;
+  IdxT loc_c = 0;
+#pragma unroll
+  for (int step = 0; step < NDIM; ++step) {
+    // Visit axes from innermost to outermost.
+    const int axis = NDIM - 1 - step;
+    const int extent = shape[axis];
+    int coord = elem % extent;
+    loc_a += coord * IdxT(a_strides[axis]);
+    loc_b += coord * IdxT(b_strides[axis]);
+    loc_c += coord * IdxT(c_strides[axis]);
+    elem /= extent;
+  }
+  return cuda::std::make_tuple(loc_a, loc_b, loc_c);
+}
+
+// Optimized version when ndim is larger than 4.
+// Walks every axis from innermost to outermost (no early exit), turning the
+// linear index `elem` into a strided offset.
+template <typename IdxT = int64_t>
+inline __host__ __device__ IdxT
+elem_to_loc_4d(IdxT elem, const int* shape, const int64_t* strides, int ndim) {
+  IdxT loc = 0;
+  for (int axis = ndim; axis-- > 0;) {
+    const int extent = shape[axis];
+    const auto coord = elem % extent;
+    loc += coord * IdxT(strides[axis]);
+    elem /= extent;
+  }
+  return loc;
+}
+
+// Two-array variant with a runtime rank: one decomposition of `elem` feeds
+// both stride sets.
+template <typename IdxT = int64_t>
+inline __host__ __device__ cuda::std::tuple<IdxT, IdxT> elem_to_loc_4d(
+    IdxT elem,
+    const int* shape,
+    const int64_t* a_strides,
+    const int64_t* b_strides,
+    int ndim) {
+  IdxT loc_a = 0;
+  IdxT loc_b = 0;
+  for (int axis = ndim; axis-- > 0;) {
+    const int extent = shape[axis];
+    int coord = elem % extent;
+    loc_a += coord * IdxT(a_strides[axis]);
+    loc_b += coord * IdxT(b_strides[axis]);
+    elem /= extent;
+  }
+  return cuda::std::make_tuple(loc_a, loc_b);
+}
+
+// Three-array variant with a runtime rank: one decomposition of `elem` feeds
+// all three stride sets.
+template <typename IdxT = int64_t>
+inline __host__ __device__ cuda::std::tuple<IdxT, IdxT, IdxT> elem_to_loc_4d(
+    IdxT elem,
+    const int* shape,
+    const int64_t* a_strides,
+    const int64_t* b_strides,
+    const int64_t* c_strides,
+    int ndim) {
+  IdxT loc_a = 0;
+  IdxT loc_b = 0;
+  IdxT loc_c = 0;
+  for (int axis = ndim; axis-- > 0;) {
+    const int extent = shape[axis];
+    int coord = elem % extent;
+    loc_a += coord * IdxT(a_strides[axis]);
+    loc_b += coord * IdxT(b_strides[axis]);
+    loc_c += coord * IdxT(c_strides[axis]);
+    elem /= extent;
+  }
+  return cuda::std::make_tuple(loc_a, loc_b, loc_c);
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// Elem to loc in a loop utils
+///////////////////////////////////////////////////////////////////////////////
+
+// Incrementally tracks the strided offset of an element while iterating over
+// the last `dim` axes of an array, so the offset does not have to be
+// recomputed from scratch on every step.
+//
+// This primary template handles axis `dim - 1` itself and delegates the
+// remaining (outer) axes to a nested LoopedElemToLoc<DIM - 1>.
+template <int DIM, bool General = true, typename OffsetT = size_t>
+struct LoopedElemToLoc {
+  // Number of axes being iterated; 0 turns next() into a no-op.
+  int dim;
+  // Looper for the axes before `dim - 1`.
+  LoopedElemToLoc<DIM - 1, General, OffsetT> inner_looper;
+  // Current strided offset.
+  OffsetT offset{0};
+  // Current coordinate along axis `dim - 1`.
+  int index{0};
+
+  __device__ LoopedElemToLoc(int dim) : dim(dim), inner_looper(dim - 1) {}
+
+  // Advance by one element.
+  __device__ void next(const int* shape, const int64_t* strides) {
+    if (dim == 0) {
+      return;
+    }
+    index++;
+    offset += OffsetT(strides[dim - 1]);
+    if (index >= shape[dim - 1]) {
+      // Wrapped around this axis: carry into the outer axes and restart from
+      // the offset they report.
+      index = 0;
+      inner_looper.next(shape, strides);
+      offset = inner_looper.offset;
+    }
+  }
+
+  // Advance by `n` elements.
+  __device__ void next(int n, const int* shape, const int64_t* strides) {
+    if (dim == 0) {
+      return;
+    }
+    index += n;
+    offset += n * OffsetT(strides[dim - 1]);
+
+    if (index >= shape[dim - 1]) {
+      // `extra` is how far past the end of this axis we stepped.
+      int extra = index - shape[dim - 1];
+      if (extra >= shape[dim - 1]) {
+        // Overshot by one or more full rows: carry all of them at once.
+        inner_looper.next(1 + extra / shape[dim - 1], shape, strides);
+        extra = extra % shape[dim - 1];
+      } else {
+        inner_looper.next(shape, strides);
+      }
+      index = 0;
+      offset = inner_looper.offset;
+      // Re-apply the remainder along this axis from the new row start.
+      if (extra > 0) {
+        next(extra, shape, strides);
+      }
+    }
+  }
+
+  // Current strided offset.
+  __device__ OffsetT location() {
+    return offset;
+  }
+};
+
+// Base case of the recursion for the general layout. `index` is kept as a
+// linear element index; when more than one axis remains (dim > 1) the offset
+// is recomputed with elem_to_loc, otherwise a single stride is applied.
+template <typename OffsetT>
+struct LoopedElemToLoc<1, true, OffsetT> {
+  // Number of axes this looper is responsible for.
+  int dim;
+  // Current strided offset.
+  OffsetT offset{0};
+  // Linear element index over the `dim` axes.
+  int index{0};
+
+  __device__ LoopedElemToLoc(int dim) : dim(dim) {}
+
+  // Advance by one element.
+  __device__ void next(const int* shape, const int64_t* strides) {
+    index++;
+    if (dim > 1) {
+      offset = elem_to_loc<OffsetT>(index, shape, strides, dim);
+    } else {
+      offset += OffsetT(strides[0]);
+    }
+  }
+
+  // Advance by `n` elements.
+  __device__ void next(int n, const int* shape, const int64_t* strides) {
+    index += n;
+    if (dim > 1) {
+      offset = elem_to_loc<OffsetT>(index, shape, strides, dim);
+    } else {
+      offset = index * OffsetT(strides[0]);
+    }
+  }
+
+  // Current strided offset.
+  __device__ OffsetT location() {
+    return offset;
+  }
+};
+
+// Base case for the non-general layout: only strides[0] is ever applied, so
+// neither the shape nor the dimension count is stored or consulted.
+template <typename OffsetT>
+struct LoopedElemToLoc<1, false, OffsetT> {
+  // Current strided offset.
+  OffsetT offset{0};
+
+  // The dimension argument is accepted for interface parity and ignored.
+  __device__ LoopedElemToLoc(int) {}
+
+  // Advance by one element.
+  __device__ void next(const int*, const int64_t* strides) {
+    offset += OffsetT(strides[0]);
+  }
+
+  // Advance by `n` elements.
+  __device__ void next(int n, const int*, const int64_t* strides) {
+    offset += n * OffsetT(strides[0]);
+  }
+
+  // Current strided offset.
+  __device__ OffsetT location() {
+    return offset;
+  }
+};
+
+// Complex log(1 + z). The imaginary part is always atan2(im, re + 1); the
+// real part uses log1pf for small |z| and a plain log of |1 + z| otherwise.
+inline __device__ cuComplex log1p(cuComplex in) {
+  const float re = cuCrealf(in);
+  const float im = cuCimagf(in);
+  const float magnitude = sqrt(re * re + im * im);
+  const float angle = atan2f(im, re + 1);
+
+  if (!(magnitude < 0.5f)) {
+    auto shifted_abs = sqrt((re + 1) * (re + 1) + im * im);
+    return {log(shifted_abs), angle};
+  }
+
+  // |1 + z|^2 - 1, written so that it does not cancel for small z.
+  const float r = re * (2 + re) + im * im;
+  if (r == 0) { // handle underflow
+    return {re, angle};
+  }
+  return {0.5f * log1pf(r), angle};
+}
+
+} // namespace mlx::core::cu
--- a/mlx/backend/cuda/eval.cpp
+++ b/mlx/backend/cuda/eval.cpp
@ -0,0 +1,68 @@
+// Copyright © 2025 Apple Inc.
+
+#include "mlx/backend/gpu/eval.h"
+#include "mlx/backend/cuda/allocator.h"
+#include "mlx/backend/cuda/device.h"
+#include "mlx/backend/gpu/available.h"
+#include "mlx/primitives.h"
+
+#include <nvtx3/nvtx3.hpp>
+
+namespace mlx::core::gpu {
+
+// Reports whether the GPU backend can be used; this implementation always
+// answers true.
+bool is_available() {
+  return true;
+}
+
+// Performs the one-time setup needed when stream `s` is created.
+void new_stream(Stream s) {
+  // Force initialization of CUDA, so that the CUDA runtime is destroyed last.
+  cudaFree(nullptr);
+  // Ensure the static stream objects get created.
+  cu::get_command_encoder(s);
+  // The main thread is safe to free buffers.
+  cu::allocator().register_this_thread();
+}
+
+// Evaluates `arr` by running its primitive's eval_gpu, then keeps the buffers
+// the encoded work refers to alive until that work completes.
+void eval(array& arr) {
+  nvtx3::scoped_range r("gpu::eval");
+  auto outputs = arr.outputs();
+  {
+    // If the array is a tracer hold a reference
+    // to its inputs so they don't get donated
+    std::vector<array> inputs;
+    if (arr.is_tracer()) {
+      inputs = arr.inputs();
+    }
+    arr.primitive().eval_gpu(arr.inputs(), outputs);
+  }
+
+  auto& encoder = cu::get_command_encoder(arr.primitive().stream());
+  if (encoder.has_gpu_work()) {
+    // Keep used buffers alive until kernel finishes running.
+    std::unordered_set<std::shared_ptr<array::Data>> buffers;
+    for (auto& in : arr.inputs()) {
+      buffers.insert(in.data_shared_ptr());
+    }
+    for (auto& s : arr.siblings()) {
+      buffers.insert(s.data_shared_ptr());
+    }
+    // Remove the output if it was donated to by an input.
+    if (auto it = buffers.find(arr.data_shared_ptr()); it != buffers.end()) {
+      buffers.erase(it);
+    }
+    // The handler body is empty on purpose: capturing `buffers` is what
+    // extends the lifetime of the shared pointers.
+    encoder.add_completed_handler([buffers = std::move(buffers)]() {});
+  }
+  // Called whether or not GPU work was recorded.
+  encoder.end_encoding();
+}
+
+// Commits the command encoder that belongs to stream `s`.
+void finalize(Stream s) {
+  nvtx3::scoped_range r("gpu::finalize");
+  auto& encoder = cu::get_command_encoder(s);
+  encoder.commit();
+}
+
+// Synchronizes the command encoder that belongs to stream `s`.
+void synchronize(Stream s) {
+  nvtx3::scoped_range r("gpu::synchronize");
+  auto& encoder = cu::get_command_encoder(s);
+  encoder.synchronize();
+}
+
+} // namespace mlx::core::gpu
--- a/mlx/backend/cuda/event.cu
+++ b/mlx/backend/cuda/event.cu
@ -0,0 +1,269 @@
+// Copyright © 2024 Apple Inc.
+
+#include "mlx/backend/cuda/allocator.h"
+#include "mlx/backend/cuda/device.h"
+#include "mlx/backend/cuda/event.h"
+#include "mlx/backend/cuda/utils.h"
+#include "mlx/event.h"
+#include "mlx/scheduler.h"
+
+#include <nvtx3/nvtx3.hpp>
+
+namespace mlx::core {
+
+namespace cu {
+
+///////////////////////////////////////////////////////////////////////////////
+// CudaEvent implementations
+///////////////////////////////////////////////////////////////////////////////
+
+// Cuda event managed with RAII: the event is created in the constructor and
+// destroyed in the destructor. The handle is non-copyable so the event is
+// destroyed exactly once.
+class CudaEventHandle {
+ public:
+  // Timing is disabled and blocking sync is requested for the event.
+  CudaEventHandle() {
+    CHECK_CUDA_ERROR(cudaEventCreateWithFlags(
+        &event_, cudaEventDisableTiming | cudaEventBlockingSync));
+  }
+
+  // NOTE(review): if CHECK_CUDA_ERROR throws on failure, throwing from this
+  // destructor would terminate the program -- confirm that is intended.
+  ~CudaEventHandle() {
+    CHECK_CUDA_ERROR(cudaEventDestroy(event_));
+  }
+
+  CudaEventHandle(const CudaEventHandle&) = delete;
+  CudaEventHandle& operator=(const CudaEventHandle&) = delete;
+
+  // Implicit conversion so the handle can be passed directly to CUDA APIs.
+  operator cudaEvent_t() const {
+    return event_;
+  }
+
+ private:
+  cudaEvent_t event_;
+};
+
+// Creates the underlying CUDA event; the handle is held through a shared_ptr.
+CudaEvent::CudaEvent() : event_(std::make_shared<CudaEventHandle>()) {}
+
+// Blocks the calling host thread until the recorded event has completed.
+// Throws std::runtime_error if record() has not been called yet.
+void CudaEvent::wait() {
+  nvtx3::scoped_range r("cu::CudaEvent::wait");
+  if (!recorded_) {
+    throw std::runtime_error("Should not wait on a CudaEvent before record.");
+  }
+  // Check the result like the other CUDA calls in this file instead of
+  // silently dropping a failed synchronization.
+  CHECK_CUDA_ERROR(cudaEventSynchronize(*event_));
+}
+
+// Makes `stream` wait for the recorded event without blocking the host.
+// Throws std::runtime_error if record() has not been called yet.
+void CudaEvent::wait(cudaStream_t stream) {
+  if (!recorded_) {
+    throw std::runtime_error("Should not wait on a CudaEvent before record.");
+  }
+  // Check the result like the other CUDA calls in this file instead of
+  // silently dropping a failed wait.
+  CHECK_CUDA_ERROR(cudaStreamWaitEvent(stream, *event_));
+}
+
+// Makes stream `s` wait for the event. A CPU stream gets a task enqueued that
+// performs the blocking host wait; any other stream waits on its last CUDA
+// stream.
+void CudaEvent::wait(Stream s) {
+  const bool on_cpu = s.device == mlx::core::Device::cpu;
+  if (!on_cpu) {
+    wait(cu::get_stream(s).last_cuda_stream());
+    return;
+  }
+  scheduler::enqueue(s, [*this]() mutable { wait(); });
+}
+
+// Records the event on `stream`. The event is only marked as recorded after
+// the CUDA call succeeded, so a failed record cannot make later waits look
+// valid.
+void CudaEvent::record(cudaStream_t stream) {
+  CHECK_CUDA_ERROR(cudaEventRecord(*event_, stream));
+  recorded_ = true;
+}
+
+// Records the event on the last CUDA stream of `s`.
+// Throws std::runtime_error for CPU streams, which a CudaEvent cannot be
+// recorded on.
+void CudaEvent::record(Stream s) {
+  if (s.device == mlx::core::Device::cpu) {
+    // The message names the operation that actually failed (record).
+    throw std::runtime_error("CudaEvent can not record on cpu stream.");
+  } else {
+    record(cu::get_stream(s).last_cuda_stream());
+  }
+}
+
+// True when querying the event reports cudaSuccess.
+bool CudaEvent::completed() const {
+  const cudaError_t status = cudaEventQuery(*event_);
+  return status == cudaSuccess;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// SharedEvent implementations
+///////////////////////////////////////////////////////////////////////////////
+
+namespace {
+
+// Returns once the atomic holds at least `value`; between checks it sleeps on
+// the atomic until the last observed value changes.
+__host__ __device__ void event_wait(SharedEvent::Atomic* ac, uint64_t value) {
+  for (uint64_t seen = ac->load(); seen < value; seen = ac->load()) {
+    ac->wait(seen);
+  }
+}
+
+// Publishes `value` and wakes every waiter on the atomic.
+__host__ __device__ void event_signal(SharedEvent::Atomic* ac, uint64_t value) {
+  ac->store(value);
+  ac->notify_all();
+}
+
+// Kernel wrapper so the wait can be launched on a CUDA stream.
+__global__ void event_wait_kernel(SharedEvent::Atomic* ac, uint64_t value) {
+  event_wait(ac, value);
+}
+
+// Kernel wrapper so the signal can be launched on a CUDA stream.
+__global__ void event_signal_kernel(SharedEvent::Atomic* ac, uint64_t value) {
+  event_signal(ac, value);
+}
+
+} // namespace
+
+// Creates the shared counter, starting at 0, in CUDA managed memory.
+SharedEvent::SharedEvent() {
+  // Allocate cuda::atomic on managed memory.
+  Atomic* ac;
+  CHECK_CUDA_ERROR(cudaMallocManaged(&ac, sizeof(Atomic)));
+  // Placement-new: the storage is raw, so construct the atomic explicitly.
+  new (ac) Atomic(0);
+  // The custom deleter mirrors the construction: destroy, then release the
+  // memory through the allocator.
+  // NOTE(review): the memory comes from cudaMallocManaged but is released
+  // with allocator().cuda_free -- confirm the two are compatible.
+  ac_ = std::shared_ptr<Atomic>(ac, [](Atomic* ptr) {
+    ptr->~Atomic();
+    allocator().cuda_free(ptr);
+  });
+}
+
+// Blocks the calling host thread until the counter reaches `value`.
+void SharedEvent::wait(uint64_t value) {
+  nvtx3::scoped_range r("cu::SharedEvent::wait");
+  Atomic* counter = ac_.get();
+  event_wait(counter, value);
+}
+
+// Launches a single-thread kernel on `stream` that waits until the counter
+// reaches `value`.
+void SharedEvent::wait(cudaStream_t stream, uint64_t value) {
+  event_wait_kernel<<<1, 1, 0, stream>>>(ac_.get(), value);
+}
+
+// Makes stream `s` wait until the counter reaches `value`. CPU streams get a
+// host-side wait enqueued; other streams get the wait kernel encoded.
+void SharedEvent::wait(Stream s, uint64_t value) {
+  nvtx3::scoped_range r("cu::SharedEvent::wait(s)");
+  if (s.device == mlx::core::Device::cpu) {
+    scheduler::enqueue(s, [*this, value]() mutable { wait(value); });
+    return;
+  }
+
+  auto& encoder = get_command_encoder(s);
+  auto launch = [this, value](cudaStream_t stream) { wait(stream, value); };
+  encoder.launch_kernel(encoder.stream().last_cuda_stream(), launch);
+  // Empty handler: capturing `ac_` keeps the atomic alive until completion.
+  encoder.add_completed_handler([ac = ac_]() {});
+  encoder.end_encoding();
+}
+
+// Stores `value` into the counter from the host and wakes all waiters.
+void SharedEvent::signal(uint64_t value) {
+  nvtx3::scoped_range r("cu::SharedEvent::signal");
+  Atomic* counter = ac_.get();
+  event_signal(counter, value);
+}
+
+// Launches a single-thread kernel on `stream` that stores `value` into the
+// counter and notifies all waiters.
+void SharedEvent::signal(cudaStream_t stream, uint64_t value) {
+  event_signal_kernel<<<1, 1, 0, stream>>>(ac_.get(), value);
+}
+
+// Signals `value` in the order of stream `s`. For CPU streams a task is
+// enqueued that launches the signal kernel on a dedicated GPU stream; for
+// other streams the signal kernel is encoded directly.
+void SharedEvent::signal(Stream s, uint64_t value) {
+  nvtx3::scoped_range r("cu::SharedEvent::signal(s)");
+  if (s.device == mlx::core::Device::cpu) {
+    // Signal through a GPU stream so the atomic is updated in GPU - updating
+    // the atomic in CPU sometimes does not get GPU notified.
+    static CudaStream stream(device(mlx::core::Device::gpu));
+    scheduler::enqueue(s, [*this, value]() mutable { signal(stream, value); });
+    return;
+  }
+
+  auto& encoder = get_command_encoder(s);
+  auto launch = [this, value](cudaStream_t stream) { signal(stream, value); };
+  encoder.launch_kernel(encoder.stream().last_cuda_stream(), launch);
+  // Empty handler: capturing `ac_` keeps the atomic alive until completion.
+  encoder.add_completed_handler([ac = ac_]() {});
+  encoder.end_encoding();
+}
+
+// True once the counter has reached (or passed) `value`.
+bool SharedEvent::is_signaled(uint64_t value) const {
+  nvtx3::scoped_range r("cu::SharedEvent::is_signaled");
+  const uint64_t current = ac_->load();
+  return current >= value;
+}
+
+// Returns the counter's current value.
+uint64_t SharedEvent::value() const {
+  nvtx3::scoped_range r("cu::SharedEvent::value");
+  const uint64_t current = ac_->load();
+  return current;
+}
+
+} // namespace cu
+
+///////////////////////////////////////////////////////////////////////////////
+// Event implementations
+///////////////////////////////////////////////////////////////////////////////
+
+namespace {
+
+// Backing state of an Event: exactly one of the two implementations is
+// created, lazily, on the first call to ensure_created().
+struct EventImpl {
+  // CudaEvent is preferred when possible because it is fast, however we have
+  // to fallback to SharedEvent in following cases:
+  // 1. the event is used to wait/signal a cpu stream;
+  // 2. signal value other than 1 has been specified.
+  std::unique_ptr<cu::CudaEvent> cuda;
+  std::unique_ptr<cu::SharedEvent> shared;
+
+  // Whether either implementation has been instantiated.
+  bool is_created() const {
+    return cuda != nullptr || shared != nullptr;
+  }
+
+  // Picks and creates the implementation; later calls are no-ops.
+  void ensure_created(Stream s, uint64_t signal_value) {
+    if (is_created()) {
+      return;
+    }
+    const bool needs_shared =
+        s.device == mlx::core::Device::cpu || signal_value > 1;
+    if (!needs_shared) {
+      cuda = std::make_unique<cu::CudaEvent>();
+      return;
+    }
+    nvtx3::mark("Using slow SharedEvent");
+    shared = std::make_unique<cu::SharedEvent>();
+  }
+};
+
+} // namespace
+
+// Creates the type-erased EventImpl; the deleter restores the concrete type
+// before deleting.
+Event::Event(Stream s) : stream_(s) {
+  auto deleter = [](void* ptr) { delete static_cast<EventImpl*>(ptr); };
+  event_ = std::shared_ptr<void>(new EventImpl(), deleter);
+}
+
+// Waits on the host for the event, using whichever implementation was
+// created by a previous signal().
+void Event::wait() {
+  auto* impl = static_cast<EventImpl*>(event_.get());
+  assert(impl->is_created());
+  if (!impl->cuda) {
+    impl->shared->wait(value());
+    return;
+  }
+  assert(value() == 1);
+  impl->cuda->wait();
+}
+
+// Makes stream `s` wait for the event, using whichever implementation was
+// created by a previous signal().
+void Event::wait(Stream s) {
+  auto* impl = static_cast<EventImpl*>(event_.get());
+  assert(impl->is_created());
+  if (!impl->cuda) {
+    impl->shared->wait(s, value());
+    return;
+  }
+  assert(value() == 1);
+  impl->cuda->wait(s);
+}
+
+// Signals the event from stream `s`, creating the implementation on first
+// use.
+void Event::signal(Stream s) {
+  auto* impl = static_cast<EventImpl*>(event_.get());
+  impl->ensure_created(s, value());
+  if (!impl->cuda) {
+    impl->shared->signal(s, value());
+    return;
+  }
+  assert(value() == 1);
+  impl->cuda->record(s);
+}
+
+// Reports whether the event has been signaled. An event whose implementation
+// was never created is not signaled.
+bool Event::is_signaled() const {
+  auto* impl = static_cast<EventImpl*>(event_.get());
+  if (!impl->is_created()) {
+    return false;
+  }
+  if (!impl->cuda) {
+    return impl->shared->is_signaled(value());
+  }
+  assert(value() == 1);
+  // Both conditions are required: the event must have been recorded and the
+  // recorded work must have finished.
+  return impl->cuda->recorded() && impl->cuda->completed();
+}
+
+} // namespace mlx::core
--- a/mlx/backend/cuda/event.h
+++ b/mlx/backend/cuda/event.h
@ -0,0 +1,66 @@
+// Copyright © 2025 Apple Inc.
+
+#pragma once
+
+#include "mlx/stream.h"
+
+#include <cuda_runtime.h>
+#include <cuda/atomic>
+
+#include <memory>
+
+namespace mlx::core::cu {
+
+class CudaEventHandle;
+
+// Wrapper of native cuda event. It can synchronize between GPU streams, or wait
+// on GPU stream in CPU stream, but can not wait on CPU stream.
+class CudaEvent {
+ public:
+  CudaEvent();
+
+  // Wait for the event on the calling host thread.
+  void wait();
+  // Make the given CUDA stream wait for the event.
+  void wait(cudaStream_t stream);
+  // Make the given MLX stream wait for the event.
+  void wait(Stream s);
+  // Record the event on the given CUDA stream.
+  void record(cudaStream_t stream);
+  // Record the event on the given MLX stream.
+  void record(Stream s);
+
+  // Return whether the recorded kernels have completed. Note that this method
+  // returns true if record() has not been called.
+  bool completed() const;
+
+  // Whether record() has been called on this event.
+  bool recorded() const {
+    return recorded_;
+  }
+
+ private:
+  bool recorded_{false};
+  // Held through a shared_ptr, so copies of a CudaEvent refer to the same
+  // native event.
+  std::shared_ptr<CudaEventHandle> event_;
+};
+
+// Event that can synchronize between CPU and GPU. It is much slower than
+// CudaEvent so the latter should always be preferred when possible.
+class SharedEvent {
+ public:
+  // The event state is a single 64-bit atomic counter.
+  using Atomic = cuda::atomic<uint64_t>;
+
+  SharedEvent();
+
+  // Wait on the calling host thread until the counter reaches `value`.
+  void wait(uint64_t value);
+  // Make the given CUDA stream wait until the counter reaches `value`.
+  void wait(cudaStream_t stream, uint64_t value);
+  // Make the given MLX stream wait until the counter reaches `value`.
+  void wait(Stream s, uint64_t value);
+  // Set the counter to `value` from the host.
+  void signal(uint64_t value);
+  // Set the counter to `value` from the given CUDA stream.
+  void signal(cudaStream_t stream, uint64_t value);
+  // Set the counter to `value` in the order of the given MLX stream.
+  void signal(Stream s, uint64_t value);
+  // Whether the counter has reached `value`.
+  bool is_signaled(uint64_t value) const;
+  // Current value of the counter.
+  uint64_t value() const;
+
+  // Access to the shared counter.
+  const std::shared_ptr<Atomic>& atomic() const {
+    return ac_;
+  }
+
+ private:
+  std::shared_ptr<Atomic> ac_;
+};
+
+} // namespace mlx::core::cu
--- a/mlx/backend/cuda/fence.cpp
+++ b/mlx/backend/cuda/fence.cpp
@ -0,0 +1,29 @@
+// Copyright © 2025 Apple Inc.
+
+#include "mlx/fence.h"
+#include "mlx/backend/cuda/event.h"
+
+namespace mlx::core {
+
+// State behind a Fence: a counter paired with the event it is signaled on.
+struct FenceImpl {
+  // Incremented by every Fence::update; the value the event is signaled with.
+  uint32_t count;
+  cu::SharedEvent event;
+};
+
+// Creates the type-erased FenceImpl with its count at 0; the deleter restores
+// the concrete type before deleting.
+Fence::Fence(Stream s) {
+  auto deleter = [](void* ptr) { delete static_cast<FenceImpl*>(ptr); };
+  fence_ = std::shared_ptr<void>(new FenceImpl{0}, deleter);
+}
+
+// Waits until the fence's event has been signaled with the current count.
+// NOTE(review): this uses the SharedEvent::wait(value) overload, which waits
+// on the calling thread, and does not use `s` -- confirm this is intended
+// rather than wait(s, value).
+void Fence::wait(Stream s, const array&) {
+  auto* fence = static_cast<FenceImpl*>(fence_.get());
+  fence->event.wait(fence->count);
+}
+
+// Bumps the fence counter and signals the new value through stream `s`.
+void Fence::update(Stream s, const array&) {
+  auto* impl = static_cast<FenceImpl*>(fence_.get());
+  impl->count++;
+  impl->event.signal(s, impl->count);
+}
+
+} // namespace mlx::core
--- a/mlx/backend/cuda/indexing.cpp
+++ b/mlx/backend/cuda/indexing.cpp
@ -0,0 +1,420 @@
+// Copyright © 2025 Apple Inc.
+
+#include "mlx/backend/common/compiled.h"
+#include "mlx/backend/cuda/device.h"
+#include "mlx/backend/cuda/jit_module.h"
+#include "mlx/backend/gpu/copy.h"
+#include "mlx/dtype_utils.h"
+#include "mlx/primitives.h"
+
+#include "cuda_jit_sources.h"
+
+#include <fmt/format.h>
+#include <nvtx3/nvtx3.hpp>
+
+#include <cassert>
+#include <numeric>
+
+namespace mlx::core {
+
+namespace {
+
+// Operation name suffixes, indexed by the scatter reduce type.
+constexpr const char* g_scatter_ops[] = {"Max", "Min", "Sum", "Prod", "Assign"};
+
+// Appends three kernel arguments describing the index arrays
+// inputs[1..nidx]: their data pointers, then the first `idx_ndim` entries of
+// each shape, then the first `idx_ndim` entries of each stride vector (the
+// latter two flattened back to back).
+void append_indices_arg(
+    cu::JitModule& mod,
+    const std::vector<array>& inputs,
+    int nidx,
+    int idx_ndim) {
+  std::vector<const void*> ptrs(nidx);
+  std::vector<int32_t> flat_shapes(nidx * idx_ndim);
+  std::vector<int64_t> flat_strides(nidx * idx_ndim);
+
+  for (int i = 0; i < nidx; ++i) {
+    const array& idx = inputs[i + 1];
+    const int base = i * idx_ndim;
+    ptrs[i] = idx.data<void>();
+    std::copy_n(idx.shape().begin(), idx_ndim, flat_shapes.data() + base);
+    std::copy_n(idx.strides().begin(), idx_ndim, flat_strides.data() + base);
+  }
+
+  // The append order is part of the contract with the kernel.
+  mod.append_arg(std::move(ptrs));
+  mod.append_arg(std::move(flat_shapes));
+  mod.append_arg(std::move(flat_strides));
+}
+
+} // namespace
+
+// Gather on the GPU through a JIT-compiled kernel. inputs[0] is the source
+// array and inputs[1..] are the index arrays; the result is written to `out`.
+void Gather::eval_gpu(const std::vector<array>& inputs, array& out) {
+  nvtx3::scoped_range r("Gather::eval_gpu");
+  assert(inputs.size() > 0);
+  const auto& src = inputs[0];
+
+  // The output is allocated even when it is empty; only the launch is skipped.
+  out.set_data(allocator::malloc(out.nbytes()));
+  if (out.size() == 0) {
+    return;
+  }
+
+  // Without index arrays fall back to int32 indices of rank 0.
+  int nidx = inputs.size() - 1;
+  Dtype idx_dtype = nidx > 0 ? inputs[1].dtype() : int32;
+  int32_t idx_ndim = nidx > 0 ? inputs[1].ndim() : 0;
+
+  // Use 64-bit indexing in the kernel when any involved size exceeds int32.
+  bool large = (nidx > 0 && inputs[1].size() > INT32_MAX) ||
+      (src.size() > INT32_MAX) || (out.size() > INT32_MAX);
+
+  // Number of elements in one gathered slice.
+  uint32_t slice_size = std::accumulate(
+      slice_sizes_.begin(), slice_sizes_.end(), 1, std::multiplies<uint32_t>());
+
+  // One module per (output dtype, index dtype, number of index arrays).
+  std::string module_name = fmt::format(
+      "gather_{}_{}_{}",
+      dtype_to_string(out.dtype()),
+      dtype_to_string(idx_dtype),
+      nidx);
+
+  auto& s = stream();
+  cu::JitModule& mod = cu::get_jit_module(s.device, module_name, [&]() {
+    // List every kernel instantiation the module provides: all index ranks
+    // from 0 to MAX_NDIM, each with 32-bit and 64-bit indexing.
+    std::vector<std::string> kernel_names;
+    for (int ndim = 0; ndim <= MAX_NDIM; ++ndim) {
+      for (int large = 0; large <= 1; ++large) {
+        kernel_names.push_back(fmt::format(
+            "mlx::core::cu::gather<{}, {}, {}, {}, {}>",
+            dtype_to_cuda_type(out.dtype()),
+            dtype_to_cuda_type(idx_dtype),
+            nidx,
+            ndim,
+            large ? "int64_t" : "int32_t"));
+      }
+    }
+    return std::make_pair(jit_source_gather, std::move(kernel_names));
+  });
+
+  // Kernel arguments; presumably the order must match the parameter list of
+  // the gather kernel in the JIT source -- confirm when editing.
+  mod.append_arg(src);
+  mod.append_arg(out);
+  if (large) {
+    mod.append_arg<int64_t>(out.size());
+  } else {
+    mod.append_arg<int32_t>(out.size());
+  }
+  mod.append_ndim_arg(src.shape());
+  mod.append_ndim_arg(src.strides());
+  mod.append_arg<int32_t>(src.ndim());
+  mod.append_ndim_arg(slice_sizes_);
+  mod.append_arg(slice_size);
+  mod.append_arg(axes_);
+  append_indices_arg(mod, inputs, nidx, idx_ndim);
+
+  // Select the instantiation that matches this call.
+  std::string kernel_name = fmt::format(
+      "mlx::core::cu::gather<{}, {}, {}, {}, {}>",
+      dtype_to_cuda_type(out.dtype()),
+      dtype_to_cuda_type(idx_dtype),
+      nidx,
+      idx_ndim,
+      large ? "int64_t" : "int32_t");
+
+  auto& encoder = cu::get_command_encoder(s);
+  for (const auto& in : inputs) {
+    encoder.set_input_array(in);
+  }
+  encoder.set_output_array(out);
+  encoder.launch_kernel([&](cudaStream_t stream) {
+    mod.launch_kernel(stream, kernel_name, out, large);
+  });
+}
+
+// Scatter on the GPU through a JIT-compiled kernel. inputs[0] is the source
+// array, inputs[1..] are the index arrays and inputs.back() holds the
+// updates. The source is first copied into `out`, then the updates are
+// applied to `out` with the configured reduce operation.
+void Scatter::eval_gpu(const std::vector<array>& inputs, array& out) {
+  // Profiling range named after this primitive.
+  nvtx3::scoped_range r("Scatter::eval_gpu");
+  assert(inputs.size() > 1);
+  auto& upd = inputs.back();
+
+  // Copy src into out.
+  CopyType copy_type;
+  if (inputs[0].data_size() == 1) {
+    copy_type = CopyType::Scalar;
+  } else if (inputs[0].flags().row_contiguous) {
+    copy_type = CopyType::Vector;
+  } else {
+    copy_type = CopyType::General;
+  }
+  copy_gpu(inputs[0], out, copy_type);
+
+  // Empty update.
+  if (upd.size() == 0) {
+    return;
+  }
+
+  // Without index arrays fall back to int32 indices of rank 0.
+  int nidx = axes_.size();
+  Dtype idx_dtype = nidx > 0 ? inputs[1].dtype() : int32;
+  int32_t idx_ndim = nidx > 0 ? inputs[1].ndim() : 0;
+
+  // Use 64-bit indexing in the kernel when any involved size exceeds int32.
+  bool large = (nidx > 0 && inputs[1].size() > INT32_MAX) ||
+      (upd.size() > INT32_MAX) || (out.size() > INT32_MAX);
+
+  // Number of update elements that follow the index dimensions. Accumulated
+  // in 64 bits so the product cannot overflow on the `large` path, where it
+  // is passed to the kernel as int64_t.
+  int64_t upd_post_idx_size = std::accumulate(
+      upd.shape().begin() + idx_ndim,
+      upd.shape().end(),
+      int64_t{1},
+      std::multiplies<int64_t>());
+
+  // One module per (output dtype, index dtype, reduce op, number of index
+  // arrays).
+  const char* op = g_scatter_ops[reduce_type_];
+  std::string module_name = fmt::format(
+      "scatter_{}_{}_{}_{}",
+      dtype_to_string(out.dtype()),
+      dtype_to_string(idx_dtype),
+      op,
+      nidx);
+
+  auto& s = stream();
+  cu::JitModule& mod = cu::get_jit_module(s.device, module_name, [&]() {
+    // List every kernel instantiation the module provides: all index ranks
+    // from 0 to MAX_NDIM, each with 32-bit and 64-bit indexing.
+    std::vector<std::string> kernel_names;
+    for (int ndim = 0; ndim <= MAX_NDIM; ++ndim) {
+      for (int large = 0; large <= 1; ++large) {
+        kernel_names.push_back(fmt::format(
+            "mlx::core::cu::scatter<{}, {}, mlx::core::cu::Scatter{}, {}, {}, {}>",
+            dtype_to_cuda_type(out.dtype()),
+            dtype_to_cuda_type(idx_dtype),
+            op,
+            nidx,
+            ndim,
+            large ? "int64_t" : "int32_t"));
+      }
+    }
+    return std::make_pair(jit_source_scatter, std::move(kernel_names));
+  });
+
+  // Kernel arguments; presumably the order must match the parameter list of
+  // the scatter kernel in the JIT source -- confirm when editing.
+  mod.append_arg(upd);
+  mod.append_arg(out);
+  if (large) {
+    mod.append_arg<int64_t>(upd.size());
+  } else {
+    mod.append_arg<int32_t>(upd.size());
+  }
+  mod.append_ndim_arg(upd.shape());
+  mod.append_ndim_arg(upd.strides());
+  mod.append_arg<int32_t>(upd.ndim());
+  if (large) {
+    mod.append_arg<int64_t>(upd_post_idx_size);
+  } else {
+    mod.append_arg<int32_t>(upd_post_idx_size);
+  }
+  mod.append_ndim_arg(out.shape());
+  mod.append_ndim_arg(out.strides());
+  mod.append_arg<int32_t>(out.ndim());
+  mod.append_arg(axes_);
+  append_indices_arg(mod, inputs, nidx, idx_ndim);
+
+  // Select the instantiation that matches this call.
+  std::string kernel_name = fmt::format(
+      "mlx::core::cu::scatter<{}, {}, mlx::core::cu::Scatter{}, {}, {}, {}>",
+      dtype_to_cuda_type(out.dtype()),
+      dtype_to_cuda_type(idx_dtype),
+      op,
+      nidx,
+      idx_ndim,
+      large ? "int64_t" : "int32_t");
+
+  auto& encoder = cu::get_command_encoder(s);
+  for (const auto& in : inputs) {
+    encoder.set_input_array(in);
+  }
+  encoder.set_output_array(out);
+  encoder.launch_kernel([&](cudaStream_t stream) {
+    mod.launch_kernel(stream, kernel_name, upd, large);
+  });
+}
+
+// Gather along a single axis on the GPU through a JIT-compiled kernel.
+// inputs[0] is the source array and inputs[1] the index array; the result is
+// written to `out`.
+void GatherAxis::eval_gpu(const std::vector<array>& inputs, array& out) {
+  nvtx3::scoped_range r("GatherAxis::eval_gpu");
+  assert(inputs.size() > 1);
+  const auto& src = inputs[0];
+  const auto& idx = inputs[1];
+
+  // The output is allocated even when it is empty; only the launch is skipped.
+  out.set_data(allocator::malloc(out.nbytes()));
+  if (out.size() == 0) {
+    return;
+  }
+
+  // Use 64-bit indexing in the kernel when a size exceeds int32.
+  bool large = idx.size() > INT32_MAX || src.size() > INT32_MAX;
+
+  // One module per (output dtype, index dtype).
+  std::string module_name = fmt::format(
+      "gather_axis_{}_{}",
+      dtype_to_string(out.dtype()),
+      dtype_to_string(idx.dtype()));
+
+  auto& s = stream();
+  cu::JitModule& mod = cu::get_jit_module(s.device, module_name, [&]() {
+    // List every kernel instantiation the module provides: all ranks from 0
+    // to MAX_NDIM, the four combinations of the two contiguity flags (bits 0
+    // and 1 of `contiguous`), and both index widths.
+    std::vector<std::string> kernel_names;
+    for (int ndim = 0; ndim <= MAX_NDIM; ++ndim) {
+      for (int contiguous = 0; contiguous < 4; ++contiguous) {
+        for (int large = 0; large <= 1; ++large) {
+          kernel_names.push_back(fmt::format(
+              "mlx::core::cu::gather_axis<{}, {}, {}, {}, {}, {}>",
+              dtype_to_cuda_type(out.dtype()),
+              dtype_to_cuda_type(idx.dtype()),
+              ndim,
+              contiguous & 1 ? true : false,
+              contiguous & 2 ? true : false,
+              large ? "int64_t" : "int32_t"));
+        }
+      }
+    }
+    return std::make_pair(jit_source_gather_axis, std::move(kernel_names));
+  });
+
+  // Split the index shape around the axis: product of the dimensions before
+  // it, the axis extent itself, and product of the dimensions after it.
+  size_t idx_size_pre = 1;
+  size_t idx_size_post = 1;
+  for (int i = 0; i < axis_; ++i) {
+    idx_size_pre *= idx.shape(i);
+  }
+  for (int i = axis_ + 1; i < idx.ndim(); ++i) {
+    idx_size_post *= idx.shape(i);
+  }
+  size_t idx_size_axis = idx.shape(axis_);
+
+  // Kernel arguments; presumably the order must match the parameter list of
+  // the gather_axis kernel in the JIT source -- confirm when editing.
+  mod.append_arg(src);
+  mod.append_arg(idx);
+  mod.append_arg(out);
+  if (large) {
+    mod.append_arg<int64_t>(idx_size_pre);
+    mod.append_arg<int64_t>(idx_size_axis);
+    mod.append_arg<int64_t>(idx_size_post);
+  } else {
+    mod.append_arg<int32_t>(idx_size_pre);
+    mod.append_arg<int32_t>(idx_size_axis);
+    mod.append_arg<int32_t>(idx_size_post);
+  }
+  // Shape/strides with the gathered axis removed, followed by the axis
+  // number and the per-axis extent and strides.
+  mod.append_arg(remove_index(idx.shape(), axis_));
+  mod.append_arg(remove_index(src.strides(), axis_));
+  mod.append_arg(remove_index(idx.strides(), axis_));
+  mod.append_arg<int32_t>(axis_);
+  mod.append_arg(src.shape(axis_));
+  mod.append_arg(src.strides(axis_));
+  mod.append_arg(idx.strides(axis_));
+
+  // Select the instantiation that matches this call; the rank template
+  // argument excludes the gathered axis.
+  std::string kernel_name = fmt::format(
+      "mlx::core::cu::gather_axis<{}, {}, {}, {}, {}, {}>",
+      dtype_to_cuda_type(out.dtype()),
+      dtype_to_cuda_type(idx.dtype()),
+      src.ndim() - 1,
+      src.flags().row_contiguous,
+      idx.flags().row_contiguous,
+      large ? "int64_t" : "int32_t");
+
+  auto& encoder = cu::get_command_encoder(s);
+  for (const auto& in : inputs) {
+    encoder.set_input_array(in);
+  }
+  encoder.set_output_array(out);
+  encoder.launch_kernel([&](cudaStream_t stream) {
+    mod.launch_kernel(stream, kernel_name, idx, large);
+  });
+}
+
+// Scatter along a single axis on the GPU through a JIT-compiled kernel.
+// inputs[0] is the source array, inputs[1] the index array and inputs[2] the
+// updates. The source is first copied into `out`, then the updates are
+// applied to `out` with either a sum or an assignment.
+void ScatterAxis::eval_gpu(const std::vector<array>& inputs, array& out) {
+  nvtx3::scoped_range r("ScatterAxis::eval_gpu");
+  assert(inputs.size() > 2);
+  const auto& src = inputs[0];
+  const auto& idx = inputs[1];
+  const auto& upd = inputs[2];
+
+  // Copy src into out.
+  CopyType copy_type;
+  if (src.data_size() == 1) {
+    copy_type = CopyType::Scalar;
+  } else if (src.flags().row_contiguous) {
+    copy_type = CopyType::Vector;
+  } else {
+    copy_type = CopyType::General;
+  }
+  copy_gpu(src, out, copy_type);
+
+  // Empty update.
+  if (upd.size() == 0) {
+    return;
+  }
+
+  // Use 64-bit indexing in the kernel when a size exceeds int32.
+  bool large = idx.size() > INT32_MAX || src.size() > INT32_MAX;
+
+  // Any reduce type other than Sum is treated as a plain assignment.
+  const char* op = reduce_type_ == ScatterAxis::Sum ? "Sum" : "Assign";
+  // One module per (output dtype, index dtype, reduce op).
+  std::string module_name = fmt::format(
+      "scatter_axis_{}_{}_{}",
+      dtype_to_string(out.dtype()),
+      dtype_to_string(idx.dtype()),
+      op);
+
+  auto& s = stream();
+  cu::JitModule& mod = cu::get_jit_module(s.device, module_name, [&]() {
+    // List every kernel instantiation the module provides: all ranks from 0
+    // to MAX_NDIM, the four combinations of the two contiguity flags (bits 0
+    // and 1 of `contiguous`), and both index widths.
+    std::vector<std::string> kernel_names;
+    for (int ndim = 0; ndim <= MAX_NDIM; ++ndim) {
+      for (int contiguous = 0; contiguous < 4; ++contiguous) {
+        for (int large = 0; large <= 1; ++large) {
+          kernel_names.push_back(fmt::format(
+              "mlx::core::cu::scatter_axis<{}, {}, mlx::core::cu::Scatter{}, {}, {}, {}, {}>",
+              dtype_to_cuda_type(out.dtype()),
+              dtype_to_cuda_type(idx.dtype()),
+              op,
+              ndim,
+              contiguous & 1 ? true : false,
+              contiguous & 2 ? true : false,
+              large ? "int64_t" : "int32_t"));
+        }
+      }
+    }
+    return std::make_pair(jit_source_scatter_axis, std::move(kernel_names));
+  });
+
+  // Split the index shape around the axis: product of the dimensions before
+  // it, the axis extent itself, and product of the dimensions after it.
+  size_t idx_size_pre = 1;
+  size_t idx_size_post = 1;
+  for (int i = 0; i < axis_; ++i) {
+    idx_size_pre *= idx.shape(i);
+  }
+  for (int i = axis_ + 1; i < idx.ndim(); ++i) {
+    idx_size_post *= idx.shape(i);
+  }
+  size_t idx_size_axis = idx.shape(axis_);
+
+  // Kernel arguments; presumably the order must match the parameter list of
+  // the scatter_axis kernel in the JIT source -- confirm when editing.
+  mod.append_arg(upd);
+  mod.append_arg(idx);
+  mod.append_arg(out);
+  if (large) {
+    mod.append_arg<int64_t>(idx_size_pre);
+    mod.append_arg<int64_t>(idx_size_axis);
+    mod.append_arg<int64_t>(idx_size_post);
+  } else {
+    mod.append_arg<int32_t>(idx_size_pre);
+    mod.append_arg<int32_t>(idx_size_axis);
+    mod.append_arg<int32_t>(idx_size_post);
+  }
+  // Shape/strides with the scattered axis removed, followed by the axis
+  // number and the per-axis extent and strides.
+  mod.append_arg(remove_index(idx.shape(), axis_));
+  mod.append_arg(remove_index(upd.strides(), axis_));
+  mod.append_arg(remove_index(idx.strides(), axis_));
+  mod.append_arg<int32_t>(axis_);
+  mod.append_arg(out.shape(axis_));
+  mod.append_arg(upd.strides(axis_));
+  mod.append_arg(idx.strides(axis_));
+
+  // Select the instantiation that matches this call; the rank template
+  // argument excludes the scattered axis.
+  std::string kernel_name = fmt::format(
+      "mlx::core::cu::scatter_axis<{}, {}, mlx::core::cu::Scatter{}, {}, {}, {}, {}>",
+      dtype_to_cuda_type(out.dtype()),
+      dtype_to_cuda_type(idx.dtype()),
+      op,
+      idx.ndim() - 1,
+      upd.flags().row_contiguous,
+      idx.flags().row_contiguous,
+      large ? "int64_t" : "int32_t");
+
+  auto& encoder = cu::get_command_encoder(s);
+  for (const auto& in : inputs) {
+    encoder.set_input_array(in);
+  }
+  encoder.set_output_array(out);
+  encoder.launch_kernel([&](cudaStream_t stream) {
+    mod.launch_kernel(stream, kernel_name, idx, large);
+  });
+}
+
+} // namespace mlx::core
--- a/mlx/backend/cuda/iterators/general_iterator.cuh
+++ b/mlx/backend/cuda/iterators/general_iterator.cuh
@ -0,0 +1,121 @@
+// Copyright © 2025 Apple Inc.
+
+#pragma once
+
+#include <thrust/iterator/iterator_adaptor.h>
+#include <cuda/std/utility>
+
+#include "mlx/backend/cuda/kernel_utils.cuh"
+
+namespace mlx::core::cu {
+
+// Iterates a non-contiguous array via an element index plus shape/strides.
+template <typename Iterator, typename IdxT = int64_t>
+class general_iterator
+    : public thrust::
+          iterator_adaptor<general_iterator<Iterator, IdxT>, Iterator> {
+ public:
+  using super_t =
+      thrust::iterator_adaptor<general_iterator<Iterator, IdxT>, Iterator>;
+
+  using reference = typename super_t::reference;
+  using difference_type = typename super_t::difference_type;
+
+  __host__ __device__ general_iterator(
+      Iterator it, // base iterator; this class only offsets it on dereference
+      IdxT index, // starting element index
+      int ndim, // forwarded to elem_to_loc on dereference
+      Shape shape,
+      Strides strides)
+      : super_t(it),
+        index_(index),
+        ndim_(ndim),
+        shape_(cuda::std::move(shape)),
+        strides_(cuda::std::move(strides)) {}
+
+  __host__ __device__ IdxT index() const {
+    return index_;
+  }
+
+  __host__ __device__ const Shape& shape() const {
+    return shape_;
+  }
+
+  __host__ __device__ const Strides& strides() const {
+    return strides_;
+  }
+
+ private:
+  friend class thrust::iterator_core_access; // grants access to the hooks below
+
+  __host__ __device__ bool equal(const general_iterator& other) const {
+    return this->base() == other.base() && this->index() == other.index();
+  }
+
+  __host__ __device__ void advance(difference_type n) {
+    this->index_ += n; // only the index moves; the base is left untouched
+  }
+
+  __host__ __device__ void increment() {
+    this->index_ += 1;
+  }
+
+  __host__ __device__ void decrement() {
+    this->index_ -= 1;
+  }
+
+  __host__ __device__ difference_type
+  distance_to(const general_iterator& other) const {
+    _CCCL_ASSERT(
+        this->base() == other.base(),
+        "Underlying iterator must point to same base iterator");
+    return other.index() - this->index(); // difference of element indices
+  }
+
+  // dereference is device-only so it cannot accidentally be run on the host.
+  __device__ typename super_t::reference dereference() const {
+    IdxT offset = elem_to_loc(index_, shape_.data(), strides_.data(), ndim_);
+    return *(this->base() + offset); // offset is recomputed on every access
+  }
+
+  IdxT index_; // element index; the only state changed by iteration
+  int ndim_; // passed to elem_to_loc as the dimension count
+  Shape shape_; // passed to elem_to_loc as shape_.data()
+  Strides strides_; // passed to elem_to_loc as strides_.data()
+};
+
+template <typename IdxT, typename Iterator> // IdxT first so callers can name it
+__host__ __device__ auto make_general_iterator(
+    Iterator it,
+    IdxT index,
+    int ndim,
+    Shape shape,
+    Strides strides) {
+  return general_iterator<Iterator, IdxT>( // forwards straight to the ctor
+      it, index, ndim, cuda::std::move(shape), cuda::std::move(strides));
+}
+
+template <typename IdxT, typename Iterator> // overload taking std::vector args
+auto make_general_iterator(
+    Iterator it,
+    const std::vector<int32_t>& shape,
+    const std::vector<int64_t>& strides) {
+  return make_general_iterator<IdxT>( // index 0, ndim taken from shape.size()
+      it, 0, shape.size(), const_param(shape), const_param(strides));
+}
+
+template <typename IdxT, typename Iterator> // returns a pair of iterators
+auto make_general_iterators(
+    Iterator it,
+    IdxT size, // element index used for the second iterator of the pair
+    const std::vector<int32_t>& shape,
+    const std::vector<int64_t>& strides) {
+  auto ndim = shape.size();
+  auto shape_arg = const_param(shape); // converted once, shared by both
+  auto strides_arg = const_param(strides);
+  return std::make_pair(
+      make_general_iterator<IdxT>(it, 0, ndim, shape_arg, strides_arg),
+      make_general_iterator<IdxT>(it, size, ndim, shape_arg, strides_arg));
+}
+
+} // namespace mlx::core::cu
--- a/mlx/backend/cuda/iterators/strided_iterator.cuh
+++ b/mlx/backend/cuda/iterators/strided_iterator.cuh
@ -0,0 +1,60 @@
+// Copyright © 2025 Apple Inc.
+
+#pragma once
+
+#include <thrust/iterator/iterator_adaptor.h>
+#include <thrust/iterator/iterator_facade.h>
+
+namespace mlx::core::cu {
+
+// Iterator adaptor that moves its base iterator by `stride` per step.
+template <typename Iterator, typename Stride = int64_t>
+class strided_iterator
+    : public thrust::
+          iterator_adaptor<strided_iterator<Iterator, Stride>, Iterator> {
+ public:
+  using super_t =
+      thrust::iterator_adaptor<strided_iterator<Iterator, Stride>, Iterator>;
+
+  using reference = typename super_t::reference;
+  using difference_type = typename super_t::difference_type;
+
+  __host__ __device__ strided_iterator(Iterator it, Stride stride)
+      : super_t(it), stride_(stride) {}
+
+  __host__ __device__ Stride stride() const {
+    return stride_;
+  }
+
+ private:
+  friend class thrust::iterator_core_access; // grants access to the hooks below
+
+  __host__ __device__ bool equal(const strided_iterator& other) const {
+    return this->base() == other.base(); // stride is not part of the comparison
+  }
+
+  __host__ __device__ void advance(difference_type n) {
+    this->base_reference() += n * stride_; // n logical steps = n * stride_
+  }
+
+  __host__ __device__ void increment() {
+    this->base_reference() += stride_;
+  }
+
+  __host__ __device__ void decrement() {
+    this->base_reference() -= stride_;
+  }
+
+  __host__ __device__ difference_type
+  distance_to(const strided_iterator& other) const {
+    const difference_type dist = other.base() - this->base(); // base distance
+    _CCCL_ASSERT(
+        dist % stride() == 0,
+        "Underlying iterator difference must be divisible by the stride");
+    return dist / stride(); // NOTE(review): divides by zero if stride() == 0
+  }
+
+  Stride stride_; // fixed at construction; no member modifies it
+};
+
+} // namespace mlx::core::cu
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Angelos Katharopoulos	5adf185f86	Fix `update_modules()` when providing a subset (#2308 )	2025-06-20 17:19:46 -07:00
Awni Hannun	c9a9180584	Cuda perf tuning (#2307 ) * perf tuning * fix adding inputs arrays in matmul / srot * format * fix	2025-06-20 14:50:57 -07:00
Awni Hannun	76831ed83d	Build CUDA release in Circle (#2306 ) * cuda release * add license	2025-06-19 15:26:36 -07:00
Angelos Katharopoulos	b3d7b85376	Make ptx cache settable by environment variable (#2304 )	2025-06-17 23:55:56 -07:00
Awni Hannun	cad5c0241c	[CUDA] synch properly waits for all tasks to finish and clear (#2303 ) * cuda synch properly waits for all tasks to finish and clear * fix copy	2025-06-17 12:03:25 -07:00
Awni Hannun	b8022c578a	divmod, partition, sort fixes (#2302 )	2025-06-16 18:49:32 -07:00
Awni Hannun	bc53f8293f	Cuda bug fixes 2 (#2298 ) * more bug fixes * more bug fixes * format	2025-06-16 13:14:46 -07:00
Awni Hannun	c552ff2451	[CUDA] Fix back-end bugs and enable corresponding tests (#2296 ) * Fix some cuda back-end bugs and enable corresponding tests * more fixes * enable more tests * format	2025-06-16 08:45:40 -07:00
Awni Hannun	4fda5fbdf9	add python testing for cuda with ability to skip list of tests (#2295 )	2025-06-15 10:56:48 -07:00
Angelos Katharopoulos	580776559b	RoPE for CUDA (#2293 ) * First working CUDA rope * Fix random	2025-06-15 06:08:07 -07:00
Awni Hannun	a14aaa7c9d	Fix cuda arg reduce (#2291 )	2025-06-14 17:54:00 -07:00
Awni Hannun	a6d780154f	fix cuda gemm for bf16 (#2288 )	2025-06-13 22:10:46 -07:00
Awni Hannun	6871e2eeb7	fix cuda jit (#2287 )	2025-06-13 19:21:46 -07:00
Awni Hannun	8402a2acf4	Fix complex power and print (#2286 ) * fix complex power and print * fix complex matmul shape	2025-06-13 11:13:00 -07:00
Jagrit Digani	fddb6933e1	Collection of refactors (#2274 ) * Refactor gemv into a function * Refactor splitk step 1 * Refactor split k axpby * Rearrange steel_gemm_regular * Redirect steel_gemm_regular * Add axpby routing to steel_matmul_regular * Refactor AddMM step 1 * Redirect steel_gemm * Update addmm * Comments and format * Some cleanup * Add architecture gen to device * Update no copy condition in normalization to account for axis size 1	2025-06-13 10:44:56 -07:00
Cheng	c8b4787e4e	CUDA backend: indexing ops (#2277 )	2025-06-12 21:44:19 -07:00
Awni Hannun	2188199ff8	[CUDA] ternary with select op (#2283 ) * cuda ternary with select op * comment + fix * fix	2025-06-12 20:24:43 -07:00
Awni Hannun	aa07429bad	Fix cuda build (#2284 )	2025-06-12 17:48:05 -07:00
Awni Hannun	918761a25a	[CUDA] RMSNorm and VJP (#2280 ) * rms norm start * nit	2025-06-12 17:09:49 -07:00
Cheng	a4fc671d3e	CUDA backend: compile (#2276 ) * CUDA backend: compile * Rename kernels/ to device/	2025-06-12 17:08:39 -07:00
Awni Hannun	f5f65ef48c	Make sliceUpdate general (#2282 ) * Make sliceUpdate general * fix	2025-06-12 16:48:54 -07:00
Cheng	c2dd81a8aa	Fix warnings from latest CUDA toolkit (#2275 )	2025-06-12 06:03:01 -07:00
Cheng	d7e680ffe4	CUDA backend: layernorm (#2271 )	2025-06-11 15:48:32 -07:00
Cheng	c371baf53a	CUDA backend: softmax (#2272 )	2025-06-11 13:55:22 -07:00
Cheng	ccf78f566c	CUDA backend: argreduce (#2270 )	2025-06-11 13:26:17 -07:00
Cheng	c9fa68664a	CUDA backend: reduce (#2269 )	2025-06-11 11:22:25 -07:00
Awni Hannun	c35f4d089a	start cuda circle config (#2256 ) * rebase * fix metal kernel linking issue on cuda * start cuda circle config	2025-06-10 21:19:47 -07:00
Angelos Katharopoulos	8590c0941e	Add load_safe to the general conv loaders (#2258 )	2025-06-10 20:58:16 -07:00
Cheng	095163b8d1	Fix building cpp benchmarks on Linux (#2268 )	2025-06-10 17:10:24 -07:00
Cheng	99c33d011d	rebase + nit (#2260 ) Co-authored-by: Awni Hannun <awni@apple.com>	2025-06-10 10:51:51 -07:00
Awni Hannun	62fecf3e13	fix conv export (#2265 )	2025-06-10 09:34:01 -07:00
Cheng	7c4eb5d03e	CUDA backend: random (#2261 )	2025-06-10 08:59:56 -07:00
Cheng	bae9a6b404	CUDA backend: sort (#2262 ) Co-authored-by: Awni Hannun <awni@apple.com>	2025-06-10 08:59:47 -07:00
Christopher Fleetwood	004c1d8ef2	Report number of missing parameters (#2264 ) * chore: inform * chore: format --------- Co-authored-by: FL33TW00D <FL33TW00D@users.noreply.github.com>	2025-06-10 06:37:50 -07:00
Cheng	7ebb2e0193	CUDA backend: binary ops (#2259 )	2025-06-10 06:37:40 -07:00
Awni Hannun	9ce77798b1	fix export to work with gather/scatter axis (#2263 )	2025-06-09 20:37:27 -07:00
Cheng	f8bad60609	CUDA backend: unary ops (#2158 )	2025-06-09 06:45:08 -07:00
Emmanuel Ferdman	5866b3857b	Refactor the lu test (#2250 ) Signed-off-by: Emmanuel Ferdman <emmanuelferdman@gmail.com>	2025-06-07 06:12:08 -07:00
Awni Hannun	1ca616844b	Fix unintuitive metal kernel caching (#2242 ) * Fix unintuitive metal kernel caching * alternative solution	2025-06-06 20:08:15 -07:00
Angelos Katharopoulos	2e8cf0b450	Change layernorms to two pass algorithm (#2246 )	2025-06-06 13:34:56 -07:00
Cheng	24f89173d1	CUDA backend: matmul (#2241 )	2025-06-06 12:24:04 -07:00
Awni Hannun	c6a20b427a	Improve metal elementwise kernels (#2247 ) * improve metal elementwise kernels * compile and copy * fix jit	2025-06-06 11:37:40 -07:00
Awni Hannun	a5ac9244c4	fix linux linking error (#2248 )	2025-06-06 10:41:51 -07:00
Awni Hannun	c763fe1be0	default strict mode for module update and update_modules (#2239 )	2025-06-05 15:27:02 -07:00
Cheng	52dc8c8cd5	Add profiler annotations in common primitives for CUDA backend (#2244 )	2025-06-04 19:55:12 -07:00
Angelos Katharopoulos	aede70e81d	Perf regression fix (#2243 )	2025-06-03 17:55:12 -07:00
Cheng	85a8beb5e4	Avoid atomic updates across CPU/GPU in CUDA event (#2231 )	2025-06-03 16:49:06 -07:00
Cheng	0bb89e9e5f	Share more common code in Compiled (#2240 ) * Share more common code in Compiled * Remove build_lib_name	2025-06-03 16:48:50 -07:00
Cheng	5685ceb3c7	Avoid invoking allocator::malloc when creating CUDA event (#2232 )	2025-06-03 16:48:40 -07:00
Suryash Malviya	0408ba0a76	Optimizing Complex Matrix Multiplication using Karatsuba’s Algorithm (#2220 ) * Implementing Complex Matmul using Karatsuba Algorithm * Implemented Karatsuba's Algorithm for complex matmul and pre-commit them * fix --------- Co-authored-by: Awni Hannun <awni@apple.com>	2025-06-02 15:58:46 -07:00
Awni Hannun	cbad6c3093	version (#2237 )	2025-06-02 15:58:33 -07:00
Cheng	1b021f6984	Fast primitives decide when to use the fallback (#2216 )	2025-06-02 13:26:37 -07:00
Cheng	95b7551d65	Do not check event.is_signaled() in eval_impl (#2230 )	2025-06-02 13:23:34 -07:00
Cheng	db5a7c6192	Add memory cache to CUDA backend (#2221 ) * Move BufferCache out of allocator * Add memory cache to cuda backend allocator * Simplify BufferCache assuming buf can not be null	2025-05-30 12:12:54 -07:00
Awni Hannun	6ef2f67e7f	5bit quants (#2226 ) * 5bit quants * 5bit quants	2025-05-30 12:12:10 -07:00
Cheng	f76ee1ffd2	Move some dims utils to common (#2223 )	2025-05-29 06:48:30 -07:00
Cheng	54a71f270a	Remove unused defines (#2217 )	2025-05-23 06:14:58 -07:00
Awni Hannun	55b4062dd8	copyright in docs (#2214 )	2025-05-21 17:13:04 -07:00
Cheng	79071bfba4	Fix out-of-bounds default value in logsumexp/softmax (#2213 )	2025-05-21 07:25:16 -07:00
Cheng	7774b87cbd	Remove redundant simd_sum in logsumexp (#2210 )	2025-05-21 07:25:03 -07:00
Cheng	35c87741cf	Build for compute capability 70 instead of 75 (#2209 )	2025-05-20 19:42:48 -07:00
Jack Wind	4cbe605214	Feat: Allow per-target Metal debug flags (#2201 ) * feat: allow per-target Metal debug flags * formatting fix	2025-05-20 10:22:26 -07:00
Clement Liaw	ab8883dd55	include mlx::core::version() symbols in the mlx static library (#2207 )	2025-05-20 07:39:11 -07:00
Awni Hannun	eebe73001a	fix large arg reduce (#2206 )	2025-05-19 13:10:44 -07:00
Angelos Katharopoulos	0359bf02c9	Nearest upsample (#2202 )	2025-05-19 11:23:38 -07:00
Cheng	237f9e58a8	Fix BEFORE keyword in target_include_directories (#2204 )	2025-05-19 06:10:44 -07:00
Awni Hannun	8576e6fe36	fix conv2d bug + faster conv 1d (#2195 ) * fix conv2d bug + faster conv 1d * revert sort + flaky test	2025-05-18 06:05:11 -07:00
Angelos Katharopoulos	0654543dcc	Add complex eigh (#2191 )	2025-05-18 00:18:43 -07:00
Awni Hannun	48ef3e74e2	reduce vjp for all and any (#2193 )	2025-05-16 08:38:49 -07:00
Cheng	7d4b378952	Include cuda_bf16.h for bfloat16 overloads (#2192 ) * Include cuda_bf16.h for bfloat16 overloads * Add NO_GPU_MULTI(Eig) in cuda backend	2025-05-16 06:44:42 -07:00
Jack Wind	7ff5c41e06	Add set_threadgroup_memory_length to CommandEncoder (#2183 )	2025-05-16 00:28:03 -07:00
Awni Hannun	602f43e3d1	fix conv grad (#2187 )	2025-05-15 19:20:36 -07:00
Awni Hannun	a2cadb8218	real and imag properties (#2189 )	2025-05-15 18:17:50 -07:00
Awni Hannun	c1eb9d05d9	non-symmetric eig and eigh (#2188 )	2025-05-15 13:01:44 -07:00
Angelos Katharopoulos	cf6c939e86	Fix some complex vjps (#2178 )	2025-05-14 23:37:12 -07:00
Angelos Katharopoulos	130df35e1b	Add random normal distribution for complex numbers (#2182 )	2025-05-13 22:43:45 -07:00
Cheng	0751263dec	Fix typo in row_reduce_small (#2179 )	2025-05-13 20:19:54 -07:00
Cheng	eca2f3eb97	Add remove_index utility (#2173 )	2025-05-13 17:09:56 -07:00
Angelos Katharopoulos	3aa9cf3f9e	Fix put_along_axis for empty arrays (#2181 )	2025-05-13 14:27:53 -07:00
Awni Hannun	8f3d208dce	Close a couple edge case bugs: hadamard and addmm on empty inputs (#2177 ) * handle hadamard and addmm on empty inputs * fix	2025-05-12 10:48:57 -07:00
Ivan Fioravanti	caaa3f1f8c	Small typos in mx.metal deprecations (#2176 )	2025-05-11 06:03:47 -07:00
Awni Hannun	659a51919f	patch bump (#2162 )	2025-05-09 14:35:14 -07:00
Awni Hannun	6661387066	Fix fft for integer overflow (#2161 )	2025-05-09 14:25:12 -07:00
ATurker	a7fae8a176	fix: conv_general differences between gpu, cpu (#2070 ) * fix general_conv padding * fix bugs * add test --------- Co-authored-by: Awni Hannun <awni@apple.com>	2025-05-09 10:26:52 -07:00
Cheng	0cae0bdac8	CUDA backend: backbone (#2075 )	2025-05-06 21:26:46 -07:00
Awni Hannun	5a1a5d5ed1	fix input coherent kernel launch (#2153 )	2025-05-05 17:30:50 -07:00
Cheng	1683975acf	Move common gpu primitives to backend/gpu (#2145 )	2025-05-05 13:45:29 -07:00
Awni Hannun	af705590ac	fix batched vector sdpa (#2152 )	2025-05-05 13:13:03 -07:00
Awni Hannun	825124af8f	fix bw for elementwise ops (#2151 ) * fix bw for elementwise ops * add compile * fix * fix * fix * fix	2025-05-05 06:15:04 -07:00
Awni Hannun	9c5e7da507	fix compile merging (#2150 )	2025-05-02 15:08:50 -07:00
Angelos Katharopoulos	481349495b	GPU Hadamard for large N (#1879 )	2025-05-01 17:19:17 -07:00
Awni Hannun	9daa6b003f	fix shapeless export (#2148 )	2025-05-01 15:02:02 -07:00
Angelos Katharopoulos	a3a632d567	Fix the launcher when ran locally (#2147 )	2025-05-01 12:56:09 -07:00
Awni Hannun	e496c5a4b4	fix integer overflow in qmm (#2143 )	2025-04-30 09:28:56 -07:00
Cheng	ea890d8710	Remove metal-only tests (#2139 )	2025-04-30 09:08:39 -07:00
Awni Hannun	aa5d84f102	Allow quant layer to be unfrozen (#2142 )	2025-04-30 09:08:29 -07:00
Awni Hannun	f1606486d2	Generalize gpu backend (#2138 ) * generalize gpu backend * fix no_gpu build * fix no_gpu build * generalize gpu backend	2025-04-30 09:08:17 -07:00
Cheng	87720a8908	Fix building with uv (#2141 )	2025-04-30 06:04:07 -07:00
Aashiq Dheeraj	bb6565ef14	add fftshift and ifftshift fft helpers (#2135 ) * add fftshift and ifftshift fft helpers * address comments * axes have to be iterable * fix fp error in roll + add test --------- Co-authored-by: Aashiq Dheeraj <aashiq@aashiq-mbp-m4.local>	2025-04-29 22:13:45 -07:00
Awni Hannun	7bb063bcb3	Enable vjp for quantized scale and bias (#2129 ) * Enable vjp for quantized scale and bias * higher tol	2025-04-29 13:03:09 -07:00
Alex Chi Z.	b36dd472bb	return library if it is successfully loaded (#2131 )	2025-04-29 07:30:36 -07:00
hdeng-apple	167b759a38	Fix typos (#2136 )	2025-04-29 07:26:05 -07:00
charan-003	99b9868859	Clarify dimension notation in conv1d, conv2d, and conv3d docstrings (#2123 ) * Clarify dimension notation in conv1d, conv2d, and conv3d docstrings * Updating transposed convs in conv1d, conv2d, and conv3d --------- Co-authored-by: Sai Charan Arvapally <saicharan@Sais-MacBook-Pro.local>	2025-04-25 12:18:30 -07:00
1ndig0	6b2d5448f2	Fix the error message in `mx.right_shift` and `mx.left_shift` (#2121 ) * update right_shift and lef_shift * simplify --------- Co-authored-by: Awni Hannun <awni@apple.com>	2025-04-25 09:14:28 -07:00
Awni Hannun	eaf709b83e	patch (#2119 )	2025-04-24 16:11:07 -07:00
Angelos Katharopoulos	f0e70afff0	Fix swift pm load (#2117 )	2025-04-24 10:58:29 -07:00
hdeng-apple	86984cad68	Remove static initializers (#2059 ) * Remove static initializers in device.cpp, load.cpp, pocketfft.h * Remove static initializer InTracing::trace_stack * Remove static initializer of CompilerCache cache * Revert changes in pocketfft.h * Remove duplicate private section of thread_pool()	2025-04-24 06:14:49 -07:00
Awni Hannun	fbc89e3ced	fix pinv (#2110 )	2025-04-23 13:08:28 -07:00
hdeng-apple	38c1e720c2	Search mlx.metallib in macOS framework "Resources" dir (#2061 ) --------- Co-authored-by: Angelos Katharopoulos <a_katharopoulos@apple.com>	2025-04-23 09:53:13 -07:00
Param Thakkar	600e87e03c	Added output_padding parameters in conv_transpose (#2092 )	2025-04-23 09:26:33 -07:00
Hyunsung Lee	3836445241	Add broadcast_shapes in python API (#2091 )	2025-04-22 18:57:39 -07:00
Yury Popov	1d2c9d6a07	Complex scan (#2094 )	2025-04-22 18:56:28 -07:00
Awni Hannun	e8ac6bd2f5	irfft throws instead of segfaults on scalars (#2109 )	2025-04-22 10:25:55 -07:00
Awni Hannun	fdadc4f22c	Add more complex unary ops (#2101 )	2025-04-21 13:04:54 -07:00
Awni Hannun	79b527f45f	conv vmap (#2102 )	2025-04-21 13:04:39 -07:00
Awni Hannun	dc4eada7f0	Use unordered map for kwargs in export/import (#2087 ) * use unordered map for kwargs in export/import * comment	2025-04-21 07:17:22 -07:00
Cheng	70ebc3b598	Return const ref in array::data_shared_ptr (#2100 )	2025-04-21 07:17:09 -07:00
Cheng	b13f2aed16	Introduce macros for dispatching dynamic dtypes as static types (#2073 )	2025-04-19 06:16:30 -07:00
Param Thakkar	5f04c0f818	Fixed shift operations issue (#2080 ) * Fixed shift operations issue * Added tests and fixes * Fixed loop syntax error * Added tests for bool * Fixed typo	2025-04-18 14:28:33 -07:00
Awni Hannun	55935ccae7	fix py gc edge case (#2079 )	2025-04-18 12:46:53 -07:00
Awni Hannun	b529515eb1	minor bump (#2081 )	2025-04-17 14:57:11 -07:00
Angelos Katharopoulos	3cde719eb7	Route to gather qmm only for many tokens per expert (#2082 )	2025-04-17 14:53:08 -07:00
Angelos Katharopoulos	5de6d94a90	Gather qmm batched kernel and refactoring of quantized (#2078 )	2025-04-17 13:53:11 -07:00
Angelos Katharopoulos	99eefd2ec0	Gather mm new kernel and small refactoring (#2040 )	2025-04-14 16:37:36 -07:00
Yury Popov	e9e268336b	LogCumSumExp (#2069 )	2025-04-13 01:27:29 -07:00
Awni Hannun	7275ac7523	Fix release build (#2072 )	2025-04-12 20:41:58 -07:00
Angelos Katharopoulos	c4189a38e4	Add float mask to sdpa vector (#2068 )	2025-04-11 17:29:40 -07:00
Awni Hannun	68d1b3256b	nit: fix exception handling (#2066 )	2025-04-11 14:12:08 -07:00
Awni Hannun	9c6953bda7	Fix stubgen (#2065 ) * Fix stubgen * add multi optim to docs	2025-04-11 12:02:54 -07:00
Awni Hannun	ef7ece9851	fix fft bug (#2062 )	2025-04-10 19:41:27 -07:00
Angelos Katharopoulos	ddaa4b7dcb	Fix the test and add custom min/max reductions for uncommon MPI types (#2060 )	2025-04-10 17:01:17 -07:00
Cheng	dfae2c6989	Fix MSVC build due to use of M_LN2 (#2058 )	2025-04-10 07:41:41 -07:00
Anastasiia Filippova	515f104926	Min / max reductions (#2041 )	2025-04-09 23:22:20 -07:00
Angelos Katharopoulos	9ecefd56db	Do not load the default lib if another is requested (#2055 )	2025-04-09 13:31:38 -07:00
Awni Hannun	e5d35aa187	no sdpa in grad (#2054 )	2025-04-08 19:13:54 -07:00
Awni Hannun	00794c42bc	Fix causal mask sdpa vec (#2053 ) * fix sdpa vector causal mask * test	2025-04-08 09:11:23 -07:00
Cheng	08a1bf3f10	Remove Event::Signal() (#2052 )	2025-04-08 06:20:27 -07:00
Awni Hannun	60c4154346	Only request residency once (#2051 )	2025-04-07 10:47:51 -07:00
Awni Hannun	f2c85308c1	add a half simd gemm fallback (#2046 ) * add a half simd gemm fallback * nit	2025-04-07 09:31:29 -07:00
Awni Hannun	1a28b69ee2	only add to residency set once (#2049 )	2025-04-06 17:38:25 -07:00
Cheng	ba09f01ce8	Remove test of converting negative float to uint (#2048 )	2025-04-06 06:21:46 -07:00
Cheng	6cf48872b7	wait_for_one should wait for task to finish (#2047 )	2025-04-05 20:05:16 -07:00
Angelos Katharopoulos	7b3b8fa000	Fix ci release (#2045 )	2025-04-04 20:25:01 -07:00
Awni Hannun	ec5e2aae61	nit in doc (#2044 )	2025-04-04 12:04:17 -07:00