auto build linux release (#2341)

[CUDA] Do vectorized store/load in binary ops (#2330)
Build with all cpu cores by default (#2336)
2025-12-16 01:49:05 +08:00 · 2025-07-07 09:29:23 -07:00 · 2025-07-07 08:44:14 -07:00 · 2025-07-07 06:06:45 -07:00 · 2025-07-07 06:06:01 -07:00 · 2025-07-05 08:33:29 -07:00
75 changed files with 3266 additions and 1895 deletions
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -16,6 +16,9 @@ parameters:
  linux_release:
    type: boolean
    default: false
+  cuda_release:
+    type: boolean
+    default: false

 jobs:
  build_documentation:
@@ -38,7 +41,7 @@ jobs:
            pip install --upgrade pip
            pip install --upgrade cmake
            pip install -r docs/requirements.txt
-            CMAKE_BUILD_PARALLEL_LEVEL=`sysctl -n hw.ncpu` pip install . -v
+            pip install . -v
      - when:
          condition:
            not: << parameters.upload-docs >>
@@ -94,17 +97,15 @@ jobs:
          name: Install Python package
          command: |
            CMAKE_ARGS="-DMLX_BUILD_METAL=OFF" \
-              CMAKE_BUILD_PARALLEL_LEVEL=`nproc` \
              python3 setup.py build_ext --inplace
            CMAKE_ARGS="-DMLX_BUILD_METAL=OFF" \
-              CMAKE_BUILD_PARALLEL_LEVEL=`nproc` \
              python3 setup.py develop
      - run:
          name: Generate package stubs
          command: |
            echo "stubs"
            pip install typing_extensions
-            python setup.py generate_stubs 
+            python setup.py generate_stubs
      - run:
          name: Run Python tests
          command: |
@@ -154,15 +155,14 @@ jobs:
          name: Install Python package
          command: |
            source env/bin/activate
-            DEBUG=1 CMAKE_BUILD_PARALLEL_LEVEL=`sysctl -n hw.ncpu` \
-            CMAKE_ARGS="-DCMAKE_COMPILE_WARNING_AS_ERROR=ON" \
+            DEBUG=1 CMAKE_ARGS="-DCMAKE_COMPILE_WARNING_AS_ERROR=ON" \
              pip install -e . -v
      - run:
          name: Generate package stubs
          command: |
            source env/bin/activate
            pip install typing_extensions
-            python setup.py generate_stubs 
+            python setup.py generate_stubs
      - run:
          name: Run Python tests
          command: |
@@ -205,8 +205,7 @@ jobs:
          name: Run Python tests with JIT
          command: |
            source env/bin/activate
-            CMAKE_BUILD_PARALLEL_LEVEL=`sysctl -n hw.ncpu` \
-              CMAKE_ARGS="-DMLX_METAL_JIT=ON" \
+            CMAKE_ARGS="-DMLX_METAL_JIT=ON" \
              pip install -e . -v
            LOW_MEMORY=1 DEVICE=gpu METAL_DEVICE_WRAPPER_TYPE=1 \
              METAL_DEBUG_ERROR_MODE=0 \
@@ -223,11 +222,9 @@ jobs:
          command: |
            sudo apt-get update
            sudo apt-get install libblas-dev liblapack-dev liblapacke-dev
-            sudo apt-get install openmpi-bin openmpi-common libopenmpi-dev
            python -m venv env
            source env/bin/activate
-            CMAKE_BUILD_PARALLEL_LEVEL=`nproc` \
-              CMAKE_ARGS="-DMLX_BUILD_CUDA=ON -DCMAKE_CUDA_COMPILER=`which nvcc`" \
+            CMAKE_ARGS="-DMLX_BUILD_CUDA=ON -DCMAKE_CUDA_COMPILER=`which nvcc`" \
              pip install -e ".[dev]"
      - run:
          name: Run Python tests
@@ -276,21 +273,18 @@ jobs:
          command: |
            source env/bin/activate
            env -u MACOSX_DEPLOYMENT_TARGET DEV_RELEASE=1 \
-              CMAKE_BUILD_PARALLEL_LEVEL=`sysctl -n hw.ncpu` \
              pip install . -v
      - run:
          name: Generate package stubs
          command: |
            source env/bin/activate
            pip install typing_extensions
-            python setup.py generate_stubs 
+            python setup.py generate_stubs
      - run:
          name: Build Python package
          command: |
            source env/bin/activate
-            << parameters.build_env >> \
-              CMAKE_BUILD_PARALLEL_LEVEL=`sysctl -n hw.ncpu` \
-              python -m build -w
+            << parameters.build_env >> python -m build -w
      - when:
          condition: << parameters.build_env >>
          steps:
@@ -338,14 +332,10 @@ jobs:
            pip install patchelf
            pip install build
            pip install twine
-            << parameters.extra_env >> \
-              CMAKE_BUILD_PARALLEL_LEVEL=`nproc` \
-              pip install . -v
+            << parameters.extra_env >> pip install . -v
            pip install typing_extensions
-            python setup.py generate_stubs 
-            << parameters.extra_env >> \
-              CMAKE_BUILD_PARALLEL_LEVEL=`nproc` \
-              python -m build --wheel
+            python setup.py generate_stubs
+            << parameters.extra_env >> python -m build --wheel
            auditwheel show dist/*
            auditwheel repair dist/* --plat manylinux_2_31_x86_64
      - run:
@@ -356,6 +346,46 @@ jobs:
      - store_artifacts:
          path: wheelhouse/

+  build_cuda_release:  # Builds the CUDA wheel, repairs it and uploads it with twine.
+    parameters:
+      python_version:  # NOTE(review): not referenced by any step in this job - confirm it is needed.
+        type: string
+        default: "3.9"
+      extra_env:
+        type: string
+        default: "DEV_RELEASE=1"
+    machine:
+      image: linux-cuda-12:default
+      resource_class: gpu.nvidia.small.gen2
+    steps:
+      - checkout
+      - run:
+          name: Build wheel
+          command: |
+            sudo apt-get update
+            sudo apt-get install -y libblas-dev liblapack-dev liblapacke-dev
+            python -m venv env
+            source env/bin/activate
+            pip install auditwheel
+            pip install patchelf
+            pip install build
+            pip install twine
+            << parameters.extra_env >> \
+              CMAKE_ARGS="-DMLX_BUILD_CUDA=ON -DCMAKE_CUDA_COMPILER=`which nvcc`" \
+              pip install ".[dev]" -v
+            python setup.py generate_stubs
+            << parameters.extra_env >> \
+              CMAKE_ARGS="-DMLX_BUILD_CUDA=ON -DCMAKE_CUDA_COMPILER=`which nvcc`" \
+              python -m build --wheel
+            bash python/scripts/repair_cuda.sh
+      - run:
+          name: Upload package
+          command: |
+            source env/bin/activate
+            twine upload wheelhouse/*.whl
+      - store_artifacts:
+          path: wheelhouse/
+
 workflows:
  build_and_test:
    when:
@@ -462,6 +492,16 @@ workflows:
            branches:
              ignore: /.*/
          upload-docs: true
+      - build_linux_release:
+          filters:
+            tags:
+              only: /^v.*/
+            branches:
+              ignore: /.*/
+          matrix:
+            parameters:
+              python_version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
+              extra_env: ["PYPI_RELEASE=1"]

  prb:
    when:
@@ -625,3 +665,14 @@ workflows:
            parameters:
              python_version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
              extra_env: ["PYPI_RELEASE=1"]
+  cuda_test_release:
+    when:
+      and:
+        - equal: [ main, << pipeline.git.branch >> ]
+        - << pipeline.parameters.cuda_release >>
+    jobs:
+      - build_cuda_release:
+          matrix:
+            parameters:
+              python_version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
+              extra_env: ["PYPI_RELEASE=1"]
--- a/benchmarks/python/comparative/bench_torch.py
+++ b/benchmarks/python/comparative/bench_torch.py
@@ -5,6 +5,7 @@ import os
 import time

 import torch
+import torch.cuda
 import torch.mps


@@ -44,8 +45,10 @@ def bench(f, *args):


 def sync_if_needed(x):
-    if x.device != torch.device("cpu"):
+    if x.device.type == "mps":  # match on type: a tensor's device carries an index (e.g. mps:0)
         torch.mps.synchronize()
+    elif x.device.type == "cuda":  # an indexed device such as cuda:0 != torch.device("cuda")
+        torch.cuda.synchronize()


@torch.no_grad()
@@ -99,6 +102,14 @@ def reduction(op, axis, x):
    sync_if_needed(x)


+@torch.no_grad()
+def sum_and_add(axis, x, y):
+    z = x.sum(axis=axis, keepdims=True)
+    for i in range(50):
+        z = (z + y).sum(axis=axis, keepdims=True)
+    sync_if_needed(x)
+
+
@torch.no_grad()
 def softmax(axis, x):
    ys = []
@@ -340,7 +351,11 @@ if __name__ == "__main__":
        args.axis.pop(0)

    torch.set_num_threads(1)
-    device = "cpu" if args.cpu else "mps"
+    device = "mps"
+    if torch.cuda.is_available():
+        device = "cuda"
+    if args.cpu:
+        device = "cpu"

    types = args.dtype
    if not types:
@@ -460,5 +475,8 @@ if __name__ == "__main__":
    elif args.benchmark == "selu":
        print(bench(selu, x))

+    elif args.benchmark == "sum_and_add":
+        print(bench(sum_and_add, axis, *xs))
+
    else:
        raise ValueError(f"Unknown benchmark `{args.benchmark}`.")
--- a/docs/src/install.rst
+++ b/docs/src/install.rst
@@ -30,6 +30,16 @@ MLX is also available on conda-forge. To install MLX with conda do:

   conda install conda-forge::mlx

+CUDA
+^^^^
+
+MLX has a CUDA backend which you can use on any Linux platform with CUDA 12
+and SM 7.0 (Volta) and up. To install MLX with CUDA support, run:
+
+.. code-block:: shell
+
+    pip install mlx-cuda
+

 Troubleshooting
 ^^^^^^^^^^^^^^^
@@ -65,6 +75,8 @@ Build Requirements
 Python API
 ^^^^^^^^^^

+.. _python install:
+
 To build and install the MLX python library from source, first, clone MLX from
 `its GitHub repo <https://github.com/ml-explore/mlx>`_:

@@ -76,20 +88,20 @@ Then simply build and install MLX using pip:

 .. code-block:: shell

-  CMAKE_BUILD_PARALLEL_LEVEL=8 pip install .
+  pip install .

 For developing, install the package with development dependencies, and use an
 editable install:

 .. code-block:: shell

-  CMAKE_BUILD_PARALLEL_LEVEL=8 pip install -e ".[dev]"
+  pip install -e ".[dev]"

 Once the development dependencies are installed, you can build faster with:

 .. code-block:: shell

- CMAKE_BUILD_PARALLEL_LEVEL=8 python setup.py build_ext --inplace
+ python setup.py build_ext --inplace

 Run the tests with:

@@ -107,6 +119,8 @@ IDE:
 C++ API
 ^^^^^^^

+.. _cpp install:
+
 Currently, MLX must be built and installed from source.

 Similarly to the python library, to build and install the MLX C++ library start
@@ -185,6 +199,7 @@ should point to the path to the built metal library.

      xcrun -sdk macosx --show-sdk-version

+
 Binary Size Minimization
 ~~~~~~~~~~~~~~~~~~~~~~~~

@@ -213,6 +228,50 @@ be anwywhere from a few hundred millisecond to a few seconds depending on the
 application. Once a kernel is compiled, it will be cached by the system. The
 Metal kernel cache persists across reboots.

+Linux
+^^^^^
+
+To build from source on Linux (CPU only), install the BLAS and LAPACK headers.
+For example on Ubuntu, run the following:
+
+.. code-block:: shell
+
+   apt-get update -y
+   apt-get install libblas-dev liblapack-dev liblapacke-dev -y
+
+From here follow the instructions to install either the :ref:`Python <python
+install>` or :ref:`C++ <cpp install>` APIs.
+
+CUDA
+^^^^
+
+To build from source on Linux with CUDA, install the BLAS and LAPACK headers
+and the CUDA toolkit. For example on Ubuntu, run the following:
+
+.. code-block:: shell
+
+   wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
+   dpkg -i cuda-keyring_1.1-1_all.deb
+   apt-get update -y
+   apt-get -y install cuda-toolkit-12-9
+   apt-get install libblas-dev liblapack-dev liblapacke-dev -y
+
+
+When building either the Python or C++ APIs make sure to pass the cmake flag
+``MLX_BUILD_CUDA=ON``. For example, to build the Python API run:
+
+.. code-block:: shell
+
+  CMAKE_ARGS="-DMLX_BUILD_CUDA=ON" pip install -e ".[dev]"
+
+To build the C++ package run:
+
+.. code-block:: shell
+
+   mkdir -p build && cd build
+   cmake .. -DMLX_BUILD_CUDA=ON && make -j
+
+
 Troubleshooting
 ^^^^^^^^^^^^^^^

--- a/mlx/backend/common/compiled.cpp
+++ b/mlx/backend/common/compiled.cpp
@@ -14,6 +14,8 @@ void print_constant(std::ostream& os, const array& x) {
      return print_float_constant<float16_t>(os, x);
    case bfloat16:
      return print_float_constant<bfloat16_t>(os, x);
+    case float64:
+      return print_float_constant<double>(os, x);
    case complex64:
      return print_complex_constant<complex64_t>(os, x);
    case int8:
@@ -50,6 +52,8 @@ std::string get_type_string(Dtype d) {
      return "float16_t";
    case bfloat16:
      return "bfloat16_t";
+    case float64:
+      return "double";
    case complex64:
      return "complex64_t";
    case bool_:
--- a/mlx/backend/common/compiled.h
+++ b/mlx/backend/common/compiled.h
@@ -18,8 +18,12 @@ std::string get_type_string(Dtype d);
 template <typename T>
 void print_float_constant(std::ostream& os, const array& x) {
  auto old_precision = os.precision();
-  os << std::setprecision(std::numeric_limits<float>::digits10 + 1)
-     << x.item<T>() << std::setprecision(old_precision);
+  if constexpr (std::is_same_v<T, double>) {
+    os << std::setprecision(std::numeric_limits<double>::digits10 + 1);
+  } else {
+    os << std::setprecision(std::numeric_limits<float>::digits10 + 1);
+  }
+  os << x.item<T>() << std::setprecision(old_precision);
 }

 template <typename T>
--- a/mlx/backend/common/matmul.h
+++ b/mlx/backend/common/matmul.h
@@ -12,16 +12,11 @@ namespace mlx::core {
 inline std::tuple<Shape, Strides, Strides> collapse_batches(
    const array& a,
    const array& b) {
-  // Get and check the shape for the batched dims
-  Shape A_bshape{a.shape().begin(), a.shape().end() - 2};
-  Shape B_bshape{b.shape().begin(), b.shape().end() - 2};
-  if (A_bshape != B_bshape) {
-    std::ostringstream msg;
-    msg << "[matmul] Got matrices with incorrectly broadcasted shapes: " << "A "
-        << a.shape() << ", B " << b.shape() << ".";
-    throw std::runtime_error(msg.str());
+  if (a.ndim() == 2) {
+    return {{1}, {0}, {0}};
  }

+  Shape A_bshape{a.shape().begin(), a.shape().end() - 2};
  Strides A_bstride{a.strides().begin(), a.strides().end() - 2};
  Strides B_bstride{b.strides().begin(), b.strides().end() - 2};

@@ -42,17 +37,11 @@ inline std::tuple<Shape, Strides, Strides> collapse_batches(

 inline std::tuple<Shape, Strides, Strides, Strides>
 collapse_batches(const array& a, const array& b, const array& c) {
-  // Get and check the shape for the batched dims
-  Shape A_bshape{a.shape().begin(), a.shape().end() - 2};
-  Shape B_bshape{b.shape().begin(), b.shape().end() - 2};
-  Shape C_bshape{c.shape().begin(), c.shape().end() - 2};
-  if (A_bshape != B_bshape || A_bshape != C_bshape) {
-    std::ostringstream msg;
-    msg << "[addmm] Got matrices with incorrectly broadcasted shapes: " << "A "
-        << a.shape() << ", B " << b.shape() << ", B " << c.shape() << ".";
-    throw std::runtime_error(msg.str());
+  if (a.ndim() == 2) {
+    return {{1}, {0}, {0}, {0}};
  }

+  Shape A_bshape{a.shape().begin(), a.shape().end() - 2};
  Strides A_bstride{a.strides().begin(), a.strides().end() - 2};
  Strides B_bstride{b.strides().begin(), b.strides().end() - 2};
  Strides C_bstride{c.strides().begin(), c.strides().end() - 2};
--- a/mlx/backend/common/reduce.cpp
+++ b/mlx/backend/common/reduce.cpp
@@ -5,11 +5,9 @@
 namespace mlx::core {

 std::pair<Shape, Strides> shapes_without_reduction_axes(
-    const array& x,
+    Shape shape,
+    Strides strides,
    const std::vector<int>& axes) {
-  auto shape = x.shape();
-  auto strides = x.strides();
-
  for (int i = axes.size() - 1; i >= 0; i--) {
    int a = axes[i];
    shape.erase(shape.begin() + a);
@@ -19,6 +17,15 @@ std::pair<Shape, Strides> shapes_without_reduction_axes(
  return std::make_pair(shape, strides);
 }

+std::pair<Shape, Strides> shapes_without_reduction_axes(
+    const array& x,
+    const std::vector<int>& axes) {
+  auto shape = x.shape();
+  auto strides = x.strides();
+  return shapes_without_reduction_axes(
+      std::move(shape), std::move(strides), axes);
+}
+
 ReductionPlan get_reduction_plan(const array& x, const std::vector<int>& axes) {
  // The data is all there and we are reducing over everything
  if (x.size() == x.data_size() && axes.size() == x.ndim() &&
--- a/mlx/backend/common/reduce.h
+++ b/mlx/backend/common/reduce.h
@@ -51,5 +51,9 @@ ReductionPlan get_reduction_plan(const array& x, const std::vector<int>& axes);
 std::pair<Shape, Strides> shapes_without_reduction_axes(
    const array& x,
    const std::vector<int>& axes);
+std::pair<Shape, Strides> shapes_without_reduction_axes(
+    Shape shape,
+    Strides strides,
+    const std::vector<int>& axes);

 } // namespace mlx::core
--- a/mlx/backend/common/utils.cpp
+++ b/mlx/backend/common/utils.cpp
@@ -199,12 +199,15 @@ Dims get_2d_grid_dims_common(
      }
    }
  }
-  if (grid_y > UINT32_MAX || grid_x > UINT32_MAX || divisor > 1) {
+  if (grid_y > UINT32_MAX || grid_x > UINT32_MAX) {
    throw std::runtime_error("Unable to safely factor shape.");
  }
  if (grid_y > grid_x) {
    std::swap(grid_x, grid_y);
  }
+  if (divisor > 1) {
+    grid_x = ((grid_x + divisor - 1) / divisor) * divisor;
+  }
  return std::make_tuple(
      static_cast<uint32_t>(grid_x), static_cast<uint32_t>(grid_y), 1);
 }
--- a/mlx/backend/cuda/CMakeLists.txt
+++ b/mlx/backend/cuda/CMakeLists.txt
@@ -8,6 +8,7 @@ target_sources(
  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/allocator.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/arg_reduce.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/binary.cu
+          ${CMAKE_CURRENT_SOURCE_DIR}/binary_two.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/compiled.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/copy.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/copy/copy_contiguous.cu
@@ -28,9 +29,10 @@ target_sources(
          ${CMAKE_CURRENT_SOURCE_DIR}/primitives.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/random.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/reduce.cu
+          ${CMAKE_CURRENT_SOURCE_DIR}/reduce/all_reduce.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/reduce/col_reduce.cu
+          ${CMAKE_CURRENT_SOURCE_DIR}/reduce/init_reduce.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/reduce/row_reduce.cu
-          ${CMAKE_CURRENT_SOURCE_DIR}/reduce/segmented_reduce.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/rms_norm.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/rope.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/slicing.cpp
--- a/mlx/backend/cuda/allocator.cpp
+++ b/mlx/backend/cuda/allocator.cpp
@@ -3,6 +3,7 @@
 #include "mlx/backend/cuda/allocator.h"
 #include "mlx/backend/cuda/utils.h"
 #include "mlx/backend/cuda/worker.h"
+#include "mlx/utils.h"

 #include <cuda_runtime.h>
 #include <fmt/format.h>
@@ -14,9 +15,11 @@ namespace mlx::core {

 namespace cu {

+constexpr int page_size = 16384;
+
 CudaAllocator::CudaAllocator()
    : buffer_cache_(
-          getpagesize(),
+          page_size,
          [](CudaBuffer* buf) { return buf->size; },
          [this](CudaBuffer* buf) {
            cuda_free(buf->data);
@@ -31,7 +34,14 @@ CudaAllocator::CudaAllocator()

 Buffer CudaAllocator::malloc(size_t size) {
  // Find available buffer from cache.
+  auto orig_size = size;
  std::unique_lock lock(mutex_);
+  if (size < page_size) {
+    size = next_power_of_2(size);
+  } else {
+    size = page_size * ((size + page_size - 1) / page_size);
+  }
+
  CudaBuffer* buf = buffer_cache_.reuse_from_cache(size);
  if (!buf) {
    // If we have a lot of memory pressure or are over the maximum cache size,
@@ -106,7 +116,6 @@ void CudaAllocator::cuda_free(void* buf) {
      return;
    }
  }
-
  cudaFree(buf);
 }

--- a/mlx/backend/cuda/arg_reduce.cu
+++ b/mlx/backend/cuda/arg_reduce.cu
@@ -151,36 +151,29 @@ void ArgReduce::eval_gpu(const std::vector<array>& inputs, array& out) {
  auto& encoder = cu::get_command_encoder(s);
  encoder.set_input_array(in);
  encoder.set_output_array(out);
-  encoder.launch_kernel([&](cudaStream_t stream) {
-    MLX_SWITCH_REAL_TYPES_CHECKED(in.dtype(), "ArgReduce", CTYPE, {
-      using InType = cuda_type_t<CTYPE>;
-      constexpr uint32_t N_READS = 4;
-      MLX_SWITCH_BLOCK_DIM(cuda::ceil_div(axis_size, N_READS), BLOCK_DIM, {
-        dim3 num_blocks = get_2d_grid_dims(out.shape(), out.strides());
-        dim3 block_dims{BLOCK_DIM, 1, 1};
-        auto kernel = &cu::arg_reduce_general<
-            InType,
-            cu::ArgMax<InType>,
-            BLOCK_DIM,
-            N_READS>;
-        if (reduce_type_ == ArgReduce::ArgMin) {
-          kernel = &cu::arg_reduce_general<
-              InType,
-              cu::ArgMin<InType>,
-              BLOCK_DIM,
-              N_READS>;
-        }
-        kernel<<<num_blocks, block_dims, 0, stream>>>(
-            in.data<InType>(),
-            out.data<uint32_t>(),
-            out.size(),
-            const_param(shape),
-            const_param(in_strides),
-            const_param(out_strides),
-            ndim,
-            axis_stride,
-            axis_size);
-      });
+  dispatch_real_types(in.dtype(), "ArgReduce", [&](auto type_tag) {
+    using T = cuda_type_t<MLX_GET_TYPE(type_tag)>;
+    constexpr uint32_t N_READS = 4;
+    dispatch_block_dim(cuda::ceil_div(axis_size, N_READS), [&](auto block_dim) {
+      dim3 num_blocks = get_2d_grid_dims(out.shape(), out.strides());
+      auto kernel =
+          cu::arg_reduce_general<T, cu::ArgMax<T>, block_dim(), N_READS>;
+      if (reduce_type_ == ArgReduce::ArgMin) {
+        kernel = cu::arg_reduce_general<T, cu::ArgMin<T>, block_dim(), N_READS>;
+      }
+      encoder.add_kernel_node(
+          kernel,
+          num_blocks,
+          block_dim(),
+          in.data<T>(),
+          out.data<uint32_t>(),
+          out.size(),
+          const_param(shape),
+          const_param(in_strides),
+          const_param(out_strides),
+          ndim,
+          axis_stride,
+          axis_size);
    });
  });
 }
--- a/mlx/backend/cuda/binary.cu
+++ b/mlx/backend/cuda/binary.cu
@@ -17,35 +17,106 @@ namespace cu {

 namespace cg = cooperative_groups;

-template <typename Op, typename In, typename Out, typename IdxT>
+template <typename Op, typename In, typename Out, typename IdxT, int N_READS>
 __global__ void binary_ss(const In* a, const In* b, Out* out, IdxT size) {
   IdxT index = cg::this_grid().thread_rank();
-  if (index < size) {
-    out[index] = Op{}(a[0], b[0]);
+  IdxT start = index * N_READS; // first element handled by this thread
+  if (start >= size) {
+    return;
+  }
+
+  if (size - start < N_READS) { // fewer than N_READS elements left: scalar path
+    for (IdxT offset = start; offset < size; ++offset) {
+      // Bounds stay in IdxT; narrowing to int truncates when IdxT is int64_t.
+      out[offset] = Op{}(a[0], b[0]);
+    }
+  } else {
+    AlignedVector<Out, N_READS> out_vec;
+#pragma unroll
+    for (int i = 0; i < N_READS; ++i) {
+      out_vec.val[i] = Op{}(a[0], b[0]);
+    }
+
+    store_vector<N_READS>(out, index, out_vec);
   }
 }

-template <typename Op, typename In, typename Out, typename IdxT>
+template <typename Op, typename In, typename Out, typename IdxT, int N_READS>
 __global__ void binary_sv(const In* a, const In* b, Out* out, IdxT size) {
   IdxT index = cg::this_grid().thread_rank();
-  if (index < size) {
-    out[index] = Op{}(a[0], b[index]);
+  IdxT start = index * N_READS; // first element handled by this thread
+  if (start >= size) {
+    return;
+  }
+
+  if (size - start < N_READS) { // fewer than N_READS elements left: scalar path
+    for (IdxT offset = start; offset < size; ++offset) {
+      // Bounds stay in IdxT; narrowing to int truncates when IdxT is int64_t.
+      out[offset] = Op{}(a[0], b[offset]);
+    }
+  } else {
+    auto b_vec = load_vector<N_READS>(b, index);
+
+    AlignedVector<Out, N_READS> out_vec;
+#pragma unroll
+    for (int i = 0; i < N_READS; ++i) {
+      out_vec.val[i] = Op{}(a[0], b_vec.val[i]);
+    }
+
+    store_vector<N_READS>(out, index, out_vec);
   }
 }

-template <typename Op, typename In, typename Out, typename IdxT>
+template <typename Op, typename In, typename Out, typename IdxT, int N_READS>
 __global__ void binary_vs(const In* a, const In* b, Out* out, IdxT size) {
   IdxT index = cg::this_grid().thread_rank();
-  if (index < size) {
-    out[index] = Op{}(a[index], b[0]);
+  IdxT start = index * N_READS; // first element handled by this thread
+  if (start >= size) {
+    return;
+  }
+
+  if (size - start < N_READS) { // fewer than N_READS elements left: scalar path
+    for (IdxT offset = start; offset < size; ++offset) {
+      // Bounds stay in IdxT; narrowing to int truncates when IdxT is int64_t.
+      out[offset] = Op{}(a[offset], b[0]);
+    }
+  } else {
+    auto a_vec = load_vector<N_READS>(a, index);
+
+    AlignedVector<Out, N_READS> out_vec;
+#pragma unroll
+    for (int i = 0; i < N_READS; ++i) {
+      out_vec.val[i] = Op{}(a_vec.val[i], b[0]);
+    }
+
+    store_vector<N_READS>(out, index, out_vec);
   }
 }

-template <typename Op, typename In, typename Out, typename IdxT>
+template <typename Op, typename In, typename Out, typename IdxT, int N_READS>
 __global__ void binary_vv(const In* a, const In* b, Out* out, IdxT size) {
   IdxT index = cg::this_grid().thread_rank();
-  if (index < size) {
-    out[index] = Op{}(a[index], b[index]);
+  IdxT start = index * N_READS; // first element handled by this thread
+  if (start >= size) {
+    return;
+  }
+
+  if (size - start < N_READS) { // fewer than N_READS elements left: scalar path
+    for (IdxT offset = start; offset < size; ++offset) {
+      // Bounds stay in IdxT; narrowing to int truncates when IdxT is int64_t.
+      out[offset] = Op{}(a[offset], b[offset]);
+    }
+  } else {
+    auto a_vec = load_vector<N_READS>(a, index);
+    auto b_vec = load_vector<N_READS>(b, index);
+
+    AlignedVector<Out, N_READS> out_vec;
+#pragma unroll
+    for (int i = 0; i < N_READS; ++i) {
+      out_vec.val[i] = Op{}(a_vec.val[i], b_vec.val[i]);
+    }
+
+    store_vector<N_READS>(out, index, out_vec);
   }
 }

@@ -125,13 +196,12 @@ constexpr bool supports_binary_op() {
 template <typename Op>
 void binary_op_gpu_inplace(
    const std::vector<array>& inputs,
-    std::vector<array>& outputs,
+    array& out,
    std::string_view op,
    const Stream& s) {
  assert(inputs.size() > 1);
  const auto& a = inputs[0];
  const auto& b = inputs[1];
-  auto& out = outputs[0];
  if (out.size() == 0) {
    return;
  }
@@ -140,99 +210,103 @@ void binary_op_gpu_inplace(
  encoder.set_input_array(a);
  encoder.set_input_array(b);
  encoder.set_output_array(out);
-  encoder.launch_kernel([&](cudaStream_t stream) {
-    MLX_SWITCH_ALL_TYPES(a.dtype(), CTYPE_IN, {
-      MLX_SWITCH_ALL_TYPES(out.dtype(), CTYPE_OUT, {
-        if constexpr (cu::supports_binary_op<Op, CTYPE_IN, CTYPE_OUT>()) {
-          using InType = cuda_type_t<CTYPE_IN>;
-          using OutType = cuda_type_t<CTYPE_OUT>;
-
-          auto bopt = get_binary_op_type(a, b);
-          if (bopt == BinaryOpType::General) {
-            auto [shape, strides] = collapse_contiguous_dims(a, b, out);
-            auto& a_strides = strides[0];
-            auto& b_strides = strides[1];
-            bool large = a.data_size() > INT32_MAX ||
-                b.data_size() > INT32_MAX || out.data_size() > INT32_MAX;
-            MLX_SWITCH_BOOL(large, LARGE, {
-              using IdxT = std::conditional_t<LARGE, int64_t, int32_t>;
-              int ndim = shape.size();
-              if (ndim <= 3) {
-                MLX_SWITCH_1_2_3(ndim, NDIM, {
-                  auto kernel =
-                      &cu::binary_g_nd<Op, InType, OutType, IdxT, NDIM>;
+  dispatch_all_types(a.dtype(), [&](auto in_type_tag) {
+    dispatch_all_types(out.dtype(), [&](auto out_type_tag) {
+      using CTYPE_IN = MLX_GET_TYPE(in_type_tag);
+      using CTYPE_OUT = MLX_GET_TYPE(out_type_tag);
+      if constexpr (cu::supports_binary_op<Op, CTYPE_IN, CTYPE_OUT>()) {
+        using InType = cuda_type_t<CTYPE_IN>;
+        using OutType = cuda_type_t<CTYPE_OUT>;
+        auto bopt = get_binary_op_type(a, b);
+        if (bopt == BinaryOpType::General) {
+          dispatch_bool(
+              a.data_size() > INT32_MAX || b.data_size() > INT32_MAX ||
+                  out.data_size() > INT32_MAX,
+              [&](auto large) {
+                using IdxT = std::conditional_t<large(), int64_t, int32_t>;
+                Shape shape;
+                std::vector<Strides> strides;
+                std::tie(shape, strides) = collapse_contiguous_dims(a, b, out);
+                auto& a_strides = strides[0];
+                auto& b_strides = strides[1];
+                int ndim = shape.size();
+                if (ndim <= 3) {
+                  dispatch_1_2_3(ndim, [&](auto dims_constant) {
+                    auto kernel = cu::
+                        binary_g_nd<Op, InType, OutType, IdxT, dims_constant()>;
+                    auto [num_blocks, block_dims] =
+                        get_launch_args(kernel, out, large());
+                    encoder.add_kernel_node(
+                        kernel,
+                        num_blocks,
+                        block_dims,
+                        a.data<InType>(),
+                        b.data<InType>(),
+                        out.data<OutType>(),
+                        out.size(),
+                        const_param<dims_constant()>(shape),
+                        const_param<dims_constant()>(a_strides),
+                        const_param<dims_constant()>(b_strides));
+                  });
+                } else {
+                  auto kernel = cu::binary_g<Op, InType, OutType, IdxT>;
                  auto [num_blocks, block_dims] =
-                      get_launch_args(kernel, out, large);
-                  kernel<<<num_blocks, block_dims, 0, stream>>>(
+                      get_launch_args(kernel, out, large());
+                  encoder.add_kernel_node(
+                      kernel,
+                      num_blocks,
+                      block_dims,
                      a.data<InType>(),
                      b.data<InType>(),
                      out.data<OutType>(),
                      out.size(),
-                      const_param<NDIM>(shape),
-                      const_param<NDIM>(a_strides),
-                      const_param<NDIM>(b_strides));
-                });
-              } else {
-                auto kernel = cu::binary_g<Op, InType, OutType, IdxT>;
-                auto [num_blocks, block_dims] =
-                    get_launch_args(kernel, out, large);
-                kernel<<<num_blocks, block_dims, 0, stream>>>(
-                    a.data<InType>(),
-                    b.data<InType>(),
-                    out.data<OutType>(),
-                    out.size(),
-                    const_param(shape),
-                    const_param(a_strides),
-                    const_param(b_strides),
-                    ndim);
-              }
-            });
-          } else {
-            MLX_SWITCH_BOOL(out.data_size() > UINT32_MAX, LARGE, {
-              using IdxT = std::conditional_t<LARGE, int64_t, uint32_t>;
-              auto kernel = cu::binary_ss<Op, InType, OutType, IdxT>;
-              if (bopt == BinaryOpType::ScalarVector) {
-                kernel = cu::binary_sv<Op, InType, OutType, IdxT>;
-              } else if (bopt == BinaryOpType::VectorScalar) {
-                kernel = cu::binary_vs<Op, InType, OutType, IdxT>;
-              } else if (bopt == BinaryOpType::VectorVector) {
-                kernel = cu::binary_vv<Op, InType, OutType, IdxT>;
-              }
-              auto [num_blocks, block_dims] = get_launch_args(
-                  kernel, out.data_size(), out.shape(), out.strides(), LARGE);
-              kernel<<<num_blocks, block_dims, 0, stream>>>(
-                  a.data<InType>(),
-                  b.data<InType>(),
-                  out.data<OutType>(),
-                  out.data_size());
-            });
-          }
+                      const_param(shape),
+                      const_param(a_strides),
+                      const_param(b_strides),
+                      ndim);
+                }
+              });
        } else {
-          throw std::runtime_error(fmt::format(
-              "Can not do binary op {} on inputs of {} with result of {}.",
-              op,
-              dtype_to_string(a.dtype()),
-              dtype_to_string(out.dtype())));
+          dispatch_bool(out.data_size() > INT32_MAX, [&](auto large) {
+            using IdxT = std::conditional_t<large(), int64_t, uint32_t>;
+            // TODO: Choose optimized value based on type size.
+            constexpr int N_READS = 4;
+            auto kernel = cu::binary_ss<Op, InType, OutType, IdxT, N_READS>;
+            if (bopt == BinaryOpType::ScalarVector) {
+              kernel = cu::binary_sv<Op, InType, OutType, IdxT, N_READS>;
+            } else if (bopt == BinaryOpType::VectorScalar) {
+              kernel = cu::binary_vs<Op, InType, OutType, IdxT, N_READS>;
+            } else if (bopt == BinaryOpType::VectorVector) {
+              kernel = cu::binary_vv<Op, InType, OutType, IdxT, N_READS>;
+            }
+            auto [num_blocks, block_dims] = get_launch_args(
+                kernel,
+                out.data_size(),
+                out.shape(),
+                out.strides(),
+                large(),
+                N_READS);
+            encoder.add_kernel_node(
+                kernel,
+                num_blocks,
+                block_dims,
+                a.data<InType>(),
+                b.data<InType>(),
+                out.data<OutType>(),
+                out.data_size());
+          });
        }
-      });
+      } else {
+        throw std::runtime_error(fmt::format(
+            "Can not do binary op {} on inputs of {} with result of {}.",
+            op,
+            dtype_to_string(a.dtype()),
+            dtype_to_string(out.dtype())));
+      }
    });
  });
 }

-template <typename Op>
-void binary_op_gpu(
-    const std::vector<array>& inputs,
-    std::vector<array>& outputs,
-    std::string_view op,
-    const Stream& s) {
-  auto& a = inputs[0];
-  auto& b = inputs[1];
-  auto bopt = get_binary_op_type(a, b);
-  set_binary_op_output_data(a, b, outputs[0], bopt);
-  set_binary_op_output_data(a, b, outputs[1], bopt);
-  binary_op_gpu_inplace<Op>(inputs, outputs, op, s);
-}
-
 template <typename Op>
 void binary_op_gpu(
    const std::vector<array>& inputs,
@@ -243,8 +317,7 @@ void binary_op_gpu(
  auto& b = inputs[1];
  auto bopt = get_binary_op_type(a, b);
  set_binary_op_output_data(a, b, out, bopt);
-  std::vector<array> outputs{out};
-  binary_op_gpu_inplace<Op>(inputs, outputs, op, s);
+  binary_op_gpu_inplace<Op>(inputs, out, op, s);
 }

 #define BINARY_GPU(func)                                                 \
@@ -254,14 +327,6 @@ void binary_op_gpu(
    binary_op_gpu<cu::func>(inputs, out, get_primitive_string(this), s); \
  }

-#define BINARY_GPU_MULTI(func)                                               \
-  void func::eval_gpu(                                                       \
-      const std::vector<array>& inputs, std::vector<array>& outputs) {       \
-    nvtx3::scoped_range r(#func "::eval_gpu");                               \
-    auto& s = outputs[0].primitive().stream();                               \
-    binary_op_gpu<cu::func>(inputs, outputs, get_primitive_string(this), s); \
-  }
-
 BINARY_GPU(Add)
 BINARY_GPU(ArcTan2)
 BINARY_GPU(Divide)
--- a/mlx/backend/cuda/binary_two.cu
+++ b/mlx/backend/cuda/binary_two.cu
@@ -0,0 +1,261 @@
+// Copyright © 2025 Apple Inc.
+
+#include "mlx/backend/common/binary.h"
+#include "mlx/backend/cuda/device.h"
+#include "mlx/backend/cuda/device/binary_ops.cuh"
+#include "mlx/backend/cuda/device/cucomplex_math.cuh"
+#include "mlx/backend/cuda/kernel_utils.cuh"
+#include "mlx/dtype_utils.h"
+#include "mlx/primitives.h"
+
+#include <cooperative_groups.h>
+#include <nvtx3/nvtx3.hpp>
+
+namespace mlx::core {
+
+namespace cu {
+
+namespace cg = cooperative_groups;
+
+// Scalar-scalar kernel for ops that produce two results: both operands are
+// read from element 0 and the pair returned by Op is stored to out_a / out_b.
+// Each in-range thread stores to its own slot (out_a[index] / out_b[index]),
+// matching the other kernels in this file, so that no two threads write the
+// same address and no element is left unwritten if size is ever > 1.
+template <typename Op, typename In, typename Out, typename IdxT>
+__global__ void
+binary_ss(const In* a, const In* b, Out* out_a, Out* out_b, IdxT size) {
+  IdxT index = cg::this_grid().thread_rank();
+  if (index < size) {
+    auto out = Op{}(a[0], b[0]);
+    out_a[index] = out[0];
+    out_b[index] = out[1];
+  }
+}
+
+// Scalar-vector kernel: `a` contributes only its first element while `b` is
+// read at the thread's index. The two results of Op are stored at that same
+// index in out_a and out_b. Threads whose index is not below `size` do nothing.
+template <typename Op, typename In, typename Out, typename IdxT>
+__global__ void
+binary_sv(const In* a, const In* b, Out* out_a, Out* out_b, IdxT size) {
+  IdxT tid = cg::this_grid().thread_rank();
+  if (tid >= size) {
+    return;
+  }
+  auto result = Op{}(a[0], b[tid]);
+  out_a[tid] = result[0];
+  out_b[tid] = result[1];
+}
+
+// Vector-scalar kernel: `a` is read at the thread's index while `b`
+// contributes only its first element. The two results of Op are stored at the
+// thread's index in out_a and out_b. Threads whose index is not below `size`
+// do nothing.
+template <typename Op, typename In, typename Out, typename IdxT>
+__global__ void
+binary_vs(const In* a, const In* b, Out* out_a, Out* out_b, IdxT size) {
+  IdxT tid = cg::this_grid().thread_rank();
+  if (tid >= size) {
+    return;
+  }
+  auto result = Op{}(a[tid], b[0]);
+  out_a[tid] = result[0];
+  out_b[tid] = result[1];
+}
+
+// Vector-vector kernel: both operands are read at the thread's index and the
+// two results of Op are stored at that index in out_a and out_b. Threads whose
+// index is not below `size` do nothing.
+template <typename Op, typename In, typename Out, typename IdxT>
+__global__ void
+binary_vv(const In* a, const In* b, Out* out_a, Out* out_b, IdxT size) {
+  IdxT tid = cg::this_grid().thread_rank();
+  if (tid >= size) {
+    return;
+  }
+  auto result = Op{}(a[tid], b[tid]);
+  out_a[tid] = result[0];
+  out_b[tid] = result[1];
+}
+
+// General (strided) kernel with the number of dimensions fixed at compile
+// time. Each in-range thread turns its linear index into element offsets for
+// `a` and `b` via elem_to_loc_nd and stores Op's two results at the linear
+// index in out_a and out_b.
+template <typename Op, typename In, typename Out, typename IdxT, int NDIM>
+__global__ void binary_g_nd(
+    const In* a,
+    const In* b,
+    Out* out_a,
+    Out* out_b,
+    IdxT size,
+    const __grid_constant__ cuda::std::array<int32_t, NDIM> shape,
+    const __grid_constant__ cuda::std::array<int64_t, NDIM> a_strides,
+    const __grid_constant__ cuda::std::array<int64_t, NDIM> b_strides) {
+  IdxT tid = cg::this_grid().thread_rank();
+  if (tid >= size) {
+    return;
+  }
+  auto [a_loc, b_loc] = elem_to_loc_nd<NDIM>(
+      tid, shape.data(), a_strides.data(), b_strides.data());
+  auto result = Op{}(a[a_loc], b[b_loc]);
+  out_a[tid] = result[0];
+  out_b[tid] = result[1];
+}
+
+// General (strided) kernel with a runtime dimension count. Each in-range
+// thread turns its linear index into element offsets for `a` and `b` via
+// elem_to_loc_4d (given `ndim`) and stores Op's two results at the linear
+// index in out_a and out_b.
+template <typename Op, typename In, typename Out, typename IdxT>
+__global__ void binary_g(
+    const In* a,
+    const In* b,
+    Out* out_a,
+    Out* out_b,
+    IdxT size,
+    const __grid_constant__ Shape shape,
+    const __grid_constant__ Strides a_strides,
+    const __grid_constant__ Strides b_strides,
+    int ndim) {
+  IdxT tid = cg::this_grid().thread_rank();
+  if (tid >= size) {
+    return;
+  }
+  auto [a_loc, b_loc] = elem_to_loc_4d(
+      tid, shape.data(), a_strides.data(), b_strides.data(), ndim);
+  auto result = Op{}(a[a_loc], b[b_loc]);
+  out_a[tid] = result[0];
+  out_b[tid] = result[1];
+}
+
+// Compile-time predicate used to gate kernel instantiation: true only when Op
+// is DivMod, the input and output element types are identical, and that type
+// is integral or satisfies is_floating_v. Everything else is unsupported.
+template <typename Op, typename In, typename Out>
+constexpr bool supports_binary_op() {
+  constexpr bool is_divmod = std::is_same_v<Op, DivMod>;
+  constexpr bool same_type = std::is_same_v<In, Out>;
+  constexpr bool numeric = std::is_integral_v<Out> || is_floating_v<Out>;
+  return is_divmod && same_type && numeric;
+}
+
+} // namespace cu
+
+// Encodes the two-output binary kernel for Op on stream `s`.
+//
+// inputs[0] / inputs[1] are the operands; outputs[0] / outputs[1] receive the
+// two results. Storage for both outputs is set up here through
+// set_binary_op_output_data. Nothing is encoded when the first output has
+// size 0.
+//
+// Throws std::runtime_error when cu::supports_binary_op rejects the
+// (input dtype, output dtype) combination for Op.
+template <typename Op>
+void binary_op_gpu_inplace(
+    const std::vector<array>& inputs,
+    std::vector<array>& outputs,
+    std::string_view op,
+    const Stream& s) {
+  assert(inputs.size() > 1);
+  assert(outputs.size() > 1);
+  const auto& a = inputs[0];
+  const auto& b = inputs[1];
+  auto& out_a = outputs[0];
+  auto& out_b = outputs[1];
+  // The op type is computed once and reused by every branch below.
+  auto bopt = get_binary_op_type(a, b);
+  set_binary_op_output_data(a, b, out_a, bopt);
+  set_binary_op_output_data(a, b, out_b, bopt);
+
+  if (out_a.size() == 0) {
+    return;
+  }
+
+  auto& encoder = cu::get_command_encoder(s);
+  encoder.set_input_array(a);
+  encoder.set_input_array(b);
+  encoder.set_output_array(out_a);
+  encoder.set_output_array(out_b);
+  dispatch_all_types(a.dtype(), [&](auto in_type_tag) {
+    dispatch_all_types(out_a.dtype(), [&](auto out_type_tag) {
+      using CTYPE_IN = MLX_GET_TYPE(in_type_tag);
+      using CTYPE_OUT = MLX_GET_TYPE(out_type_tag);
+      if constexpr (cu::supports_binary_op<Op, CTYPE_IN, CTYPE_OUT>()) {
+        using InType = cuda_type_t<CTYPE_IN>;
+        using OutType = cuda_type_t<CTYPE_OUT>;
+
+        if (bopt == BinaryOpType::General) {
+          // Strided path: 64-bit indices are selected when any of the three
+          // buffers holds more than INT32_MAX elements.
+          dispatch_bool(
+              a.data_size() > INT32_MAX || b.data_size() > INT32_MAX ||
+                  out_a.data_size() > INT32_MAX,
+              [&](auto large) {
+                using IdxT = std::conditional_t<large(), int64_t, int32_t>;
+                Shape shape;
+                std::vector<Strides> strides;
+                std::tie(shape, strides) =
+                    collapse_contiguous_dims(a, b, out_a);
+                auto& a_strides = strides[0];
+                auto& b_strides = strides[1];
+                int ndim = shape.size();
+                if (ndim <= 3) {
+                  // Up to three dimensions: kernel specialized on the
+                  // dimension count.
+                  dispatch_1_2_3(ndim, [&](auto dims_constant) {
+                    auto kernel = cu::
+                        binary_g_nd<Op, InType, OutType, IdxT, dims_constant()>;
+                    auto [num_blocks, block_dims] =
+                        get_launch_args(kernel, out_a, large());
+                    encoder.add_kernel_node(
+                        kernel,
+                        num_blocks,
+                        block_dims,
+                        a.data<InType>(),
+                        b.data<InType>(),
+                        out_a.data<OutType>(),
+                        out_b.data<OutType>(),
+                        out_a.size(),
+                        const_param<dims_constant()>(shape),
+                        const_param<dims_constant()>(a_strides),
+                        const_param<dims_constant()>(b_strides));
+                  });
+                } else {
+                  // More than three dimensions: kernel taking ndim at runtime.
+                  auto kernel = cu::binary_g<Op, InType, OutType, IdxT>;
+                  auto [num_blocks, block_dims] =
+                      get_launch_args(kernel, out_a, large());
+                  encoder.add_kernel_node(
+                      kernel,
+                      num_blocks,
+                      block_dims,
+                      a.data<InType>(),
+                      b.data<InType>(),
+                      out_a.data<OutType>(),
+                      out_b.data<OutType>(),
+                      out_a.size(),
+                      const_param(shape),
+                      const_param(a_strides),
+                      const_param(b_strides),
+                      ndim);
+                }
+              });
+        } else {
+          // Non-general path: start from the scalar-scalar kernel and swap in
+          // the variant that matches the op type.
+          dispatch_bool(out_a.data_size() > INT32_MAX, [&](auto large) {
+            using IdxT = std::conditional_t<large(), int64_t, uint32_t>;
+            auto kernel = cu::binary_ss<Op, InType, OutType, IdxT>;
+            if (bopt == BinaryOpType::ScalarVector) {
+              kernel = cu::binary_sv<Op, InType, OutType, IdxT>;
+            } else if (bopt == BinaryOpType::VectorScalar) {
+              kernel = cu::binary_vs<Op, InType, OutType, IdxT>;
+            } else if (bopt == BinaryOpType::VectorVector) {
+              kernel = cu::binary_vv<Op, InType, OutType, IdxT>;
+            }
+            auto [num_blocks, block_dims] = get_launch_args(
+                kernel,
+                out_a.data_size(),
+                out_a.shape(),
+                out_a.strides(),
+                large());
+            encoder.add_kernel_node(
+                kernel,
+                num_blocks,
+                block_dims,
+                a.data<InType>(),
+                b.data<InType>(),
+                out_a.data<OutType>(),
+                out_b.data<OutType>(),
+                out_a.data_size());
+          });
+        }
+      } else {
+        throw std::runtime_error(fmt::format(
+            "Can not do binary op {} on inputs of {} with result of {}.",
+            op,
+            dtype_to_string(a.dtype()),
+            dtype_to_string(out_a.dtype())));
+      }
+    });
+  });
+}
+
+// Entry point for two-output binary ops on the GPU.
+//
+// binary_op_gpu_inplace already calls set_binary_op_output_data for both
+// outputs, so this wrapper only forwards. Setting the outputs up here as well
+// would do the same work twice and throw the first result away.
+template <typename Op>
+void binary_op_gpu(
+    const std::vector<array>& inputs,
+    std::vector<array>& outputs,
+    std::string_view op,
+    const Stream& s) {
+  binary_op_gpu_inplace<Op>(inputs, outputs, op, s);
+}
+
+// GPU evaluation of DivMod: runs the two-output binary op on the stream of
+// the primitive attached to the first output, inside an NVTX range.
+void DivMod::eval_gpu(
+    const std::vector<array>& inputs,
+    std::vector<array>& outputs) {
+  nvtx3::scoped_range range("DivMod::eval_gpu");
+  auto& stream = outputs[0].primitive().stream();
+  const auto op_name = get_primitive_string(this);
+  binary_op_gpu<cu::DivMod>(inputs, outputs, op_name, stream);
+}
+
+} // namespace mlx::core
--- a/mlx/backend/cuda/compiled.cpp
+++ b/mlx/backend/cuda/compiled.cpp
@@ -3,6 +3,7 @@
 #include "mlx/backend/common/compiled.h"
 #include "mlx/backend/cuda/device.h"
 #include "mlx/backend/cuda/jit_module.h"
+#include "mlx/backend/cuda/kernel_utils.cuh"
 #include "mlx/graph_utils.h"
 #include "mlx/primitives.h"

@@ -178,6 +179,7 @@ void Compiled::eval_gpu(
  // Whether to use large index.
  bool large = compiled_use_large_index(inputs, outputs, contiguous);

+  cu::KernelArgs args;
  // Put inputs.
  int strides_index = 1;
  for (size_t i = 0; i < inputs.size(); ++i) {
@@ -185,26 +187,26 @@ void Compiled::eval_gpu(
      continue;
    }
    const auto& x = inputs[i];
-    mod.append_arg(x);
+    args.append(x);
    if (!contiguous && !is_scalar(x)) {
-      mod.append_arg(strides_vec[strides_index++]);
+      args.append_ptr(strides_vec[strides_index++].data());
    }
  }

  // Put outputs.
  compiled_allocate_outputs(inputs, outputs, is_constant_, contiguous);
  for (auto& x : outputs) {
-    mod.append_arg(x);
+    args.append(x);
  }

  // Put shape and size.
  if (!contiguous) {
-    mod.append_arg(shape);
+    args.append_ptr(shape.data());
  }
  if (large) {
-    mod.append_arg<int64_t>(outputs[0].data_size());
+    args.append<int64_t>(outputs[0].data_size());
  } else {
-    mod.append_arg<uint32_t>(outputs[0].data_size());
+    args.append<uint32_t>(outputs[0].data_size());
  }

  // Launch kernel.
@@ -222,9 +224,10 @@ void Compiled::eval_gpu(
  for (const auto& out : outputs) {
    encoder.set_output_array(out);
  }
-  encoder.launch_kernel([&](cudaStream_t stream) {
-    mod.launch_kernel(stream, kernel_name, outputs[0], large);
-  });
+
+  auto kernel = mod.get_kernel(kernel_name);
+  auto [num_blocks, block_dims] = get_launch_args(kernel, outputs[0], large);
+  encoder.add_kernel_node(kernel, num_blocks, block_dims, args.args());
 }

 } // namespace mlx::core
--- a/mlx/backend/cuda/copy.cu
+++ b/mlx/backend/cuda/copy.cu
@@ -24,7 +24,6 @@ void copy_gpu_inplace(
  auto& encoder = cu::get_command_encoder(s);
  encoder.set_input_array(in);
  encoder.set_output_array(out);
-
  if (ctype == CopyType::Scalar || ctype == CopyType::Vector) {
    copy_contiguous(encoder, ctype, in, out, offset_in, offset_out);
    return;
--- a/mlx/backend/cuda/copy/copy.cuh
+++ b/mlx/backend/cuda/copy/copy.cuh
@@ -10,15 +10,6 @@

 namespace mlx::core {

-#define MLX_SWITCH_COPY_TYPES(in, out, InType, OutType, ...) \
-  MLX_SWITCH_ALL_TYPES(in.dtype(), CTYPE_IN, {               \
-    MLX_SWITCH_ALL_TYPES(out.dtype(), CTYPE_OUT, {           \
-      using InType = cuda_type_t<CTYPE_IN>;                  \
-      using OutType = cuda_type_t<CTYPE_OUT>;                \
-      __VA_ARGS__;                                           \
-    });                                                      \
-  })
-
 void copy_contiguous(
    cu::CommandEncoder& encoder,
    CopyType ctype,
--- a/mlx/backend/cuda/copy/copy_contiguous.cu
+++ b/mlx/backend/cuda/copy/copy_contiguous.cu
@@ -35,17 +35,22 @@ void copy_contiguous(
    array& out,
    int64_t in_offset,
    int64_t out_offset) {
-  encoder.launch_kernel([&](cudaStream_t stream) {
-    MLX_SWITCH_COPY_TYPES(in, out, InType, OutType, {
-      MLX_SWITCH_BOOL(out.data_size() > UINT32_MAX, LARGE, {
-        using IdxT = std::conditional_t<LARGE, int64_t, uint32_t>;
+  dispatch_all_types(in.dtype(), [&](auto in_type_tag) {
+    dispatch_all_types(out.dtype(), [&](auto out_type_tag) {
+      dispatch_bool(out.data_size() > UINT32_MAX, [&](auto large) {
+        using InType = cuda_type_t<MLX_GET_TYPE(in_type_tag)>;
+        using OutType = cuda_type_t<MLX_GET_TYPE(out_type_tag)>;
+        using IdxT = std::conditional_t<large(), int64_t, uint32_t>;
        auto kernel = cu::copy_s<InType, OutType, IdxT>;
        if (ctype == CopyType::Vector) {
          kernel = cu::copy_v<InType, OutType, IdxT>;
        }
        auto [num_blocks, block_dims] = get_launch_args(
-            kernel, out.data_size(), out.shape(), out.strides(), LARGE);
-        kernel<<<num_blocks, block_dims, 0, stream>>>(
+            kernel, out.data_size(), out.shape(), out.strides(), large());
+        encoder.add_kernel_node(
+            kernel,
+            num_blocks,
+            block_dims,
            in.data<InType>() + in_offset,
            out.data<OutType>() + out_offset,
            out.data_size());
--- a/mlx/backend/cuda/copy/copy_general.cu
+++ b/mlx/backend/cuda/copy/copy_general.cu
@@ -55,39 +55,54 @@ void copy_general(
    const Shape& shape,
    const Strides& strides_in,
    const Strides& strides_out) {
-  encoder.launch_kernel([&](cudaStream_t stream) {
-    MLX_SWITCH_COPY_TYPES(in, out, InType, OutType, {
-      const InType* in_ptr = in.data<InType>() + offset_in;
-      OutType* out_ptr = out.data<OutType>() + offset_out;
-      bool large = in.data_size() > INT32_MAX || out.data_size() > INT32_MAX;
-      MLX_SWITCH_BOOL(large, LARGE, {
-        using IdxT = std::conditional_t<LARGE, int64_t, int32_t>;
-        int ndim = shape.size();
-        if (ndim <= 3) {
-          MLX_SWITCH_1_2_3(ndim, NDIM, {
-            auto kernel = cu::copy_gg_nd<InType, OutType, IdxT, NDIM>;
-            auto [num_blocks, block_dims] = get_launch_args(kernel, out, large);
-            kernel<<<num_blocks, block_dims, 0, stream>>>(
-                in_ptr,
-                out_ptr,
-                out.size(),
-                const_param<NDIM>(shape),
-                const_param<NDIM>(strides_in),
-                const_param<NDIM>(strides_out));
+  dispatch_all_types(in.dtype(), [&](auto in_type_tag) {
+    dispatch_all_types(out.dtype(), [&](auto out_type_tag) {
+      dispatch_bool(
+          in.data_size() > INT32_MAX || out.data_size() > INT32_MAX,
+          [&](auto large) {
+            using InType = cuda_type_t<MLX_GET_TYPE(in_type_tag)>;
+            using OutType = cuda_type_t<MLX_GET_TYPE(out_type_tag)>;
+            using IdxT = std::conditional_t<large(), int64_t, int32_t>;
+            const InType* in_ptr = in.data<InType>() + offset_in;
+            OutType* out_ptr = out.data<OutType>() + offset_out;
+            int ndim = shape.size();
+            size_t data_size = 1;
+            for (auto& s : shape)
+              data_size *= s;
+            if (ndim <= 3) {
+              dispatch_1_2_3(ndim, [&](auto ndim_constant) {
+                auto kernel =
+                    cu::copy_gg_nd<InType, OutType, IdxT, ndim_constant()>;
+                auto [num_blocks, block_dims] = get_launch_args(
+                    kernel, data_size, shape, out.strides(), large());
+                encoder.add_kernel_node(
+                    kernel,
+                    num_blocks,
+                    block_dims,
+                    in_ptr,
+                    out_ptr,
+                    data_size,
+                    const_param<ndim_constant()>(shape),
+                    const_param<ndim_constant()>(strides_in),
+                    const_param<ndim_constant()>(strides_out));
+              });
+            } else { // ndim >= 4
+              auto kernel = cu::copy_gg<InType, OutType, IdxT>;
+              auto [num_blocks, block_dims] = get_launch_args(
+                  kernel, data_size, shape, out.strides(), large());
+              encoder.add_kernel_node(
+                  kernel,
+                  num_blocks,
+                  block_dims,
+                  in_ptr,
+                  out_ptr,
+                  data_size,
+                  const_param(shape),
+                  const_param(strides_in),
+                  const_param(strides_out),
+                  ndim);
+            }
          });
-        } else { // ndim >= 4
-          auto kernel = cu::copy_gg<InType, OutType, IdxT>;
-          auto [num_blocks, block_dims] = get_launch_args(kernel, out, large);
-          kernel<<<num_blocks, block_dims, 0, stream>>>(
-              in_ptr,
-              out_ptr,
-              out.size(),
-              const_param(shape),
-              const_param(strides_in),
-              const_param(strides_out),
-              ndim);
-        }
-      });
    });
  });
 }
--- a/mlx/backend/cuda/copy/copy_general_dynamic.cu
+++ b/mlx/backend/cuda/copy/copy_general_dynamic.cu
@@ -61,43 +61,55 @@ void copy_general_dynamic(
    const Strides& strides_out,
    const array& dynamic_offset_in,
    const array& dynamic_offset_out) {
-  encoder.launch_kernel([&](cudaStream_t stream) {
-    MLX_SWITCH_COPY_TYPES(in, out, InType, OutType, {
-      const InType* in_ptr = in.data<InType>() + offset_in;
-      OutType* out_ptr = out.data<OutType>() + offset_out;
-      bool large = in.data_size() > INT32_MAX || out.data_size() > INT32_MAX;
-      MLX_SWITCH_BOOL(large, LARGE, {
-        using IdxT = std::conditional_t<LARGE, int64_t, int32_t>;
-        int ndim = shape.size();
-        if (ndim <= 3) {
-          MLX_SWITCH_1_2_3(ndim, NDIM, {
-            auto kernel = cu::copy_gg_dynamic_nd<InType, OutType, IdxT, NDIM>;
-            auto [num_blocks, block_dims] = get_launch_args(kernel, out, large);
-            kernel<<<num_blocks, block_dims, 0, stream>>>(
-                in_ptr,
-                out_ptr,
-                out.size(),
-                const_param<NDIM>(shape),
-                const_param<NDIM>(strides_in),
-                const_param<NDIM>(strides_out),
-                dynamic_offset_in.data<int64_t>(),
-                dynamic_offset_out.data<int64_t>());
+  dispatch_all_types(in.dtype(), [&](auto in_type_tag) {
+    dispatch_all_types(out.dtype(), [&](auto out_type_tag) {
+      dispatch_bool(
+          in.data_size() > INT32_MAX || out.data_size() > INT32_MAX,
+          [&](auto large) {
+            using InType = cuda_type_t<MLX_GET_TYPE(in_type_tag)>;
+            using OutType = cuda_type_t<MLX_GET_TYPE(out_type_tag)>;
+            using IdxT = std::conditional_t<large(), int64_t, int32_t>;
+            const InType* in_ptr = in.data<InType>() + offset_in;
+            OutType* out_ptr = out.data<OutType>() + offset_out;
+            int ndim = shape.size();
+            if (ndim <= 3) {
+              dispatch_1_2_3(ndim, [&](auto dims_constant) {
+                auto kernel = cu::
+                    copy_gg_dynamic_nd<InType, OutType, IdxT, dims_constant()>;
+                auto [num_blocks, block_dims] =
+                    get_launch_args(kernel, out, large());
+                encoder.add_kernel_node(
+                    kernel,
+                    num_blocks,
+                    block_dims,
+                    in_ptr,
+                    out_ptr,
+                    out.size(),
+                    const_param<dims_constant()>(shape),
+                    const_param<dims_constant()>(strides_in),
+                    const_param<dims_constant()>(strides_out),
+                    dynamic_offset_in.data<int64_t>(),
+                    dynamic_offset_out.data<int64_t>());
+              });
+            } else { // ndim >= 4
+              auto kernel = cu::copy_gg_dynamic<InType, OutType, IdxT>;
+              auto [num_blocks, block_dims] =
+                  get_launch_args(kernel, out, large());
+              encoder.add_kernel_node(
+                  kernel,
+                  num_blocks,
+                  block_dims,
+                  in_ptr,
+                  out_ptr,
+                  out.size(),
+                  const_param(shape),
+                  const_param(strides_in),
+                  const_param(strides_out),
+                  ndim,
+                  dynamic_offset_in.data<int64_t>(),
+                  dynamic_offset_out.data<int64_t>());
+            }
          });
-        } else { // ndim >= 4
-          auto kernel = cu::copy_gg_dynamic<InType, OutType, IdxT>;
-          auto [num_blocks, block_dims] = get_launch_args(kernel, out, large);
-          kernel<<<num_blocks, block_dims, 0, stream>>>(
-              in_ptr,
-              out_ptr,
-              out.size(),
-              const_param(shape),
-              const_param(strides_in),
-              const_param(strides_out),
-              ndim,
-              dynamic_offset_in.data<int64_t>(),
-              dynamic_offset_out.data<int64_t>());
-        }
-      });
    });
  });
 }
--- a/mlx/backend/cuda/copy/copy_general_input.cu
+++ b/mlx/backend/cuda/copy/copy_general_input.cu
@@ -50,37 +50,49 @@ void copy_general_input(
    int64_t offset_out,
    const Shape& shape,
    const Strides& strides_in) {
-  encoder.launch_kernel([&](cudaStream_t stream) {
-    MLX_SWITCH_COPY_TYPES(in, out, InType, OutType, {
-      const InType* in_ptr = in.data<InType>() + offset_in;
-      OutType* out_ptr = out.data<OutType>() + offset_out;
-      bool large = in.data_size() > INT32_MAX || out.data_size() > INT32_MAX;
-      MLX_SWITCH_BOOL(large, LARGE, {
-        using IdxT = std::conditional_t<LARGE, int64_t, int32_t>;
-        int ndim = shape.size();
-        if (ndim <= 3) {
-          MLX_SWITCH_1_2_3(ndim, NDIM, {
-            auto kernel = cu::copy_g_nd<InType, OutType, IdxT, NDIM>;
-            auto [num_blocks, block_dims] = get_launch_args(kernel, out, large);
-            kernel<<<num_blocks, block_dims, 0, stream>>>(
-                in_ptr,
-                out_ptr,
-                out.size(),
-                const_param<NDIM>(shape),
-                const_param<NDIM>(strides_in));
+  dispatch_all_types(in.dtype(), [&](auto in_type_tag) {
+    dispatch_all_types(out.dtype(), [&](auto out_type_tag) {
+      dispatch_bool(
+          in.data_size() > INT32_MAX || out.data_size() > INT32_MAX,
+          [&](auto large) {
+            using InType = cuda_type_t<MLX_GET_TYPE(in_type_tag)>;
+            using OutType = cuda_type_t<MLX_GET_TYPE(out_type_tag)>;
+            using IdxT = std::conditional_t<large(), int64_t, int32_t>;
+            const InType* in_ptr = in.data<InType>() + offset_in;
+            OutType* out_ptr = out.data<OutType>() + offset_out;
+            int ndim = shape.size();
+            if (ndim <= 3) {
+              dispatch_1_2_3(ndim, [&](auto dims_constant) {
+                auto kernel =
+                    cu::copy_g_nd<InType, OutType, IdxT, dims_constant()>;
+                auto [num_blocks, block_dims] =
+                    get_launch_args(kernel, out, large());
+                encoder.add_kernel_node(
+                    kernel,
+                    num_blocks,
+                    block_dims,
+                    in_ptr,
+                    out_ptr,
+                    out.size(),
+                    const_param<dims_constant()>(shape),
+                    const_param<dims_constant()>(strides_in));
+              });
+            } else { // ndim >= 4
+              auto kernel = cu::copy_g<InType, OutType, IdxT>;
+              auto [num_blocks, block_dims] =
+                  get_launch_args(kernel, out, large());
+              encoder.add_kernel_node(
+                  kernel,
+                  num_blocks,
+                  block_dims,
+                  in_ptr,
+                  out_ptr,
+                  out.size(),
+                  const_param(shape),
+                  const_param(strides_in),
+                  ndim);
+            }
          });
-        } else { // ndim >= 4
-          auto kernel = cu::copy_g<InType, OutType, IdxT>;
-          auto [num_blocks, block_dims] = get_launch_args(kernel, out, large);
-          kernel<<<num_blocks, block_dims, 0, stream>>>(
-              in_ptr,
-              out_ptr,
-              out.size(),
-              const_param(shape),
-              const_param(strides_in),
-              ndim);
-        }
-      });
    });
  });
 }
--- a/mlx/backend/cuda/device.cpp
+++ b/mlx/backend/cuda/device.cpp
@@ -2,37 +2,28 @@

 #include "mlx/backend/cuda/device.h"
 #include "mlx/backend/cuda/worker.h"
-#include "mlx/backend/metal/metal.h"
+#include "mlx/utils.h"

 #include <fmt/format.h>
 #include <nvtx3/nvtx3.hpp>
+#include <future>
+#include <unordered_set>

 namespace mlx::core {

+// Default node-count threshold handed to env::max_ops_per_buffer() in
+// CommandEncoder::maybe_commit(), which commits once node_count_ reaches the
+// returned value.
+// Can be tuned with MLX_MAX_OPS_PER_BUFFER
+// This should be less than 255
+constexpr int default_max_nodes_per_graph = 20;
+
+// Returns the CUDA graph cache size, read via
+// env::get_var("MLX_CUDA_GRAPH_CACHE_SIZE", 100). The lookup happens on the
+// first call only; later calls return the stored value.
+int cuda_graph_cache_size() {
+  static const int size = env::get_var("MLX_CUDA_GRAPH_CACHE_SIZE", 100);
+  return size;
+}
+
 namespace cu {

-DeviceStream::DeviceStream(Device& device) : device_(device), stream_(device) {}
-
-void DeviceStream::synchronize() {
-  cudaStreamSynchronize(stream_);
-}
-
-cudaStream_t DeviceStream::schedule_cuda_stream() {
-  // TODO: Return a stream that maximizes parallelism.
-  return stream_;
-}
-
-cudaStream_t DeviceStream::last_cuda_stream() {
-  return stream_;
-}
-
-CommandEncoder& DeviceStream::get_encoder() {
-  if (!encoder_) {
-    encoder_ = std::make_unique<CommandEncoder>(*this);
-  }
-  return *encoder_;
-}
-
 Device::Device(int device) : device_(device) {
  CHECK_CUDA_ERROR(cudaDeviceGetAttribute(
      &compute_capability_major_, cudaDevAttrComputeCapabilityMajor, device_));
@@ -66,45 +57,260 @@ void Device::make_current() {
  }
 }

-DeviceStream& Device::get_stream(Stream s) {
-  auto it = streams_.find(s.index);
-  if (it == streams_.end()) {
-    it = streams_.try_emplace(s.index, *this).first;
+// Starts capturing work submitted to the encoder's stream (global capture
+// mode). No graph is created here: cudaStreamEndCapture in the destructor
+// returns the captured graph through `graph`, so a handle created up front
+// would be overwritten without ever being destroyed.
+CommandEncoder::CaptureContext::CaptureContext(CommandEncoder& enc) : enc(enc) {
+  CHECK_CUDA_ERROR(
+      cudaStreamBeginCapture(enc.stream(), cudaStreamCaptureModeGlobal));
+}
+
+// Ends the stream capture begun by the constructor and adds the captured work
+// to the encoder's graph (enc.graph_) as a single node.
+CommandEncoder::CaptureContext::~CaptureContext() {
+  CHECK_CUDA_ERROR(cudaStreamEndCapture(enc.stream(), &graph));
+  // With a NULL node array only the node count is queried.
+  size_t num_nodes;
+  CHECK_CUDA_ERROR(cudaGraphGetNodes(graph, NULL, &num_nodes));
+  if (num_nodes == 1) {
+    // Exactly one captured node: read its kernel parameters and add an
+    // equivalent kernel node directly to enc.graph_, registered as type 'K'.
+    // NOTE(review): this path presumes the lone node is a kernel node — confirm.
+    cudaGraphNode_t captured_node;
+    CHECK_CUDA_ERROR(cudaGraphGetNodes(graph, &captured_node, &num_nodes));
+    CUDA_KERNEL_NODE_PARAMS params;
+    CHECK_CUDA_ERROR(cuGraphKernelNodeGetParams(captured_node, &params));
+    cudaGraphNode_t node;
+    CHECK_CUDA_ERROR(cuGraphAddKernelNode(&node, enc.graph_, NULL, 0, &params));
+    enc.insert_graph_dependencies(GraphNode{node, 'K'});
+  } else {
+    // Any other node count: embed the whole captured graph as a child graph
+    // node, registered as type 'G'.
+    cudaGraphNode_t node;
+    CHECK_CUDA_ERROR(
+        cudaGraphAddChildGraphNode(&node, enc.graph_, NULL, 0, graph));
+    enc.insert_graph_dependencies(GraphNode{node, 'G'});
+  }
+  // The captured graph handle is released once its contents have been added.
+  CHECK_CUDA_ERROR(cudaGraphDestroy(graph));
+}
+
+// Marks the encoder as being inside a concurrent section. While the flag is
+// set, insert_graph_dependencies(GraphNode) collects nodes in
+// concurrent_nodes_ instead of wiring their dependencies immediately.
+CommandEncoder::ConcurrentContext::ConcurrentContext(CommandEncoder& enc)
+    : enc(enc) {
+  enc.in_concurrent_ = true;
+}
+
+// Closes a concurrent section: clears the encoder's in_concurrent_ flag, adds
+// an empty node that every node collected in enc.concurrent_nodes_ points to,
+// wires the pending input dependencies to those collected nodes, and finally
+// records the empty node as the node for each pending output.
+CommandEncoder::ConcurrentContext::~ConcurrentContext() {
+  enc.in_concurrent_ = false;
+
+  // Use an empty graph node for synchronization
+  CommandEncoder::GraphNode empty{NULL, 'E', std::to_string(enc.node_count_++)};
+  enc.empty_node_count_++;
+  CHECK_CUDA_ERROR(cudaGraphAddEmptyNode(&empty.node, enc.graph_, NULL, 0));
+
+  // Insert the concurrent -> empty node dependencies
+  for (auto& from : enc.concurrent_nodes_) {
+    enc.from_nodes_.push_back(from.node);
+    enc.to_nodes_.push_back(empty.node);
+    // Each edge is also appended to the graph key as (id, type) pairs.
+    enc.graph_key_ += from.id;
+    enc.graph_key_ += from.node_type;
+    enc.graph_key_ += empty.id;
+    enc.graph_key_ += empty.node_type;
+  }
+
+  // Insert the input -> concurrent node dependencies without updating output
+  // nodes
+  // The pending outputs are moved out first so the call below does not map
+  // them to the concurrent nodes.
+  auto outputs = std::move(enc.active_outputs_);
+  enc.insert_graph_dependencies(std::move(enc.concurrent_nodes_));
+
+  // Update output node to be the empty node
+  for (auto o : outputs) {
+    // emplace followed by assignment overwrites any entry already stored for o.
+    enc.node_map_.emplace(o, empty).first->second = empty;
+  }
+}
+
+// Registers a single new graph node. The node receives the next sequential id
+// and child-graph nodes ('G') are counted. Inside a concurrent section the
+// node is only collected; otherwise its dependencies are wired right away
+// through the vector overload.
+void CommandEncoder::insert_graph_dependencies(GraphNode node) {
+  if (node.node_type == 'G') {
+    ++graph_node_count_;
+  }
+  node.id = std::to_string(node_count_++);
+  if (!in_concurrent_) {
+    std::vector<GraphNode> single;
+    single.push_back(std::move(node));
+    insert_graph_dependencies(std::move(single));
+    return;
+  }
+  concurrent_nodes_.push_back(std::move(node));
+}
+
+// Wires the pending dependencies to `nodes` and records `nodes` as the
+// producers of the pending outputs. Both pending lists are cleared, and every
+// edge added is appended to from_nodes_ / to_nodes_ and to the graph key.
+void CommandEncoder::insert_graph_dependencies(std::vector<GraphNode> nodes) {
+  // Resolve pending dependency ids to known nodes, dropping duplicates while
+  // keeping first-seen order: dependencies must be added in the same order to
+  // produce a consistent topology.
+  std::vector<GraphNode> sources;
+  std::unordered_set<cudaGraphNode_t> seen;
+  for (auto dep_id : active_deps_) {
+    auto found = node_map_.find(dep_id);
+    if (found == node_map_.end()) {
+      continue;
+    }
+    if (seen.insert(found->second.node).second) {
+      sources.push_back(found->second);
+    }
+  }
+  active_deps_.clear();
+
+  // Map each pending output to the new nodes, overwriting any earlier entry
+  // (with several nodes, the last one is what remains stored).
+  for (auto out_id : active_outputs_) {
+    for (auto& produced : nodes) {
+      node_map_.emplace(out_id, produced).first->second = produced;
+    }
+  }
+  active_outputs_.clear();
+
+  // Add an edge from every resolved dependency to every new node.
+  for (auto& src : sources) {
+    for (auto& dst : nodes) {
+      from_nodes_.push_back(src.node);
+      to_nodes_.push_back(dst.node);
+      graph_key_ += src.id;
+      graph_key_ += src.node_type;
+      graph_key_ += dst.id;
+      graph_key_ += dst.node_type;
+    }
+  }
+}
+
+// Returns the command encoder stored for stream `s`, keyed by s.index. An
+// encoder is constructed from this device on first use for that index.
+CommandEncoder& Device::get_command_encoder(Stream s) {
+  auto it = encoders_.find(s.index);
+  if (it == encoders_.end()) {
+    it = encoders_.try_emplace(s.index, *this).first;
   }
   return it->second;
 }

-CommandEncoder::CommandEncoder(DeviceStream& s)
-    : device_(s.device()), stream_(s) {}
+// Builds an encoder whose stream is constructed from device `d` and creates
+// the empty graph (graph_) that later nodes are added to.
+CommandEncoder::CommandEncoder(Device& d) : stream_(d) {
+  CHECK_CUDA_ERROR(cudaGraphCreate(&graph_, 0));
+}
+
+// Destroys every executable graph held in `graphs` and then empties the map.
+void clear_graphs(std::unordered_map<std::string, cudaGraphExec_t>& graphs) {
+  for (auto& entry : graphs) {
+    CHECK_CUDA_ERROR(cudaGraphExecDestroy(entry.second));
+  }
+  graphs.clear();
+}
+
+// Destroys all executable graphs held in this encoder's graph cache.
+CommandEncoder::~CommandEncoder() {
+  clear_graphs(graph_cache_);
+}

+// Hands `task` over to the encoder's worker via worker_.add_task().
 void CommandEncoder::add_completed_handler(std::function<void()> task) {
   worker_.add_task(std::move(task));
 }

-void CommandEncoder::end_encoding() {
-  if (!temporaries_.empty()) {
-    add_completed_handler([temporaries = std::move(temporaries_)]() {});
-  }
+// Register |arr| as an input of the next node. The buffer address serves as
+// the id used to look up the node that produced it.
+void CommandEncoder::set_input_array(const array& arr) {
+  active_deps_.push_back(
+      reinterpret_cast<std::uintptr_t>(arr.buffer().ptr()));
+}

-  // There is no kernel running, run completion handlers immediately.
-  if (!has_gpu_work_) {
-    worker_.consume_in_this_thread();
-    return;
-  }
-  has_gpu_work_ = false;
+// Register |arr| as an output of the next node. An output is also recorded as
+// a dependency, so the next node is ordered after the previous node recorded
+// for the same buffer.
+void CommandEncoder::set_output_array(const array& arr) {
+  const auto buffer_id = reinterpret_cast<std::uintptr_t>(arr.buffer().ptr());
+  active_outputs_.push_back(buffer_id);
+  active_deps_.push_back(buffer_id);
+}

-  // Put completion handlers in a batch.
-  worker_.end_batch();
-
-  // Signaling kernel completion is expensive, delay until enough batches.
-  // TODO: This number is arbitrarily picked, profile for a better stragety.
-  if (worker_.uncommited_batches() > 8) {
+// Commit the pending graph once it holds at least
+// env::max_ops_per_buffer(default_max_nodes_per_graph) nodes; otherwise keep
+// accumulating.
+void CommandEncoder::maybe_commit() {
+  if (node_count_ >= env::max_ops_per_buffer(default_max_nodes_per_graph)) {
    commit();
  }
 }

+// Add a kernel node for the runtime-API function |func| to the pending graph
+// and hook it up to the nodes it depends on. |params| is the array of argument
+// pointers passed on as kernelParams.
+void CommandEncoder::add_kernel_node(
+    void* func,
+    dim3 grid_dim,
+    dim3 block_dim,
+    void** params) {
+  // Fields not set below stay zero.
+  cudaKernelNodeParams node_params{};
+  node_params.func = func;
+  node_params.gridDim = grid_dim;
+  node_params.blockDim = block_dim;
+  node_params.kernelParams = params;
+
+  // The node is created without edges; they are recorded separately.
+  cudaGraphNode_t kernel_node;
+  CHECK_CUDA_ERROR(cudaGraphAddKernelNode(
+      &kernel_node, graph_, nullptr, 0, &node_params));
+  insert_graph_dependencies(GraphNode{kernel_node, 'K'});
+}
+
+// Driver-API flavour of add_kernel_node(): |func| is a CUfunction, so the node
+// is described with CUDA_KERNEL_NODE_PARAMS and the launch dimensions are
+// spelled out per axis.
+void CommandEncoder::add_kernel_node(
+    CUfunction func,
+    dim3 grid_dim,
+    dim3 block_dim,
+    void** params) {
+  // Fields not set below stay zero.
+  CUDA_KERNEL_NODE_PARAMS node_params{};
+  node_params.func = func;
+  node_params.kernelParams = params;
+  node_params.gridDimX = grid_dim.x;
+  node_params.gridDimY = grid_dim.y;
+  node_params.gridDimZ = grid_dim.z;
+  node_params.blockDimX = block_dim.x;
+  node_params.blockDimY = block_dim.y;
+  node_params.blockDimZ = block_dim.z;
+
+  // The node is created without edges; they are recorded separately.
+  CUgraphNode kernel_node;
+  CHECK_CUDA_ERROR(
+      cuGraphAddKernelNode(&kernel_node, graph_, nullptr, 0, &node_params));
+  insert_graph_dependencies(GraphNode{kernel_node, 'K'});
+}
+
 void CommandEncoder::commit() {
-  worker_.commit(stream_.last_cuda_stream());
+  // Submit everything recorded so far: add the recorded edges to the graph,
+  // launch it on the stream through a cached executable graph, start a fresh
+  // graph, and hand the pending completion handlers to the worker.
+
+  // Move the temporaries into a completion handler so they are released only
+  // when that handler runs.
+  if (!temporaries_.empty()) {
+    add_completed_handler([temporaries = std::move(temporaries_)]() {});
+  }
+  if (node_count_ > 0) {
+    if (!from_nodes_.empty()) {
+      CHECK_CUDA_ERROR(cudaGraphAddDependencies(
+          graph_, from_nodes_.data(), to_nodes_.data(), from_nodes_.size()));
+    }
+
+    // The cache key is the edge description accumulated while encoding plus
+    // the node counts of this graph.
+    graph_key_ += ".";
+    graph_key_ += std::to_string(node_count_);
+    graph_key_ += ".";
+    graph_key_ += std::to_string(graph_node_count_);
+    graph_key_ += ".";
+    graph_key_ += std::to_string(empty_node_count_);
+    auto [it, _] = graph_cache_.emplace(graph_key_, nullptr);
+    auto& graph_exec = it->second;
+
+    if (graph_exec != NULL) {
+      // Try to reuse the cached executable by updating it in place. Only read
+      // |update_result| when the call itself succeeded: on other failures it
+      // may not have been filled in.
+      cudaGraphExecUpdateResultInfo update_result;
+      auto status = cudaGraphExecUpdate(graph_exec, graph_, &update_result);
+      if (status != cudaSuccess ||
+          update_result.result != cudaGraphExecUpdateSuccess) {
+        // Clear the error left behind by the failed update, then fall back to
+        // a fresh instantiation below.
+        cudaGetLastError();
+        CHECK_CUDA_ERROR(cudaGraphExecDestroy(graph_exec));
+        graph_exec = NULL;
+      }
+    }
+    if (graph_exec == NULL) {
+      CHECK_CUDA_ERROR(
+          cudaGraphInstantiate(&graph_exec, graph_, NULL, NULL, 0));
+    }
+    CHECK_CUDA_ERROR(cudaGraphLaunch(graph_exec, stream_));
+
+    // TODO smarter cache policy
+    if (graph_cache_.size() > cuda_graph_cache_size()) {
+      clear_graphs(graph_cache_);
+    }
+
+    // Reset state. Every counter that feeds the cache key must start from
+    // zero for the next graph, otherwise identical graphs get different keys.
+    node_count_ = 0;
+    graph_node_count_ = 0;
+    empty_node_count_ = 0;
+    from_nodes_.clear();
+    to_nodes_.clear();
+    graph_key_.clear();
+    node_map_.clear();
+    CHECK_CUDA_ERROR(cudaGraphDestroy(graph_));
+    CHECK_CUDA_ERROR(cudaGraphCreate(&graph_, 0));
+  }
+
+  // Put completion handlers in a batch.
+  worker_.end_batch();
+  worker_.commit(stream_);
+}
+
+// Block until the stream has finished and every completion handler queued so
+// far has run. Throws if the stream reports a CUDA error.
+void CommandEncoder::synchronize() {
+  // Surface failures instead of dropping them: an unchecked error here would
+  // let callers believe the stream completed successfully.
+  CHECK_CUDA_ERROR(cudaStreamSynchronize(stream_));
+  // Queue a handler that fulfils a promise, commit, then wait on the future:
+  // once it is ready the handlers queued before it have been processed.
+  auto p = std::make_shared<std::promise<void>>();
+  std::future<void> f = p->get_future();
+  add_completed_handler([p = std::move(p)]() { p->set_value(); });
+  worker_.end_batch();
+  commit();
+  f.wait();
 }

 Device& device(mlx::core::Device device) {
@@ -116,12 +322,8 @@ Device& device(mlx::core::Device device) {
  return it->second;
 }

-DeviceStream& get_stream(Stream s) {
-  return device(s.device).get_stream(s);
-}
-
 CommandEncoder& get_command_encoder(Stream s) {
-  return get_stream(s).get_encoder();
+  // Resolve the device that owns |s| and return its encoder for that stream.
+  return device(s.device).get_command_encoder(s);
 }

 } // namespace cu
--- a/mlx/backend/cuda/device.h
+++ b/mlx/backend/cuda/device.h
@@ -7,41 +7,108 @@
 #include "mlx/stream.h"

 #include <cublasLt.h>
+#include <cuda.h>
 #include <thrust/execution_policy.h>

 #include <unordered_map>

 namespace mlx::core::cu {

-class Device;
-class CommandEncoder;
-
-class DeviceStream {
+// Records the GPU work of one stream as nodes of a CUDA graph and submits the
+// graph in batches (see maybe_commit() and commit()).
+class CommandEncoder {
 public:
-  explicit DeviceStream(Device& device);
+  // Scope object bound to an encoder that carries its own cudaGraph_t.
+  // NOTE(review): presumably captures stream work into |graph| for the
+  // lifetime of the object -- confirm against the definition.
+  struct CaptureContext {
+    CaptureContext(CommandEncoder& enc);
+    ~CaptureContext();
+    cudaGraph_t graph;
+    CommandEncoder& enc;
+  };
+  // Scope object bound to an encoder.
+  // NOTE(review): presumably marks the nodes added during its lifetime as
+  // concurrent -- confirm against the definition.
+  struct ConcurrentContext {
+    ConcurrentContext(CommandEncoder& enc);
+    ~ConcurrentContext();
+    CommandEncoder& enc;
+  };

-  DeviceStream(const DeviceStream&) = delete;
-  DeviceStream& operator=(const DeviceStream&) = delete;
+  explicit CommandEncoder(Device& d);
+  ~CommandEncoder();

-  // Wait until kernels in the stream complete.
-  void synchronize();
+  CommandEncoder(const CommandEncoder&) = delete;
+  CommandEncoder& operator=(const CommandEncoder&) = delete;

-  // Return a cuda stream for launching kernels.
-  cudaStream_t schedule_cuda_stream();
-
-  // Return the last cuda stream used.
-  cudaStream_t last_cuda_stream();
-
-  CommandEncoder& get_encoder();
-
-  Device& device() {
-    return device_;
+  CaptureContext capture_context() {
+    return CaptureContext{*this};
+  }
+  ConcurrentContext concurrent_context() {
+    return ConcurrentContext{*this};
  }

+  // Register the buffers read / written by the next node so that it can be
+  // ordered after the nodes that produced them.
+  void set_input_array(const array& arr);
+  void set_output_array(const array& arr);
+
+  // Convenience overload: collects the address of every argument into a
+  // pointer array and forwards to the void** overload below.
+  template <typename F, typename... Params>
+  void
+  add_kernel_node(F* func, dim3 grid_dim, dim3 block_dim, Params&&... params) {
+    constexpr size_t num = sizeof...(Params);
+    void* ptrs[num];
+    size_t i = 0;
+    ([&](auto&& p) { ptrs[i++] = static_cast<void*>(&p); }(
+         std::forward<Params>(params)),
+     ...);
+    add_kernel_node((void*)func, grid_dim, block_dim, ptrs);
+  }
+
+  // Add a kernel node for a driver-API function.
+  void add_kernel_node(
+      CUfunction func,
+      dim3 grid_dim,
+      dim3 block_dim,
+      void** params);
+
+  // Add a kernel node for a runtime-API function.
+  void
+  add_kernel_node(void* func, dim3 grid_dim, dim3 block_dim, void** params);
+
+  // Keep the data of |arr| referenced until the next commit hands it over to
+  // a completion handler.
+  void add_temporary(const array& arr) {
+    temporaries_.push_back(arr.data_shared_ptr());
+  }
+
+  void add_completed_handler(std::function<void()> task);
+  // Commit only when enough nodes have accumulated.
+  void maybe_commit();
+  // Launch the recorded graph and start a new one.
+  void commit();
+
+  CudaStream& stream() {
+    return stream_;
+  }
+
+  // Wait until kernels and completion handlers are finished
+  void synchronize();
+
 private:
-  Device& device_;
+  struct GraphNode {
+    cudaGraphNode_t node;
+    // K = kernel
+    // E = empty
+    // G = subgraph
+    char node_type;
+    std::string id;
+  };
+
+  void insert_graph_dependencies(GraphNode node);
+  void insert_graph_dependencies(std::vector<GraphNode> nodes);
+
  CudaStream stream_;
-  std::unique_ptr<CommandEncoder> encoder_;
+  cudaGraph_t graph_;
+  Worker worker_;
+  // Node counters for the graph being built. They are ints rather than chars:
+  // node_count_ is compared against env::max_ops_per_buffer(), which can
+  // exceed the range of a char and would then never be reached.
+  int node_count_{0};
+  int graph_node_count_{0};
+  int empty_node_count_{0};
+  bool in_concurrent_{false};
+  // Pending edges, stored as parallel arrays (from_nodes_[i] -> to_nodes_[i]).
+  std::vector<cudaGraphNode_t> from_nodes_;
+  std::vector<cudaGraphNode_t> to_nodes_;
+  // Key used to look up an executable graph in graph_cache_.
+  std::string graph_key_;
+  std::vector<GraphNode> concurrent_nodes_;
+  std::vector<std::shared_ptr<array::Data>> temporaries_;
+  std::unordered_map<std::string, cudaGraphExec_t> graph_cache_;
+  // Buffer ids registered for the next node, and the node recorded for each
+  // buffer id in the current graph.
+  std::vector<std::uintptr_t> active_deps_;
+  std::vector<std::uintptr_t> active_outputs_;
+  std::unordered_map<std::uintptr_t, GraphNode> node_map_;
 };

 class Device {
@@ -55,7 +122,7 @@ class Device {
  // Make this device the current cuda device, required by some cuda calls.
  void make_current();

-  DeviceStream& get_stream(Stream s);
+  CommandEncoder& get_command_encoder(Stream s);

  int cuda_device() const {
    return device_;
@@ -75,64 +142,10 @@ class Device {
  int compute_capability_major_;
  int compute_capability_minor_;
  cublasLtHandle_t lt_;
-  std::unordered_map<int, DeviceStream> streams_;
-};
-
-class CommandEncoder {
- public:
-  explicit CommandEncoder(DeviceStream& stream);
-
-  CommandEncoder(const CommandEncoder&) = delete;
-  CommandEncoder& operator=(const CommandEncoder&) = delete;
-
-  void set_input_array(const array& arr) {}
-  void set_output_array(const array& arr) {}
-
-  void add_temporary(const array& arr) {
-    temporaries_.push_back(arr.data_shared_ptr());
-  }
-
-  void add_completed_handler(std::function<void()> task);
-  void end_encoding();
-  void commit();
-
-  // Schedule a cuda stream for |fun| to launch kernels, and check error
-  // afterwards.
-  template <typename F>
-  void launch_kernel(F&& fun) {
-    launch_kernel(stream_.schedule_cuda_stream(), std::forward<F>(fun));
-  }
-
-  template <typename F>
-  void launch_kernel(cudaStream_t stream, F&& fun) {
-    device_.make_current();
-    fun(stream);
-    check_cuda_error("kernel launch", cudaGetLastError());
-    has_gpu_work_ = true;
-  }
-
-  Device& device() {
-    return device_;
-  }
-
-  DeviceStream& stream() {
-    return stream_;
-  }
-
-  bool has_gpu_work() const {
-    return has_gpu_work_;
-  }
-
- private:
-  Device& device_;
-  DeviceStream& stream_;
-  Worker worker_;
-  bool has_gpu_work_{false};
-  std::vector<std::shared_ptr<array::Data>> temporaries_;
+  std::unordered_map<int, CommandEncoder> encoders_;
 };

 Device& device(mlx::core::Device device);
-DeviceStream& get_stream(Stream s);
 CommandEncoder& get_command_encoder(Stream s);

 // Return an execution policy that does not sync for result.
--- a/mlx/backend/cuda/device/binary_ops.cuh
+++ b/mlx/backend/cuda/device/binary_ops.cuh
@@ -22,7 +22,7 @@ struct FloorDivide {
    if constexpr (cuda::std::is_integral_v<T>) {
      return x / y;
    } else {
-      return trunc(x / y);
+      return truncf(x / y);
    }
  }
 };
@@ -132,7 +132,7 @@ struct LogAddExp {
          cuda::std::numeric_limits<float>::quiet_NaN(),
          cuda::std::numeric_limits<float>::quiet_NaN()};
    }
-    constexpr float inf = cuda::std::numeric_limits<float>::infinity();
+    float inf = cuda::std::numeric_limits<float>::infinity();
    auto maxval = x > y ? x : y;
    auto minval = x < y ? x : y;
    if (cuCrealf(minval) == -inf || cuCrealf(maxval) == inf)
--- a/mlx/backend/cuda/device/config.h
+++ b/mlx/backend/cuda/device/config.h
@@ -5,7 +5,7 @@
 #pragma once

 // The maximum dimensions of shape/strides passed as kernel parameters.
-#define MAX_NDIM 8
+#define MAX_NDIM 10

 // All existing NVIDIA hardware has a fixed 32 warp size. Though a built-in
 // warpSize variable exists, using it would prevent compile-time optimizations.
--- a/mlx/backend/cuda/device/unary_ops.cuh
+++ b/mlx/backend/cuda/device/unary_ops.cuh
@@ -27,6 +27,8 @@ struct ArcCos {
  __device__ T operator()(T x) {
    return acos(x);
  }
+
+  __device__ cuComplex operator()(cuComplex x);
 };

 struct ArcCosh {
@@ -41,6 +43,8 @@ struct ArcSin {
  __device__ T operator()(T x) {
    return asin(x);
  }
+
+  __device__ cuComplex operator()(cuComplex x);
 };

 struct ArcSinh {
@@ -55,6 +59,8 @@ struct ArcTan {
  __device__ T operator()(T x) {
    return atan(x);
  }
+
+  __device__ cuComplex operator()(cuComplex x);
 };

 struct ArcTanh {
@@ -261,13 +267,6 @@ struct Round {
  }
 };

-struct Rsqrt {
-  template <typename T>
-  __device__ T operator()(T x) {
-    return rsqrt(x);
-  }
-};
-
 struct Sigmoid {
  template <typename T>
  __device__ T operator()(T x) {
@@ -333,6 +332,29 @@ struct Sqrt {
  __device__ T operator()(T x) {
    return sqrt(x);
  }
+
+  // Complex square root. With r = cuCrealf(Abs{}(x)) (presumably |x| --
+  // confirm against Abs), the result is
+  //   sqrt((r + re) / 2) + i * copysign(sqrt((r - re) / 2), im)
+  // so the imaginary part takes the sign of the input's imaginary part.
+  __device__ cuComplex operator()(cuComplex x) {
+    auto xr = cuCrealf(x);
+    auto xi = cuCimagf(x);
+    // Exact zero maps to zero
+    if (xr == 0.0f && xi == 0.0f) {
+      return {0.0f, 0.0f};
+    }
+    auto r = cuCrealf(Abs{}(x));
+    auto a = sqrt((r + xr) / 2.0f);
+    auto b_abs = sqrt((r - xr) / 2.0f);
+    auto b = copysign(b_abs, xi);
+    return {a, b};
+  }
+};
+
+// Reciprocal square root. Its complex overload relies on Sqrt's complex
+// overload.
+struct Rsqrt {
+  template <typename T>
+  __device__ T operator()(T x) {
+    return rsqrt(x);
+  }
+  // Complex input: reciprocal of the complex square root
+  __device__ cuComplex operator()(cuComplex x) {
+    return 1.0f / Sqrt{}(x);
+  }
 };

 struct Tan {
@@ -365,4 +387,22 @@ struct Tanh {
  }
 };

+// Complex arccosine: acos(x) = -i * log(x + i * sqrt(1 - x^2)).
+// Defined inline because this header-level definition may be included by
+// several translation units. Float literals match the ArcSin/ArcTan overloads.
+inline __device__ cuComplex ArcCos::operator()(cuComplex x) {
+  auto i = cuComplex{0.0f, 1.0f};
+  auto y = Log{}(x + i * Sqrt{}(1.0f - x * x));
+  // -i * (a + bi) = b - ai
+  return {cuCimagf(y), -cuCrealf(y)};
+}
+
+// Complex arcsine: asin(x) = -i * log(i * x + sqrt(1 - x^2)).
+// Defined inline because this header-level definition may be included by
+// several translation units.
+inline __device__ cuComplex ArcSin::operator()(cuComplex x) {
+  auto i = cuComplex{0.0f, 1.0f};
+  auto y = Log{}(i * x + Sqrt{}(1.0f - x * x));
+  // -i * (a + bi) = b - ai
+  return {cuCimagf(y), -cuCrealf(y)};
+}
+
+// Complex arctangent: atan(x) = (1 / 2i) * log((1 + i * x) / (1 - i * x)).
+// Defined inline because this header-level definition may be included by
+// several translation units.
+inline __device__ cuComplex ArcTan::operator()(cuComplex x) {
+  auto i = cuComplex{0.0f, 1.0f};
+  auto ix = i * x;
+  return (1.0f / cuComplex{0.0f, 2.0f}) * Log{}((1.0f + ix) / (1.0f - ix));
+}
+
 } // namespace mlx::core::cu
--- a/mlx/backend/cuda/device/utils.cuh
+++ b/mlx/backend/cuda/device/utils.cuh
@@ -28,6 +28,27 @@ namespace mlx::core::cu {
 using Shape = cuda::std::array<int32_t, MAX_NDIM>;
 using Strides = cuda::std::array<int64_t, MAX_NDIM>;

+// Vectorized load/store.
+// N consecutive values of T, aligned to the size of the whole group
+// (sizeof(T) * N bytes).
+// NOTE(review): presumably sized so a group can be moved with a single wide
+// load/store -- confirm against the kernels that use it.
+template <typename T, int N>
+struct alignas(sizeof(T) * N) AlignedVector {
+  T val[N];
+};
+
+// Return the |offset|-th group of N elements of |ptr|, i.e. elements
+// [offset * N, offset * N + N). |ptr| is reinterpreted as an array of
+// AlignedVector<T, N>, so it has to satisfy that type's alignment.
+template <int N, typename T>
+inline __device__ AlignedVector<T, N> load_vector(
+    const T* ptr,
+    uint32_t offset) {
+  using Vec = AlignedVector<T, N>;
+  return reinterpret_cast<const Vec*>(ptr)[offset];
+}
+
+// Write |vec| as the |offset|-th group of N elements of |ptr|, i.e. elements
+// [offset * N, offset * N + N). |ptr| is reinterpreted as an array of
+// AlignedVector<T, N>, so it has to satisfy that type's alignment.
+template <int N, typename T>
+inline __device__ void
+store_vector(T* ptr, uint32_t offset, const AlignedVector<T, N>& vec) {
+  using Vec = AlignedVector<T, N>;
+  reinterpret_cast<Vec*>(ptr)[offset] = vec;
+}
+
 ///////////////////////////////////////////////////////////////////////////////
 // Type limits utils
 ///////////////////////////////////////////////////////////////////////////////
@@ -155,8 +176,8 @@ inline __host__ __device__ cuda::std::tuple<IdxT, IdxT> elem_to_loc_nd(
 #pragma unroll
  for (int i = NDIM - 1; i >= 0; --i) {
    int dim_idx = elem % shape[i];
-    a_loc += dim_idx * a_strides[i];
-    b_loc += dim_idx * b_strides[i];
+    a_loc += dim_idx * IdxT(a_strides[i]);
+    b_loc += dim_idx * IdxT(b_strides[i]);
    elem /= shape[i];
  }
  return cuda::std::make_tuple(a_loc, b_loc);
@@ -175,9 +196,9 @@ inline __host__ __device__ cuda::std::tuple<IdxT, IdxT, IdxT> elem_to_loc_nd(
 #pragma unroll
  for (int i = NDIM - 1; i >= 0; --i) {
    int dim_idx = elem % shape[i];
-    a_loc += dim_idx * a_strides[i];
-    b_loc += dim_idx * b_strides[i];
-    c_loc += dim_idx * c_strides[i];
+    a_loc += dim_idx * IdxT(a_strides[i]);
+    b_loc += dim_idx * IdxT(b_strides[i]);
+    c_loc += dim_idx * IdxT(c_strides[i]);
    elem /= shape[i];
  }
  return cuda::std::make_tuple(a_loc, b_loc, c_loc);
@@ -206,8 +227,8 @@ inline __host__ __device__ cuda::std::tuple<IdxT, IdxT> elem_to_loc_4d(
  IdxT b_loc = 0;
  for (int i = ndim - 1; i >= 0; --i) {
    int dim_idx = elem % shape[i];
-    a_loc += dim_idx * a_strides[i];
-    b_loc += dim_idx * b_strides[i];
+    a_loc += dim_idx * IdxT(a_strides[i]);
+    b_loc += dim_idx * IdxT(b_strides[i]);
    elem /= shape[i];
  }
  return cuda::std::make_tuple(a_loc, b_loc);
@@ -226,9 +247,9 @@ inline __host__ __device__ cuda::std::tuple<IdxT, IdxT, IdxT> elem_to_loc_4d(
  IdxT c_loc = 0;
  for (int i = ndim - 1; i >= 0; --i) {
    int dim_idx = elem % shape[i];
-    a_loc += dim_idx * a_strides[i];
-    b_loc += dim_idx * b_strides[i];
-    c_loc += dim_idx * c_strides[i];
+    a_loc += dim_idx * IdxT(a_strides[i]);
+    b_loc += dim_idx * IdxT(b_strides[i]);
+    c_loc += dim_idx * IdxT(c_strides[i]);
    elem /= shape[i];
  }
  return cuda::std::make_tuple(a_loc, b_loc, c_loc);
--- a/mlx/backend/cuda/eval.cpp
+++ b/mlx/backend/cuda/eval.cpp
@@ -37,22 +37,20 @@ void eval(array& arr) {
  }

  auto& encoder = cu::get_command_encoder(arr.primitive().stream());
-  if (encoder.has_gpu_work()) {
-    // Keep used buffers alive until kernel finishes running.
-    std::unordered_set<std::shared_ptr<array::Data>> buffers;
-    for (auto& in : arr.inputs()) {
-      buffers.insert(in.data_shared_ptr());
-    }
-    for (auto& s : arr.siblings()) {
-      buffers.insert(s.data_shared_ptr());
-    }
-    // Remove the output if it was donated to by an input.
-    if (auto it = buffers.find(arr.data_shared_ptr()); it != buffers.end()) {
-      buffers.erase(it);
-    }
-    encoder.add_completed_handler([buffers = std::move(buffers)]() {});
+  // Keep used buffers alive until kernel finishes running.
+  std::unordered_set<std::shared_ptr<array::Data>> buffers;
+  for (auto& in : arr.inputs()) {
+    buffers.insert(in.data_shared_ptr());
  }
-  encoder.end_encoding();
+  for (auto& s : arr.siblings()) {
+    buffers.insert(s.data_shared_ptr());
+  }
+  // Remove the output if it was donated to by an input.
+  if (auto it = buffers.find(arr.data_shared_ptr()); it != buffers.end()) {
+    buffers.erase(it);
+  }
+  encoder.add_completed_handler([buffers = std::move(buffers)]() {});
+  encoder.maybe_commit();
 }

 void finalize(Stream s) {
@@ -62,7 +60,7 @@ void finalize(Stream s) {

 void synchronize(Stream s) {
  nvtx3::scoped_range r("gpu::synchronize");
-  cu::get_stream(s).synchronize();
+  cu::get_command_encoder(s).synchronize();
 }

 } // namespace mlx::core::gpu
--- a/mlx/backend/cuda/event.cu
+++ b/mlx/backend/cuda/event.cu
@@ -61,7 +61,9 @@ void CudaEvent::wait(Stream s) {
  if (s.device == mlx::core::Device::cpu) {
    scheduler::enqueue(s, [*this]() mutable { wait(); });
  } else {
-    wait(cu::get_stream(s).last_cuda_stream());
+    auto& enc = cu::get_command_encoder(s);
+    enc.commit();
+    wait(enc.stream());
  }
 }

@@ -74,7 +76,9 @@ void CudaEvent::record(Stream s) {
  if (s.device == mlx::core::Device::cpu) {
    throw std::runtime_error("CudaEvent can not wait on cpu stream.");
  } else {
-    record(cu::get_stream(s).last_cuda_stream());
+    auto& enc = cu::get_command_encoder(s);
+    enc.commit();
+    record(enc.stream());
  }
 }

@@ -136,11 +140,9 @@ void SharedEvent::wait(Stream s, uint64_t value) {
    scheduler::enqueue(s, [*this, value]() mutable { wait(value); });
  } else {
    auto& encoder = get_command_encoder(s);
-    encoder.launch_kernel(
-        encoder.stream().last_cuda_stream(),
-        [this, value](cudaStream_t stream) { wait(stream, value); });
+    encoder.commit();
+    wait(encoder.stream(), value);
    encoder.add_completed_handler([ac = ac_]() {});
-    encoder.end_encoding();
  }
 }

@@ -162,11 +164,9 @@ void SharedEvent::signal(Stream s, uint64_t value) {
    scheduler::enqueue(s, [*this, value]() mutable { signal(stream, value); });
  } else {
    auto& encoder = get_command_encoder(s);
-    encoder.launch_kernel(
-        encoder.stream().last_cuda_stream(),
-        [this, value](cudaStream_t stream) { signal(stream, value); });
+    encoder.commit();
+    signal(encoder.stream(), value);
    encoder.add_completed_handler([ac = ac_]() {});
-    encoder.end_encoding();
  }
 }

--- a/mlx/backend/cuda/indexing.cpp
+++ b/mlx/backend/cuda/indexing.cpp
@@ -3,13 +3,16 @@
 #include "mlx/backend/common/compiled.h"
 #include "mlx/backend/cuda/device.h"
 #include "mlx/backend/cuda/jit_module.h"
+#include "mlx/backend/cuda/kernel_utils.cuh"
 #include "mlx/backend/gpu/copy.h"
 #include "mlx/dtype_utils.h"
 #include "mlx/primitives.h"

 #include "cuda_jit_sources.h"

+#include <cuda.h>
 #include <fmt/format.h>
+#include <nvrtc.h>
 #include <nvtx3/nvtx3.hpp>

 #include <cassert>
@@ -22,7 +25,7 @@ namespace {
 constexpr const char* g_scatter_ops[] = {"Max", "Min", "Sum", "Prod", "Assign"};

 void append_indices_arg(
-    cu::JitModule& mod,
+    cu::KernelArgs& args,
    const std::vector<array>& inputs,
    int nidx,
    int idx_ndim) {
@@ -30,7 +33,7 @@ void append_indices_arg(
  for (int i = 0; i < nidx; ++i) {
    indices[i] = inputs[i + 1].data<void>();
  }
-  mod.append_arg(std::move(indices));
+  args.append(std::move(indices));
  std::vector<int32_t> indices_shape(nidx * idx_ndim);
  for (int i = 0; i < nidx; ++i) {
    std::copy_n(
@@ -38,7 +41,7 @@ void append_indices_arg(
        idx_ndim,
        indices_shape.data() + i * idx_ndim);
  }
-  mod.append_arg(std::move(indices_shape));
+  args.append(std::move(indices_shape));
  std::vector<int64_t> indices_strides(nidx * idx_ndim);
  for (int i = 0; i < nidx; ++i) {
    std::copy_n(
@@ -46,7 +49,7 @@ void append_indices_arg(
        idx_ndim,
        indices_strides.data() + i * idx_ndim);
  }
-  mod.append_arg(std::move(indices_strides));
+  args.append(std::move(indices_strides));
 }

 } // namespace
@@ -94,20 +97,21 @@ void Gather::eval_gpu(const std::vector<array>& inputs, array& out) {
    return std::make_pair(jit_source_gather, std::move(kernel_names));
  });

-  mod.append_arg(src);
-  mod.append_arg(out);
+  cu::KernelArgs args;
+  args.append(src);
+  args.append(out);
  if (large) {
-    mod.append_arg<int64_t>(out.size());
+    args.append<int64_t>(out.size());
  } else {
-    mod.append_arg<int32_t>(out.size());
+    args.append<int32_t>(out.size());
  }
-  mod.append_ndim_arg(src.shape());
-  mod.append_ndim_arg(src.strides());
-  mod.append_arg<int32_t>(src.ndim());
-  mod.append_ndim_arg(slice_sizes_);
-  mod.append_arg(slice_size);
-  mod.append_arg(axes_);
-  append_indices_arg(mod, inputs, nidx, idx_ndim);
+  args.append_ndim(src.shape());
+  args.append_ndim(src.strides());
+  args.append<int32_t>(src.ndim());
+  args.append_ndim(slice_sizes_);
+  args.append(slice_size);
+  args.append(axes_);
+  append_indices_arg(args, inputs, nidx, idx_ndim);

  std::string kernel_name = fmt::format(
      "mlx::core::cu::gather<{}, {}, {}, {}, {}>",
@@ -122,9 +126,10 @@ void Gather::eval_gpu(const std::vector<array>& inputs, array& out) {
    encoder.set_input_array(in);
  }
  encoder.set_output_array(out);
-  encoder.launch_kernel([&](cudaStream_t stream) {
-    mod.launch_kernel(stream, kernel_name, out, large);
-  });
+
+  auto kernel = mod.get_kernel(kernel_name);
+  auto [num_blocks, block_dims] = get_launch_args(kernel, out, large);
+  encoder.add_kernel_node(kernel, num_blocks, block_dims, args.args());
 }

 void Scatter::eval_gpu(const std::vector<array>& inputs, array& out) {
@@ -187,26 +192,27 @@ void Scatter::eval_gpu(const std::vector<array>& inputs, array& out) {
    return std::make_pair(jit_source_scatter, std::move(kernel_names));
  });

-  mod.append_arg(upd);
-  mod.append_arg(out);
+  cu::KernelArgs args;
+  args.append(upd);
+  args.append(out);
  if (large) {
-    mod.append_arg<int64_t>(upd.size());
+    args.append<int64_t>(upd.size());
  } else {
-    mod.append_arg<int32_t>(upd.size());
+    args.append<int32_t>(upd.size());
  }
-  mod.append_ndim_arg(upd.shape());
-  mod.append_ndim_arg(upd.strides());
-  mod.append_arg<int32_t>(upd.ndim());
+  args.append_ndim(upd.shape());
+  args.append_ndim(upd.strides());
+  args.append<int32_t>(upd.ndim());
  if (large) {
-    mod.append_arg<int64_t>(upd_post_idx_size);
+    args.append<int64_t>(upd_post_idx_size);
  } else {
-    mod.append_arg<int32_t>(upd_post_idx_size);
+    args.append<int32_t>(upd_post_idx_size);
  }
-  mod.append_ndim_arg(out.shape());
-  mod.append_ndim_arg(out.strides());
-  mod.append_arg<int32_t>(out.ndim());
-  mod.append_arg(axes_);
-  append_indices_arg(mod, inputs, nidx, idx_ndim);
+  args.append_ndim(out.shape());
+  args.append_ndim(out.strides());
+  args.append<int32_t>(out.ndim());
+  args.append(axes_);
+  append_indices_arg(args, inputs, nidx, idx_ndim);

  std::string kernel_name = fmt::format(
      "mlx::core::cu::scatter<{}, {}, mlx::core::cu::Scatter{}, {}, {}, {}>",
@@ -222,9 +228,9 @@ void Scatter::eval_gpu(const std::vector<array>& inputs, array& out) {
    encoder.set_input_array(in);
  }
  encoder.set_output_array(out);
-  encoder.launch_kernel([&](cudaStream_t stream) {
-    mod.launch_kernel(stream, kernel_name, upd, large);
-  });
+  auto kernel = mod.get_kernel(kernel_name);
+  auto [num_blocks, block_dims] = get_launch_args(kernel, upd, large);
+  encoder.add_kernel_node(kernel, num_blocks, block_dims, args.args());
 }

 void GatherAxis::eval_gpu(const std::vector<array>& inputs, array& out) {
@@ -275,25 +281,26 @@ void GatherAxis::eval_gpu(const std::vector<array>& inputs, array& out) {
  }
  size_t idx_size_axis = idx.shape(axis_);

-  mod.append_arg(src);
-  mod.append_arg(idx);
-  mod.append_arg(out);
+  cu::KernelArgs args;
+  args.append(src);
+  args.append(idx);
+  args.append(out);
  if (large) {
-    mod.append_arg<int64_t>(idx_size_pre);
-    mod.append_arg<int64_t>(idx_size_axis);
-    mod.append_arg<int64_t>(idx_size_post);
+    args.append<int64_t>(idx_size_pre);
+    args.append<int64_t>(idx_size_axis);
+    args.append<int64_t>(idx_size_post);
  } else {
-    mod.append_arg<int32_t>(idx_size_pre);
-    mod.append_arg<int32_t>(idx_size_axis);
-    mod.append_arg<int32_t>(idx_size_post);
+    args.append<int32_t>(idx_size_pre);
+    args.append<int32_t>(idx_size_axis);
+    args.append<int32_t>(idx_size_post);
  }
-  mod.append_arg(remove_index(idx.shape(), axis_));
-  mod.append_arg(remove_index(src.strides(), axis_));
-  mod.append_arg(remove_index(idx.strides(), axis_));
-  mod.append_arg<int32_t>(axis_);
-  mod.append_arg(src.shape(axis_));
-  mod.append_arg(src.strides(axis_));
-  mod.append_arg(idx.strides(axis_));
+  args.append(remove_index(idx.shape(), axis_));
+  args.append(remove_index(src.strides(), axis_));
+  args.append(remove_index(idx.strides(), axis_));
+  args.append<int32_t>(axis_);
+  args.append(src.shape(axis_));
+  args.append(src.strides(axis_));
+  args.append(idx.strides(axis_));

  std::string kernel_name = fmt::format(
      "mlx::core::cu::gather_axis<{}, {}, {}, {}, {}, {}>",
@@ -309,9 +316,9 @@ void GatherAxis::eval_gpu(const std::vector<array>& inputs, array& out) {
    encoder.set_input_array(in);
  }
  encoder.set_output_array(out);
-  encoder.launch_kernel([&](cudaStream_t stream) {
-    mod.launch_kernel(stream, kernel_name, idx, large);
-  });
+  auto kernel = mod.get_kernel(kernel_name);
+  auto [num_blocks, block_dims] = get_launch_args(kernel, idx, large);
+  encoder.add_kernel_node(kernel, num_blocks, block_dims, args.args());
 }

 void ScatterAxis::eval_gpu(const std::vector<array>& inputs, array& out) {
@@ -377,25 +384,26 @@ void ScatterAxis::eval_gpu(const std::vector<array>& inputs, array& out) {
  }
  size_t idx_size_axis = idx.shape(axis_);

-  mod.append_arg(upd);
-  mod.append_arg(idx);
-  mod.append_arg(out);
+  cu::KernelArgs args;
+  args.append(upd);
+  args.append(idx);
+  args.append(out);
  if (large) {
-    mod.append_arg<int64_t>(idx_size_pre);
-    mod.append_arg<int64_t>(idx_size_axis);
-    mod.append_arg<int64_t>(idx_size_post);
+    args.append<int64_t>(idx_size_pre);
+    args.append<int64_t>(idx_size_axis);
+    args.append<int64_t>(idx_size_post);
  } else {
-    mod.append_arg<int32_t>(idx_size_pre);
-    mod.append_arg<int32_t>(idx_size_axis);
-    mod.append_arg<int32_t>(idx_size_post);
+    args.append<int32_t>(idx_size_pre);
+    args.append<int32_t>(idx_size_axis);
+    args.append<int32_t>(idx_size_post);
  }
-  mod.append_arg(remove_index(idx.shape(), axis_));
-  mod.append_arg(remove_index(upd.strides(), axis_));
-  mod.append_arg(remove_index(idx.strides(), axis_));
-  mod.append_arg<int32_t>(axis_);
-  mod.append_arg(out.shape(axis_));
-  mod.append_arg(upd.strides(axis_));
-  mod.append_arg(idx.strides(axis_));
+  args.append(remove_index(idx.shape(), axis_));
+  args.append(remove_index(upd.strides(), axis_));
+  args.append(remove_index(idx.strides(), axis_));
+  args.append<int32_t>(axis_);
+  args.append(out.shape(axis_));
+  args.append(upd.strides(axis_));
+  args.append(idx.strides(axis_));

  std::string kernel_name = fmt::format(
      "mlx::core::cu::scatter_axis<{}, {}, mlx::core::cu::Scatter{}, {}, {}, {}, {}>",
@@ -412,9 +420,9 @@ void ScatterAxis::eval_gpu(const std::vector<array>& inputs, array& out) {
    encoder.set_input_array(in);
  }
  encoder.set_output_array(out);
-  encoder.launch_kernel([&](cudaStream_t stream) {
-    mod.launch_kernel(stream, kernel_name, idx, large);
-  });
+  auto kernel = mod.get_kernel(kernel_name);
+  auto [num_blocks, block_dims] = get_launch_args(kernel, idx, large);
+  encoder.add_kernel_node(kernel, num_blocks, block_dims, args.args());
 }

 } // namespace mlx::core
--- a/mlx/backend/cuda/jit_module.cpp
+++ b/mlx/backend/cuda/jit_module.cpp
@@ -26,47 +26,47 @@ void check_nvrtc_error(const char* name, nvrtcResult err) {
  }
 }

-#define CHECK_CU_ERROR(cmd) check_cu_error(#cmd, (cmd))
-
-void check_cu_error(const char* name, CUresult err) {
-  if (err != CUDA_SUCCESS) {
-    const char* err_str = "Unknown error";
-    cuGetErrorString(err, &err_str);
-    throw std::runtime_error(fmt::format("{} failed: {}", name, err_str));
-  }
-}
-
 // Return the location of the CUDA toolkit.
-const char* cuda_home() {
-  const char* home = std::getenv("CUDA_HOME");
-  if (home) {
-    return home;
-  }
-  home = std::getenv("CUDA_PATH");
-  if (home) {
-    return home;
-  }
+const std::string& cuda_home() {
+  // Resolved on the first call and cached in a function-local static. Lookup
+  // order: CUDA_HOME, then CUDA_PATH, then /usr/local/cuda on Linux. Throws
+  // std::runtime_error when none of them yields a location.
+  static std::string home = []() -> std::string {
+    const char* home = std::getenv("CUDA_HOME");
+    if (home) {
+      return home;
+    }
+    home = std::getenv("CUDA_PATH");
+    if (home) {
+      return home;
+    }
 #if defined(__linux__)
-  home = "/usr/local/cuda";
-  if (std::filesystem::exists(home)) {
-    return home;
-  }
+    // Conventional install location, used only if it actually exists
+    home = "/usr/local/cuda";
+    if (std::filesystem::exists(home)) {
+      return home;
+    }
 #endif
-  throw std::runtime_error(
-      "Environment variable CUDA_HOME or CUDA_PATH is not set.");
+    throw std::runtime_error(
+        "Environment variable CUDA_HOME or CUDA_PATH is not set.");
+  }();
+  return home;
 }

 // Get the cache directory for storing compiled results.
-bool get_ptx_cache_dir(std::filesystem::path* result) {
-  auto path = std::filesystem::temp_directory_path() / "mlx" / "ptx";
-  if (!std::filesystem::is_directory(path)) {
-    std::error_code error;
-    if (!std::filesystem::create_directories(path, error)) {
-      return false;
+const std::filesystem::path& ptx_cache_dir() {
+  // Resolved on the first call and cached in a function-local static. The
+  // directory is taken from MLX_PTX_CACHE when set, otherwise it is
+  // <temp dir>/mlx/ptx. An empty path is returned when no usable directory
+  // is available; callers treat that as "caching disabled".
+  static std::filesystem::path cache = []() -> std::filesystem::path {
+    std::filesystem::path cache;
+    if (auto c = std::getenv("MLX_PTX_CACHE"); c) {
+      cache = c;
+    } else {
+      cache = std::filesystem::temp_directory_path() / "mlx" / "ptx";
    }
-  }
-  *result = path;
-  return true;
+    // create_directories() returns false when the directory already exists
+    // (for instance when another process created it first), so success is
+    // judged by the error code, not the return value. The final check also
+    // rejects a path that exists but is not a directory.
+    std::error_code error;
+    std::filesystem::create_directories(cache, error);
+    if (error || !std::filesystem::is_directory(cache, error)) {
+      return std::filesystem::path();
+    }
+    return cache;
+  }();
+  return cache;
 }

 // Try to read the cached |ptx| and |ptx_kernels| from |cache_dir|.
@@ -75,6 +75,10 @@ bool read_cached_ptx(
    const std::string& module_name,
    std::vector<char>* ptx,
    std::vector<std::pair<std::string, std::string>>* ptx_kernels) {
+  if (cache_dir.empty()) {
+    return false;
+  }
+
  auto ptx_path = cache_dir / (module_name + ".ptx");
  std::error_code error;
  auto ptx_size = std::filesystem::file_size(ptx_path, error);
@@ -105,6 +109,10 @@ void write_cached_ptx(
    const std::string& module_name,
    const std::vector<char>& ptx,
    const std::vector<std::pair<std::string, std::string>>& ptx_kernels) {
+  if (cache_dir.empty()) {
+    return;
+  }
+
  std::ofstream ptx_file(cache_dir / (module_name + ".ptx"), std::ios::binary);
  if (!ptx.empty()) {
    ptx_file.write(&ptx.front(), ptx.size());
@@ -184,11 +192,9 @@ JitModule::JitModule(
    const std::string& module_name,
    const KernelBuilder& builder) {
  // Check cache.
-  std::filesystem::path cache_dir;
  std::vector<char> ptx;
  std::vector<std::pair<std::string, std::string>> ptx_kernels;
-  if (!get_ptx_cache_dir(&cache_dir) ||
-      !read_cached_ptx(cache_dir, module_name, &ptx, &ptx_kernels)) {
+  if (!read_cached_ptx(ptx_cache_dir(), module_name, &ptx, &ptx_kernels)) {
    // Create program.
    auto [source_code, kernel_names] = builder();
    nvrtcProgram prog;
@@ -246,7 +252,7 @@ JitModule::JitModule(
    } else {
      CHECK_NVRTC_ERROR(nvrtcGetPTX(prog, ptx.data()));
    }
-    write_cached_ptx(cache_dir, module_name, ptx, ptx_kernels);
+    write_cached_ptx(ptx_cache_dir(), module_name, ptx, ptx_kernels);
  }

  // Load module.
@@ -264,60 +270,13 @@ JitModule::JitModule(
  // Load kernels.
  for (const auto& [name, mangled] : ptx_kernels) {
    CUfunction kernel;
-    CHECK_CU_ERROR(cuModuleGetFunction(&kernel, module_, mangled.c_str()));
+    CHECK_CUDA_ERROR(cuModuleGetFunction(&kernel, module_, mangled.c_str()));
    kernels_[name] = kernel;
  }
 }

 JitModule::~JitModule() {
-  CHECK_CU_ERROR(cuModuleUnload(module_));
-}
-
-void JitModule::launch_kernel(
-    CUstream stream,
-    const std::string& kernel_name,
-    const array& arr,
-    bool large,
-    int work_per_thread) {
-  CUfunction kernel = get_kernel(kernel_name);
-  size_t nthreads = cuda::ceil_div(arr.size(), work_per_thread);
-  int _, block_dim;
-  CHECK_CU_ERROR(
-      cuOccupancyMaxPotentialBlockSize(&_, &block_dim, kernel, 0, 0, 0));
-  if (block_dim > nthreads) {
-    block_dim = nthreads;
-  }
-  Dims num_blocks{1, 1, 1};
-  if (large) {
-    num_blocks =
-        get_2d_grid_dims_common(arr.shape(), arr.strides(), work_per_thread);
-    std::get<0>(num_blocks) =
-        (std::get<0>(num_blocks) + block_dim - 1) / block_dim;
-  } else {
-    std::get<0>(num_blocks) = (nthreads + block_dim - 1) / block_dim;
-  }
-  launch_kernel(stream, kernel, num_blocks, Dims{block_dim, 1, 1});
-}
-
-void JitModule::launch_kernel(
-    CUstream stream,
-    CUfunction kernel,
-    Dims num_blocks,
-    Dims block_dims) {
-  CHECK_CU_ERROR(cuLaunchKernel(
-      kernel,
-      std::get<0>(num_blocks),
-      std::get<1>(num_blocks),
-      std::get<2>(num_blocks),
-      std::get<0>(block_dims),
-      std::get<1>(block_dims),
-      std::get<2>(block_dims),
-      0,
-      stream,
-      args_.data(),
-      nullptr));
-  args_.clear();
-  storage_.clear();
+  CHECK_CUDA_ERROR(cuModuleUnload(module_));
 }

 CUfunction JitModule::get_kernel(const std::string& kernel_name) {
@@ -329,10 +288,6 @@ CUfunction JitModule::get_kernel(const std::string& kernel_name) {
  return it->second;
 }

-void JitModule::append_ptr_arg(const void* v) {
-  args_.push_back(const_cast<void*>(v));
-}
-
 JitModule& get_jit_module(
    const mlx::core::Device& device,
    const std::string& name,
--- a/mlx/backend/cuda/jit_module.h
+++ b/mlx/backend/cuda/jit_module.h
@@ -4,6 +4,7 @@

 #include "mlx/array.h"
 #include "mlx/backend/common/utils.h"
+#include "mlx/backend/cuda/device.h"
 #include "mlx/backend/cuda/device/config.h"

 #include <deque>
@@ -23,72 +24,48 @@ using KernelBuilderResult = std::pair<
    /* kernel names */ std::vector<std::string>>;
 using KernelBuilder = std::function<KernelBuilderResult()>;

-class JitModule {
- public:
-  JitModule(
-      Device& device,
-      const std::string& module_name,
-      const KernelBuilder& builder);
-  ~JitModule();
+struct KernelArgs {
+  void** args() {
+    return args_.data();
+  }

-  JitModule(const JitModule&) = delete;
-  JitModule& operator=(const JitModule&) = delete;
-
-  void append_arg(const array& a) {
-    append_arg(reinterpret_cast<CUdeviceptr>(a.data<void>()));
+  void append(const array& a) {
+    append(reinterpret_cast<CUdeviceptr>(a.data<void>()));
  }

  template <typename T>
-  void append_arg(T val) {
+  void append(T val) {
    storage_.emplace_back(val);
-    append_ptr_arg(&storage_.back());
+    append_ptr(&storage_.back());
  }

  template <typename T>
-  void append_arg(std::vector<T> vec) {
+  void append(std::vector<T> vec) {
    if (vec.empty()) {
      // The nullptr can not be used as arg, pass something not null.
-      append_arg(std::monostate{});
+      append(std::monostate{});
    } else {
-      append_ptr_arg(vec.data());
+      append_ptr(vec.data());
      storage_.emplace_back(std::move(vec));
    }
  }

  // Make sure the arg is copied to an array with size of NDIM.
  template <size_t NDIM = MAX_NDIM, typename T>
-  void append_ndim_arg(const std::vector<T>& vec) {
+  void append_ndim(std::vector<T> vec) {
    if (vec.size() > NDIM) {
      throw std::runtime_error(
          fmt::format("ndim can not be larger than {}.", NDIM));
    }
-    std::vector<T> copied(NDIM);
-    std::copy(vec.begin(), vec.end(), copied.data());
-    append_arg(std::move(copied));
+    vec.resize(NDIM);
+    append(std::move(vec));
  }

-  // Launch kernel with |kernel_name| that each thread works on
-  // |work_per_thread| elements of |arr|.
-  void launch_kernel(
-      CUstream stream,
-      const std::string& kernel_name,
-      const array& arr,
-      bool large,
-      int work_per_thread = 1);
-
-  void launch_kernel(
-      CUstream stream,
-      CUfunction kernel,
-      Dims num_blocks,
-      Dims block_dims);
-
-  CUfunction get_kernel(const std::string& kernel_name);
+  void append_ptr(const void* v) {
+    args_.push_back(const_cast<void*>(v));
+  }

 private:
-  void append_ptr_arg(const void* v);
-
-  CUmodule module_{nullptr};
-  std::unordered_map<std::string, CUfunction> kernels_;
  std::vector<void*> args_;

  // The cuLaunchKernel API requires passing pointers to arguments so store
@@ -105,6 +82,23 @@ class JitModule {
  std::deque<Arg> storage_;
 };

+class JitModule {
+ public:
+  JitModule(
+      Device& device,
+      const std::string& module_name,
+      const KernelBuilder& builder);
+  ~JitModule();
+
+  JitModule(const JitModule&) = delete;
+  JitModule& operator=(const JitModule&) = delete;
+  CUfunction get_kernel(const std::string& kernel_name);
+
+ private:
+  CUmodule module_{nullptr};
+  std::unordered_map<std::string, CUfunction> kernels_;
+};
+
 JitModule& get_jit_module(
    const mlx::core::Device& device,
    const std::string& name,
--- a/mlx/backend/cuda/kernel_utils.cuh
+++ b/mlx/backend/cuda/kernel_utils.cuh
@@ -6,10 +6,13 @@

 #pragma once

+#include <type_traits>
+
 #include "mlx/array.h"
 #include "mlx/backend/cuda/device/utils.cuh"

 #include <cuComplex.h>
+#include <cuda.h>
 #include <cuda_bf16.h>
 #include <cuda_fp16.h>
 #include <fmt/format.h>
@@ -17,60 +20,46 @@

 namespace mlx::core {

-// Convert a number between 1~3 to constexpr.
-#define MLX_SWITCH_1_2_3(N, NDIM, ...) \
-  switch (N) {                         \
-    case 1: {                          \
-      constexpr int NDIM = 1;          \
-      __VA_ARGS__;                     \
-      break;                           \
-    }                                  \
-    case 2: {                          \
-      constexpr int NDIM = 2;          \
-      __VA_ARGS__;                     \
-      break;                           \
-    }                                  \
-    case 3: {                          \
-      constexpr int NDIM = 3;          \
-      __VA_ARGS__;                     \
-      break;                           \
-    }                                  \
+template <typename F>
+void dispatch_1_2_3(int n, F&& f) {
+  switch (n) {
+    case 1:
+      f(std::integral_constant<int, 1>{});
+      break;
+    case 2:
+      f(std::integral_constant<int, 2>{});
+      break;
+    case 3:
+      f(std::integral_constant<int, 3>{});
+      break;
  }
+}

-// Like MLX_SWITCH_ALL_TYPES but for booleans.
-#define MLX_SWITCH_BOOL(BOOL, BOOL_ALIAS, ...) \
-  if (BOOL) {                                  \
-    constexpr bool BOOL_ALIAS = true;          \
-    __VA_ARGS__;                               \
-  } else {                                     \
-    constexpr bool BOOL_ALIAS = false;         \
-    __VA_ARGS__;                               \
+template <typename F>
+void dispatch_bool(bool v, F&& f) {
+  if (v) {
+    f(std::true_type{});
+  } else {
+    f(std::false_type{});
  }
+}

-// Convert a block_dim to constexpr between WARP_SIZE and WARP_SIZE ^ 2.
-#define MLX_SWITCH_BLOCK_DIM(NUM_THREADS, BLOCK_DIM, ...)   \
-  {                                                         \
-    uint32_t _num_threads = NUM_THREADS;                    \
-    if (_num_threads <= WARP_SIZE) {                        \
-      constexpr uint32_t BLOCK_DIM = WARP_SIZE;             \
-      __VA_ARGS__;                                          \
-    } else if (_num_threads <= WARP_SIZE * 2) {             \
-      constexpr uint32_t BLOCK_DIM = WARP_SIZE * 2;         \
-      __VA_ARGS__;                                          \
-    } else if (_num_threads <= WARP_SIZE * 4) {             \
-      constexpr uint32_t BLOCK_DIM = WARP_SIZE * 4;         \
-      __VA_ARGS__;                                          \
-    } else if (_num_threads <= WARP_SIZE * 8) {             \
-      constexpr uint32_t BLOCK_DIM = WARP_SIZE * 8;         \
-      __VA_ARGS__;                                          \
-    } else if (_num_threads <= WARP_SIZE * 16) {            \
-      constexpr uint32_t BLOCK_DIM = WARP_SIZE * 16;        \
-      __VA_ARGS__;                                          \
-    } else {                                                \
-      constexpr uint32_t BLOCK_DIM = WARP_SIZE * WARP_SIZE; \
-      __VA_ARGS__;                                          \
-    }                                                       \
+template <typename F>
+void dispatch_block_dim(int threads, F&& f) {
+  if (threads <= WARP_SIZE) {
+    f(std::integral_constant<int, WARP_SIZE>{});
+  } else if (threads <= WARP_SIZE * 2) {
+    f(std::integral_constant<int, WARP_SIZE * 2>{});
+  } else if (threads <= WARP_SIZE * 4) {
+    f(std::integral_constant<int, WARP_SIZE * 4>{});
+  } else if (threads <= WARP_SIZE * 8) {
+    f(std::integral_constant<int, WARP_SIZE * 8>{});
+  } else if (threads <= WARP_SIZE * 16) {
+    f(std::integral_constant<int, WARP_SIZE * 16>{});
+  } else {
+    f(std::integral_constant<int, WARP_SIZE * 32>{});
  }
+}

 // Maps CPU types to CUDA types.
 template <typename T>
@@ -132,7 +121,13 @@ std::pair<dim3, dim3> get_grid_and_block(int dim0, int dim1, int dim2);
 template <typename T>
 inline uint max_occupancy_block_dim(T kernel) {
  int _, block_dim;
-  CHECK_CUDA_ERROR(cudaOccupancyMaxPotentialBlockSize(&_, &block_dim, kernel));
+  if constexpr (std::is_same_v<T, CUfunction>) {
+    CHECK_CUDA_ERROR(
+        cuOccupancyMaxPotentialBlockSize(&_, &block_dim, kernel, 0, 0, 0));
+  } else {
+    CHECK_CUDA_ERROR(
+        cudaOccupancyMaxPotentialBlockSize(&_, &block_dim, kernel));
+  }
  return block_dim;
 }

--- a/mlx/backend/cuda/layer_norm.cu
+++ b/mlx/backend/cuda/layer_norm.cu
@@ -258,22 +258,23 @@ void LayerNorm::eval_gpu(
  encoder.set_input_array(w);
  encoder.set_input_array(b);
  encoder.set_output_array(out);
-  encoder.launch_kernel([&](cudaStream_t stream) {
-    MLX_SWITCH_FLOAT_TYPES_CHECKED(out.dtype(), "layernorm", CTYPE, {
-      using DataType = cuda_type_t<CTYPE>;
-      constexpr uint32_t N_READS = 4;
-      MLX_SWITCH_BLOCK_DIM(cuda::ceil_div(axis_size, N_READS), BLOCK_DIM, {
-        auto kernel = cu::layer_norm<DataType, BLOCK_DIM, N_READS>;
-        kernel<<<n_rows, BLOCK_DIM, 0, stream>>>(
-            x.data<DataType>(),
-            w.data<DataType>(),
-            b.data<DataType>(),
-            out.data<DataType>(),
-            eps_,
-            axis_size,
-            w_stride,
-            b_stride);
-      });
+  dispatch_float_types(out.dtype(), "layernorm", [&](auto type_tag) {
+    constexpr uint32_t N_READS = 4;
+    dispatch_block_dim(cuda::ceil_div(axis_size, N_READS), [&](auto block_dim) {
+      using DataType = cuda_type_t<MLX_GET_TYPE(type_tag)>;
+      auto kernel = cu::layer_norm<DataType, block_dim(), N_READS>;
+      encoder.add_kernel_node(
+          kernel,
+          n_rows,
+          block_dim(),
+          x.data<DataType>(),
+          w.data<DataType>(),
+          b.data<DataType>(),
+          out.data<DataType>(),
+          eps_,
+          axis_size,
+          w_stride,
+          b_stride);
    });
  });
 }
@@ -288,21 +289,25 @@ void LayerNormVJP::eval_gpu(
  // Ensure row contiguity. We could relax this step by checking that the array
  // is contiguous (no broadcasts or holes) and that the input strides are the
  // same as the cotangent strides but for now this is simpler.
-  auto check_input = [&s](const array& x) -> std::pair<array, bool> {
+  auto check_input = [&s](const array& x, bool& copied) {
    if (x.flags().row_contiguous) {
-      return {x, false};
+      copied = false;
+      return x;
    }
+    copied = true;
    array x_copy(x.shape(), x.dtype(), nullptr, {});
    copy_gpu(x, x_copy, CopyType::General, s);
-    return {x_copy, true};
+    return x_copy;
  };
  bool donate_x = inputs[0].is_donatable();
  bool donate_g = inputs[3].is_donatable();
-  auto [x, copied] = check_input(inputs[0]);
+  bool copied;
+  auto x = check_input(inputs[0], copied);
  donate_x |= copied;
  const array& w = inputs[1];
  const array& b = inputs[2];
-  auto [g, g_copied] = check_input(inputs[3]);
+  bool g_copied;
+  auto g = check_input(inputs[3], g_copied);
  donate_g |= g_copied;
  array& gx = outputs[0];
  array& gw = outputs[1];
@@ -333,47 +338,58 @@ void LayerNormVJP::eval_gpu(
  // gradient accumulators.
  array gw_temp =
      (has_w) ? array({n_rows, x.shape().back()}, gw.dtype(), nullptr, {}) : w;
+  bool g_in_gw = false;
  if (has_w) {
    if (!g_in_gx && donate_g) {
+      g_in_gw = true;
      gw_temp.copy_shared_buffer(g);
    } else {
      gw_temp.set_data(allocator::malloc(gw_temp.nbytes()));
      encoder.add_temporary(gw_temp);
    }
  }
-  gw.set_data(allocator::malloc(gw.nbytes()));
-  gb.set_data(allocator::malloc(gb.nbytes()));

-  // Finish with the gradient for b in case we had a b.
-  if (gb.ndim() == 1 && gb.size() == axis_size) {
+  // The gradient for b in case we had a b.
+  bool has_gb = (gb.ndim() == 1 && gb.size() == axis_size);
+  if (has_gb) {
    ReductionPlan plan(
        ReductionOpType::ContiguousStridedReduce, {n_rows}, {axis_size});
    col_reduce(encoder, g, gb, Reduce::ReduceType::Sum, {0}, plan);
  }

+  // Insert dependency if `g` was donated
+  if ((g_in_gx || g_in_gw) && has_gb) {
+    encoder.set_input_array(gb);
+  }
  encoder.set_input_array(x);
  encoder.set_input_array(w);
  encoder.set_input_array(g);
  encoder.set_output_array(gx);
  encoder.set_output_array(gw_temp);
-  encoder.launch_kernel([&, x = x, g = g](cudaStream_t stream) {
-    MLX_SWITCH_FLOAT_TYPES_CHECKED(gx.dtype(), "layernorm_vjp", CTYPE, {
-      using DataType = cuda_type_t<CTYPE>;
+  dispatch_float_types(gx.dtype(), "layernorm_vjp", [&](auto type_tag) {
+    dispatch_bool(has_w, [&](auto has_w_constant) {
      constexpr int N_READS = 4;
-      MLX_SWITCH_BOOL(has_w, HAS_W, {
-        MLX_SWITCH_BLOCK_DIM(cuda::ceil_div(axis_size, N_READS), BLOCK_DIM, {
-          auto kernel = cu::layer_norm_vjp<DataType, HAS_W, BLOCK_DIM, N_READS>;
-          kernel<<<n_rows, BLOCK_DIM, 0, stream>>>(
-              x.data<DataType>(),
-              w.data<DataType>(),
-              g.data<DataType>(),
-              gx.data<DataType>(),
-              gw_temp.data<DataType>(),
-              eps_,
-              axis_size,
-              w_stride);
-        });
-      });
+      dispatch_block_dim(
+          cuda::ceil_div(axis_size, N_READS), [&](auto block_dim) {
+            using DataType = cuda_type_t<MLX_GET_TYPE(type_tag)>;
+            auto kernel = cu::layer_norm_vjp<
+                DataType,
+                has_w_constant.value,
+                block_dim(),
+                N_READS>;
+            encoder.add_kernel_node(
+                kernel,
+                n_rows,
+                block_dim(),
+                x.data<DataType>(),
+                w.data<DataType>(),
+                g.data<DataType>(),
+                gx.data<DataType>(),
+                gw_temp.data<DataType>(),
+                eps_,
+                axis_size,
+                w_stride);
+          });
    });
  });

--- a/mlx/backend/cuda/logsumexp.cu
+++ b/mlx/backend/cuda/logsumexp.cu
@@ -143,15 +143,18 @@ void LogSumExp::eval_gpu(const std::vector<array>& inputs, array& out) {

  encoder.set_input_array(in);
  encoder.set_output_array(out);
-  encoder.launch_kernel([&](cudaStream_t stream) {
-    MLX_SWITCH_FLOAT_TYPES_CHECKED(out.dtype(), "logsumexp", CTYPE, {
-      using DataType = cuda_type_t<CTYPE>;
-      constexpr int N_READS = 4;
-      MLX_SWITCH_BLOCK_DIM(cuda::ceil_div(axis_size, N_READS), BLOCK_DIM, {
-        auto kernel = cu::logsumexp<DataType, float, BLOCK_DIM, N_READS>;
-        kernel<<<n_rows, BLOCK_DIM, 0, stream>>>(
-            in.data<DataType>(), out.data<DataType>(), axis_size);
-      });
+  dispatch_float_types(out.dtype(), "logsumexp", [&](auto type_tag) {
+    constexpr int N_READS = 4;
+    dispatch_block_dim(cuda::ceil_div(axis_size, N_READS), [&](auto block_dim) {
+      using DataType = cuda_type_t<MLX_GET_TYPE(type_tag)>;
+      auto kernel = cu::logsumexp<DataType, float, block_dim(), N_READS>;
+      encoder.add_kernel_node(
+          kernel,
+          n_rows,
+          block_dim(),
+          in.data<DataType>(),
+          out.data<DataType>(),
+          axis_size);
    });
  });
 }
--- a/mlx/backend/cuda/matmul.cpp
+++ b/mlx/backend/cuda/matmul.cpp
@@ -42,7 +42,8 @@ class MatMul {
      int64_t ldb,
      int32_t batch_count,
      int64_t a_batch_stride,
-      int64_t b_batch_stride) {
+      int64_t b_batch_stride)
+      : handle_(device.lt_handle()) {
    heuristic_.state = CUBLAS_STATUS_NOT_INITIALIZED;

    auto scale_type = dtype_to_cuda_type(dtype);
@@ -147,7 +148,7 @@ class MatMul {
    if (heuristic_.state != CUBLAS_STATUS_SUCCESS) {
      int ret = 0;
      CHECK_CUBLAS_ERROR(cublasLtMatmulAlgoGetHeuristic(
-          encoder.device().lt_handle(),
+          handle_,
          matmul_desc_,
          a_desc_,
          b_desc_,
@@ -162,31 +163,34 @@ class MatMul {
      }
    }

-    array workspace(
-        allocator::malloc(heuristic_.workspaceSize),
-        {static_cast<int>(heuristic_.workspaceSize)},
-        int8);
-    encoder.add_temporary(workspace);
+    void* workspace_ptr = nullptr;
+    if (heuristic_.workspaceSize > 0) {
+      array workspace(
+          allocator::malloc(heuristic_.workspaceSize),
+          {static_cast<int>(heuristic_.workspaceSize)},
+          int8);
+      encoder.add_temporary(workspace);
+      workspace_ptr = workspace.data<void>();
+    }

-    encoder.launch_kernel([&](cudaStream_t stream) {
-      CHECK_CUBLAS_ERROR(cublasLtMatmul(
-          encoder.device().lt_handle(),
-          matmul_desc_,
-          &alpha,
-          a,
-          a_desc_,
-          b,
-          b_desc_,
-          &beta,
-          c ? c : out,
-          c ? c_desc_ : out_desc_,
-          out,
-          out_desc_,
-          &heuristic_.algo,
-          workspace.data<void>(),
-          workspace.nbytes(),
-          stream));
-    });
+    auto capture = encoder.capture_context();
+    CHECK_CUBLAS_ERROR(cublasLtMatmul(
+        handle_,
+        matmul_desc_,
+        &alpha,
+        a,
+        a_desc_,
+        b,
+        b_desc_,
+        &beta,
+        c ? c : out,
+        c ? c_desc_ : out_desc_,
+        out,
+        out_desc_,
+        &heuristic_.algo,
+        workspace_ptr,
+        heuristic_.workspaceSize,
+        encoder.stream()));
  }

 private:
@@ -255,6 +259,7 @@ class MatMul {
    return desc;
  }

+  cublasLtHandle_t handle_{nullptr};
  cublasLtMatmulDesc_t matmul_desc_{nullptr};
  cublasLtMatmulPreference_t pref_{nullptr};
  cublasLtMatrixLayout_t a_desc_{nullptr};
@@ -269,7 +274,7 @@ class MatMul {
 namespace {

 std::tuple<bool, int64_t, array>
-check_transpose(std::vector<array>& copies, const Stream& s, const array& arr) {
+check_transpose(cu::CommandEncoder& enc, const Stream& s, const array& arr) {
  auto stx = arr.strides()[arr.ndim() - 2];
  auto sty = arr.strides()[arr.ndim() - 1];
  if (sty == 1 && stx == arr.shape(-1)) {
@@ -279,7 +284,7 @@ check_transpose(std::vector<array>& copies, const Stream& s, const array& arr) {
  } else {
    array arr_copy(arr.shape(), arr.dtype(), nullptr, {});
    copy_gpu(arr, arr_copy, CopyType::General, s);
-    copies.push_back(arr_copy);
+    enc.add_temporary(arr_copy);
    return std::make_tuple(false, arr.shape(-1), arr_copy);
  }
 }
@@ -313,13 +318,8 @@ void Matmul::eval_gpu(const std::vector<array>& inputs, array& out) {

  // Keep a vector with copies to be cleared in the completed buffer to release
  // the arrays
-  std::vector<array> copies;
-  auto [a_transposed, lda, a] = check_transpose(copies, s, a_pre);
-  auto [b_transposed, ldb, b] = check_transpose(copies, s, b_pre);
-
-  for (auto& temp : copies) {
-    encoder.add_temporary(temp);
-  }
+  auto [a_transposed, lda, a] = check_transpose(encoder, s, a_pre);
+  auto [b_transposed, ldb, b] = check_transpose(encoder, s, b_pre);

  /////////////////////////////////////////////////////////////////////////////
  // Check and collapse batch dimensions
@@ -344,7 +344,7 @@ void Matmul::eval_gpu(const std::vector<array>& inputs, array& out) {
  // Invoke cublasLt

  cu::MatMul matmul(
-      encoder.device(),
+      cu::device(s.device),
      a.dtype(),
      a_transposed,
      M,
@@ -358,9 +358,19 @@ void Matmul::eval_gpu(const std::vector<array>& inputs, array& out) {
      a_batch_strides.back(),
      b_batch_strides.back());

+  encoder.set_input_array(a);
+  encoder.set_input_array(b);
+  encoder.set_output_array(out);
+  auto nbatch = batch_count / batch_shape.back();
+  if (nbatch == 1) {
+    matmul.run(encoder, out.data<int8_t>(), a.data<int8_t>(), b.data<int8_t>());
+    return;
+  }
+
  ContiguousIterator a_it(batch_shape, a_batch_strides, batch_shape.size() - 1);
  ContiguousIterator b_it(batch_shape, b_batch_strides, batch_shape.size() - 1);
-  for (size_t i = 0; i < batch_count / batch_shape.back(); ++i) {
+  auto concurrent = encoder.concurrent_context();
+  for (size_t i = 0; i < nbatch; ++i) {
    matmul.run(
        encoder,
        out.data<int8_t>() + out.itemsize() * i * batch_shape.back() * M * N,
@@ -392,14 +402,9 @@ void AddMM::eval_gpu(const std::vector<array>& inputs, array& out) {

  // Keep a vector with copies to be cleared in the completed buffer to release
  // the arrays
-  std::vector<array> copies;
-  auto [a_transposed, lda, a] = check_transpose(copies, s, a_pre);
-  auto [b_transposed, ldb, b] = check_transpose(copies, s, b_pre);
-  auto [c_transposed, ldc, c] = check_transpose(copies, s, c_pre);
-
-  for (auto& temp : copies) {
-    encoder.add_temporary(temp);
-  }
+  auto [a_transposed, lda, a] = check_transpose(encoder, s, a_pre);
+  auto [b_transposed, ldb, b] = check_transpose(encoder, s, b_pre);
+  auto [c_transposed, ldc, c] = check_transpose(encoder, s, c_pre);

  /////////////////////////////////////////////////////////////////////////////
  // Check and collapse batch dimensions
@@ -427,7 +432,7 @@ void AddMM::eval_gpu(const std::vector<array>& inputs, array& out) {
  // Invoke cublasLt

  cu::MatMul matmul(
-      encoder.device(),
+      cu::device(s.device),
      a.dtype(),
      a_transposed,
      M,
@@ -444,10 +449,29 @@ void AddMM::eval_gpu(const std::vector<array>& inputs, array& out) {
      b_batch_strides.back(),
      c_batch_strides.back());

+  encoder.set_input_array(a);
+  encoder.set_input_array(b);
+  encoder.set_input_array(c);
+  encoder.set_output_array(out);
+
+  auto nbatch = batch_count / batch_shape.back();
+  if (nbatch == 1) {
+    matmul.run(
+        encoder,
+        out.data<int8_t>(),
+        a.data<int8_t>(),
+        b.data<int8_t>(),
+        c.data<int8_t>(),
+        alpha_,
+        beta_);
+    return;
+  }
+
  ContiguousIterator a_it(batch_shape, a_batch_strides, batch_shape.size() - 1);
  ContiguousIterator b_it(batch_shape, b_batch_strides, batch_shape.size() - 1);
  ContiguousIterator c_it(batch_shape, c_batch_strides, batch_shape.size() - 1);
-  for (size_t i = 0; i < batch_count / batch_shape.back(); ++i) {
+  auto concurrent = encoder.concurrent_context();
+  for (size_t i = 0; i < nbatch; ++i) {
    matmul.run(
        encoder,
        out.data<int8_t>() + out.itemsize() * i * batch_shape.back() * M * N,
--- a/mlx/backend/cuda/primitives.cu
+++ b/mlx/backend/cuda/primitives.cu
@@ -24,22 +24,21 @@ void Arange::eval_gpu(const std::vector<array>& inputs, array& out) {
  if (out.size() == 0) {
    return;
  }
-  auto& s = stream();
-  auto& encoder = cu::get_command_encoder(s);
+  auto& encoder = cu::get_command_encoder(stream());
  encoder.set_output_array(out);
-  encoder.launch_kernel([&, this](cudaStream_t stream) {
-    MLX_SWITCH_INT_FLOAT_TYPES_CHECKED(out.dtype(), "Arange", CTYPE, {
-      using OutType = cuda_type_t<CTYPE>;
-      CTYPE step =
-          static_cast<CTYPE>(start_ + step_) - static_cast<CTYPE>(start_);
-      thrust::transform(
-          cu::thrust_policy(stream),
-          thrust::counting_iterator<uint32_t>(0),
-          thrust::counting_iterator<uint32_t>(out.data_size()),
-          thrust::device_pointer_cast(out.data<OutType>()),
-          cu::Arange<OutType>{
-              static_cast<OutType>(start_), static_cast<OutType>(step)});
-    });
+  auto capture = encoder.capture_context();
+  dispatch_int_float_types(out.dtype(), "Arange", [&](auto type_tag) {
+    using CTYPE = MLX_GET_TYPE(type_tag);
+    using OutType = cuda_type_t<CTYPE>;
+    CTYPE step =
+        static_cast<CTYPE>(start_ + step_) - static_cast<CTYPE>(start_);
+    thrust::transform(
+        cu::thrust_policy(encoder.stream()),
+        thrust::counting_iterator<uint32_t>(0),
+        thrust::counting_iterator<uint32_t>(out.data_size()),
+        thrust::device_pointer_cast(out.data<OutType>()),
+        cu::Arange<OutType>{
+            static_cast<OutType>(start_), static_cast<OutType>(step)});
  });
 }

@@ -71,10 +70,8 @@ bool fast::ScaledDotProductAttention::use_fallback(
    throw std::runtime_error(#func " has no CUDA implementation.");   \
  }

-NO_GPU(ArgPartition)
 NO_GPU(BlockMaskedMM)
 NO_GPU(Convolution)
-NO_GPU_MULTI(DivMod)
 NO_GPU(DynamicSlice)
 NO_GPU(DynamicSliceUpdate)
 NO_GPU(FFT)
@@ -83,7 +80,6 @@ NO_GPU(GatherQMM)
 NO_GPU(Hadamard)
 NO_GPU(Load)
 NO_GPU_MULTI(LUF)
-NO_GPU(Partition)
 NO_GPU_MULTI(QRF)
 NO_GPU(QuantizedMatmul)
 NO_GPU(Scan)
--- a/mlx/backend/cuda/random.cu
+++ b/mlx/backend/cuda/random.cu
@@ -156,34 +156,39 @@ void RandomBits::eval_gpu(const std::vector<array>& inputs, array& out) {
  auto& encoder = cu::get_command_encoder(s);
  encoder.set_input_array(keys);
  encoder.set_output_array(out);
-  encoder.launch_kernel([&](cudaStream_t stream) {
-    dim3 grid_dims{num_keys, half_size + odd};
-    int64_t total = grid_dims.x * grid_dims.y;
-    int32_t threads_y = 1;
-    while ((total / threads_y) >= (1U << 31)) {
-      threads_y *= 2;
-    }
-    int32_t threads_x = cuda::ceil_div(total, threads_y);
-    auto [grid, block] = get_grid_and_block(threads_x, threads_y, 1);
-    if (keys.flags().row_contiguous) {
-      cu::rbitsc<<<grid, block, 0, stream>>>(
-          keys.data<uint32_t>(),
-          out.data<uint8_t>(),
-          grid_dims,
-          odd,
-          bytes_per_key);
-    } else {
-      cu::rbits<<<grid, block, 0, stream>>>(
-          keys.data<uint32_t>(),
-          out.data<uint8_t>(),
-          grid_dims,
-          odd,
-          bytes_per_key,
-          keys.ndim(),
-          const_param(keys.shape()),
-          const_param(keys.strides()));
-    }
-  });
+  dim3 grid_dims{num_keys, half_size + odd};
+  int64_t total = grid_dims.x * grid_dims.y;
+  int32_t threads_y = 1;
+  while ((total / threads_y) >= (1U << 31)) {
+    threads_y *= 2;
+  }
+  int32_t threads_x = cuda::ceil_div(total, threads_y);
+  auto [grid, block] = get_grid_and_block(threads_x, threads_y, 1);
+  auto& stream = encoder.stream();
+  if (keys.flags().row_contiguous) {
+    encoder.add_kernel_node(
+        cu::rbitsc,
+        grid,
+        block,
+        keys.data<uint32_t>(),
+        out.data<uint8_t>(),
+        grid_dims,
+        odd,
+        bytes_per_key);
+  } else {
+    encoder.add_kernel_node(
+        cu::rbits,
+        grid,
+        block,
+        keys.data<uint32_t>(),
+        out.data<uint8_t>(),
+        grid_dims,
+        odd,
+        bytes_per_key,
+        keys.ndim(),
+        const_param(keys.shape()),
+        const_param(keys.strides()));
+  }
 }

 } // namespace mlx::core
--- a/mlx/backend/cuda/reduce.cu
+++ b/mlx/backend/cuda/reduce.cu
@@ -21,28 +21,11 @@ void Reduce::eval_gpu(const std::vector<array>& inputs, array& out) {
  assert(!axes_.empty());
  assert(out.size() != in.size());

-  out.set_data(allocator::malloc(out.nbytes()));
-
  auto& s = stream();
  auto& encoder = cu::get_command_encoder(s);
-  encoder.set_input_array(in);
-  encoder.set_output_array(out);

-  // Fill out with init value.
  if (in.size() == 0) {
-    encoder.launch_kernel([&](cudaStream_t stream) {
-      MLX_SWITCH_ALL_TYPES(in.dtype(), CTYPE, {
-        MLX_SWITCH_REDUCE_OPS(reduce_type_, OP, {
-          using InType = cuda_type_t<CTYPE>;
-          using OutType = cu::ReduceResult<OP, InType>::type;
-          thrust::fill_n(
-              cu::thrust_policy(stream),
-              thrust::device_pointer_cast(out.data<OutType>()),
-              out.data_size(),
-              cu::ReduceInit<OP, InType>::value());
-        });
-      });
-    });
+    init_reduce(encoder, in, out, reduce_type_);
    return;
  }

@@ -51,7 +34,19 @@ void Reduce::eval_gpu(const std::vector<array>& inputs, array& out) {

  // If it is a general reduce then copy the input to a contiguous array and
  // recompute the plan.
-  if (plan.type == GeneralReduce) {
+  //
+  // TODO: Instead of copying we can use elem-to-loc to deal with broadcasting
+  //       like we do in Metal. When it comes to broadcasted reduction axes
+  //       some can be ignored eg for min/max.
+  bool broadcasted = false;
+  for (int i = 0, j = 0; i < in.ndim() && !broadcasted; i++) {
+    if (j < axes_.size() && axes_[j] == i) {
+      j++;
+    } else {
+      broadcasted = in.strides(i) == 0;
+    }
+  }
+  if (plan.type == GeneralReduce || broadcasted || !in.flags().contiguous) {
    array in_copy(in.shape(), in.dtype(), nullptr, {});
    copy_gpu(in, in_copy, CopyType::General, s);
    encoder.add_temporary(in_copy);
@@ -59,9 +54,8 @@ void Reduce::eval_gpu(const std::vector<array>& inputs, array& out) {
    plan = get_reduction_plan(in, axes_);
  }

-  if ((plan.type == ContiguousAllReduce) ||
-      (plan.type == ContiguousReduce && plan.shape.size() == 1)) {
-    segmented_reduce(encoder, in, out, reduce_type_, axes_, plan);
+  if (plan.type == ContiguousAllReduce) {
+    all_reduce(encoder, in, out, reduce_type_);
    return;
  }

--- a/mlx/backend/cuda/reduce/all_reduce.cu
+++ b/mlx/backend/cuda/reduce/all_reduce.cu
@@ -0,0 +1,157 @@
+// Copyright © 2025 Apple Inc.
+
+#include "mlx/backend/cuda/device.h"
+#include "mlx/backend/cuda/reduce/reduce.cuh"
+
+#include <cooperative_groups.h>
+#include <cooperative_groups/reduce.h>
+#include <cub/block/block_load.cuh>
+
+namespace mlx::core {
+
+namespace cu {
+
+namespace cg = cooperative_groups;
+
+// Reduces a flat buffer with ReduceOp, producing one partial result per block.
+//
+// Block b handles the elements [b * block_step, min((b + 1) * block_step,
+// size)) of `in` and stores its partial result in out[b].
+//
+// T is the type read from `in`, U the accumulator/output type and N the
+// number of elements a thread loads per step.
+template <typename T, typename U, typename ReduceOp, int N = 4>
+__global__ void all_reduce(T* in, U* out, size_t block_step, size_t size) {
+  // TODO: Process multiple "rows" in each thread
+  constexpr int M = 1;
+
+  auto grid = cg::this_grid();
+  auto block = cg::this_thread_block();
+  auto warp = cg::tiled_partition<WARP_SIZE>(block);
+
+  const U init = cu::ReduceInit<ReduceOp, T>::value();
+  ReduceOp op;
+
+  T vals[N];
+  U accs[M];
+  accs[0] = init;
+
+  // Element range owned by this block, clamped to the size of the input.
+  size_t start = grid.block_rank() * block_step;
+  size_t check = min(start + block_step, size);
+
+  // Full steps: every thread loads exactly N elements.
+  size_t i = start;
+  for (; i + block.size() * N <= check; i += block.size() * N) {
+    cub::LoadDirectBlockedVectorized<T, N>(block.thread_rank(), in + i, vals);
+    for (int j = 0; j < N; j++) {
+      accs[0] = op(accs[0], __cast<U, T>(vals[j]));
+    }
+  }
+
+  // Tail: guarded load of the remaining check - i elements. Slots past the
+  // end are filled with the init value converted to T.
+  if (i < check) {
+    cub::LoadDirectBlocked(
+        block.thread_rank(), in + i, vals, check - i, __cast<T, U>(init));
+    for (int j = 0; j < N; j++) {
+      accs[0] = op(accs[0], __cast<U, T>(vals[j]));
+    }
+  }
+
+  // Combine the per-thread accumulators of the whole block.
+  __shared__ U shared_accumulators[32];
+  block_reduce(block, warp, accs, shared_accumulators, op, init);
+
+  if (block.thread_rank() == 0) {
+    out[grid.block_rank()] = accs[0];
+  }
+}
+
+} // namespace cu
+
+// Reduces all elements of `in` with the op selected by reduce_type.
+//
+// When the chosen launch configuration uses a single block the result is
+// written to `out` by one kernel launch. Otherwise a first launch writes one
+// partial result per block into a temporary array and a second launch reduces
+// those partials into `out`.
+void all_reduce(
+    cu::CommandEncoder& encoder,
+    const array& in,
+    array& out,
+    Reduce::ReduceType reduce_type) {
+  constexpr int N_READS = 8;
+
+  out.set_data(allocator::malloc(out.nbytes()));
+
+  // Computes the launch configuration for reducing `size` elements with N
+  // reads per thread. Returns (blocks, threads per block, elements per block).
+  auto get_args = [](size_t size, int N) {
+    // Name the type explicitly: std::min deduces a single type for both
+    // arguments and size_t is not unsigned long on every platform.
+    int threads = std::min<size_t>(512, (size + N - 1) / N);
+    // Round up to a whole number of warps.
+    threads = ((threads + WARP_SIZE - 1) / WARP_SIZE) * WARP_SIZE;
+    int reductions_per_step = threads * N;
+    size_t steps_needed =
+        (size + reductions_per_step - 1) / reductions_per_step;
+
+    // Use more blocks the more steps there are to do.
+    int blocks;
+    if (steps_needed < 32) {
+      blocks = 1;
+    } else if (steps_needed < 128) {
+      blocks = 32;
+    } else if (steps_needed < 512) {
+      blocks = 128;
+    } else if (steps_needed < 1024) {
+      blocks = 512;
+    } else {
+      blocks = 1024;
+    }
+
+    // Each block gets a whole number of steps so that block boundaries are
+    // multiples of threads * N elements.
+    size_t steps_per_block = (steps_needed + blocks - 1) / blocks;
+    size_t block_step = steps_per_block * reductions_per_step;
+
+    return std::make_tuple(blocks, threads, block_step);
+  };
+
+  int blocks, threads;
+  size_t block_step;
+  size_t insize = in.size();
+  Dtype dt = in.dtype();
+
+  // Cub doesn't like const pointers for load (sigh).
+  void* indata = const_cast<void*>(in.data<void>());
+
+  // Large array so allocate an intermediate and accumulate there
+  std::tie(blocks, threads, block_step) = get_args(insize, N_READS);
+  encoder.set_input_array(in);
+  if (blocks > 1) {
+    array intermediate({blocks}, out.dtype(), nullptr, {});
+    intermediate.set_data(allocator::malloc(intermediate.nbytes()));
+    encoder.add_temporary(intermediate);
+    encoder.set_output_array(intermediate);
+    dispatch_all_types(dt, [&](auto type_tag) {
+      dispatch_reduce_ops(reduce_type, [&](auto reduce_type_tag) {
+        using OP = MLX_GET_TYPE(reduce_type_tag);
+        using T = cuda_type_t<MLX_GET_TYPE(type_tag)>;
+        using U = typename cu::ReduceResult<OP, T>::type;
+        auto kernel = cu::all_reduce<T, U, OP, N_READS>;
+        encoder.add_kernel_node(
+            kernel,
+            blocks,
+            threads,
+            static_cast<T*>(indata),
+            intermediate.data<U>(),
+            block_step,
+            insize);
+      });
+    });
+
+    // Set the input for the next step and recalculate the blocks
+    indata = intermediate.data<void>();
+    dt = intermediate.dtype();
+    insize = intermediate.size();
+    std::tie(blocks, threads, block_step) = get_args(insize, N_READS);
+    encoder.set_input_array(intermediate);
+  }
+
+  // Final (or only) pass: reduce the current input into out.
+  encoder.set_output_array(out);
+  dispatch_all_types(dt, [&](auto type_tag) {
+    dispatch_reduce_ops(reduce_type, [&](auto reduce_type_tag) {
+      using OP = MLX_GET_TYPE(reduce_type_tag);
+      using T = cuda_type_t<MLX_GET_TYPE(type_tag)>;
+      using U = typename cu::ReduceResult<OP, T>::type;
+      auto kernel = cu::all_reduce<T, U, OP, N_READS>;
+      encoder.add_kernel_node(
+          kernel,
+          blocks,
+          threads,
+          static_cast<T*>(indata),
+          out.data<U>(),
+          block_step,
+          insize);
+    });
+  });
+}
+
+} // namespace mlx::core
--- a/mlx/backend/cuda/reduce/col_reduce.cu
+++ b/mlx/backend/cuda/reduce/col_reduce.cu
@@ -1,5 +1,7 @@
 // Copyright © 2025 Apple Inc.

+#include <numeric>
+
 #include "mlx/backend/cuda/device.h"
 #include "mlx/backend/cuda/device/cast_op.cuh"
 #include "mlx/backend/cuda/reduce/reduce.cuh"
@@ -36,19 +38,36 @@ struct ColReduceArgs {
      const array& in,
      const ReductionPlan& plan,
      const std::vector<int>& axes) {
+    using ShapeVector = decltype(plan.shape);
+    using StridesVector = decltype(plan.strides);
+
+    ShapeVector shape_vec;
+    StridesVector strides_vec;
+
    assert(!plan.shape.empty());
    reduction_size = plan.shape.back();
    reduction_stride = plan.strides.back();

    int64_t stride_back = 1;
-    auto [shape_vec, strides_vec] = shapes_without_reduction_axes(in, axes);
+    std::tie(shape_vec, strides_vec) = shapes_without_reduction_axes(in, axes);
    while (!shape_vec.empty() && stride_back < reduction_stride) {
      stride_back *= shape_vec.back();
      shape_vec.pop_back();
      strides_vec.pop_back();
    }
+    std::vector<int> indices(shape_vec.size());
+    std::iota(indices.begin(), indices.end(), 0);
+    std::sort(indices.begin(), indices.end(), [&](int left, int right) {
+      return strides_vec[left] > strides_vec[right];
+    });
+    ShapeVector sorted_shape;
+    StridesVector sorted_strides;
+    for (auto idx : indices) {
+      sorted_shape.push_back(shape_vec[idx]);
+      sorted_strides.push_back(strides_vec[idx]);
+    }
    std::tie(shape_vec, strides_vec) =
-        collapse_contiguous_dims(shape_vec, strides_vec);
+        collapse_contiguous_dims(sorted_shape, sorted_strides);
    shape = const_param(shape_vec);
    strides = const_param(strides_vec);
    ndim = shape_vec.size();
@@ -64,86 +83,6 @@ struct ColReduceArgs {
  }
 };

-template <typename T, typename U, typename Op, int NDIM, int N_READS = 4>
-__global__ void col_reduce_small(
-    const T* in,
-    U* out,
-    const __grid_constant__ ColReduceArgs args) {
-  auto grid = cg::this_grid();
-  auto block = cg::this_thread_block();
-
-  int column =
-      grid.block_index().x * block.dim_threads().x + block.thread_index().x;
-  if (column * N_READS >= args.reduction_stride) {
-    return;
-  }
-
-  int out_idx = grid.block_rank() / grid.dim_blocks().x;
-  in += elem_to_loc(out_idx, args.shape.data(), args.strides.data(), args.ndim);
-
-  Op op;
-  U totals[N_READS];
-  for (int i = 0; i < N_READS; i++) {
-    totals[i] = ReduceInit<Op, T>::value();
-  }
-
-  // Read input to local.
-  LoopedElemToLoc<NDIM, (NDIM > 2)> loop(args.reduce_ndim);
-  loop.next(
-      block.thread_index().y,
-      args.reduce_shape.data(),
-      args.reduce_strides.data());
-  for (size_t r = block.thread_index().y;
-       r < args.non_col_reductions * args.reduction_size;
-       r += block.dim_threads().y) {
-    U vals[N_READS];
-    cub::LoadDirectBlocked(
-        column,
-        make_cast_iterator<U>(in + loop.location()),
-        vals,
-        args.reduction_stride,
-        ReduceInit<Op, T>::value());
-    for (int i = 0; i < N_READS; i++) {
-      totals[i] = op(vals[i], totals[i]);
-    }
-    loop.next(
-        block.dim_threads().y,
-        args.reduce_shape.data(),
-        args.reduce_strides.data());
-  }
-
-  // Do block reduce when each column has more than 1 element to reduce.
-  if (block.dim_threads().y > 1) {
-    __shared__ U shared_vals[32 * 8 * N_READS];
-    size_t col =
-        block.thread_index().y * block.dim_threads().x + block.thread_index().x;
-    for (int i = 0; i < N_READS; i++) {
-      shared_vals[col * N_READS + i] = totals[i];
-    }
-    block.sync();
-    if (block.thread_index().y == 0) {
-      for (int i = 0; i < N_READS; i++) {
-        totals[i] = shared_vals[block.thread_index().x * N_READS + i];
-      }
-      for (int j = 1; j < block.dim_threads().y; j++) {
-        col = j * block.dim_threads().x + block.thread_index().x;
-        for (int i = 0; i < N_READS; i++) {
-          totals[i] = op(shared_vals[col * N_READS + i], totals[i]);
-        }
-      }
-    }
-  }
-
-  // Write result.
-  if (block.thread_index().y == 0) {
-    cub::StoreDirectBlocked(
-        column,
-        out + out_idx * args.reduction_stride,
-        totals,
-        args.reduction_stride);
-  }
-}
-
 template <
    typename T,
    typename U,
@@ -152,67 +91,94 @@ template <
    int BM,
    int BN,
    int N_READS = 4>
-__global__ void col_reduce_looped(
-    const T* in,
-    U* out,
-    const __grid_constant__ ColReduceArgs args) {
+__global__ void
+col_reduce_looped(T* in, U* out, const __grid_constant__ ColReduceArgs args) {
  auto grid = cg::this_grid();
  auto block = cg::this_thread_block();
  auto warp = cg::tiled_partition<WARP_SIZE>(block);

-  constexpr int n_warps = BN / N_READS;
+  constexpr int threads_per_row = BN / N_READS;

-  int out_idx = grid.block_rank() / grid.dim_blocks().x;
-  in += elem_to_loc(out_idx, args.shape.data(), args.strides.data(), args.ndim);
+  // Compute the indices for the tile
+  size_t tile_idx = grid.block_rank();
+  size_t tile_x = tile_idx % ((args.reduction_stride + BN - 1) / BN);
+  size_t tile_y = tile_idx / ((args.reduction_stride + BN - 1) / BN);

+  // Compute the indices for the thread within the tile
+  short thread_x = block.thread_rank() % threads_per_row;
+  short thread_y = block.thread_rank() / threads_per_row;
+
+  // Move the input pointer
+  in += elem_to_loc(tile_y, args.shape.data(), args.strides.data(), args.ndim) +
+      tile_x * BN;
+
+  // Initialize the running totals
  Op op;
  U totals[N_READS];
  for (int i = 0; i < N_READS; i++) {
    totals[i] = ReduceInit<Op, T>::value();
  }

-  // Read input to local.
-  int r = block.thread_rank() / n_warps;
-  int column = block.thread_rank() % n_warps;
-  int in_offset = grid.block_index().x * BN;
  LoopedElemToLoc<NDIM, (NDIM > 2)> loop(args.reduce_ndim);
-  loop.next(r, args.reduce_shape.data(), args.reduce_strides.data());
-  for (; r < args.non_col_reductions * args.reduction_size; r += BM) {
-    U vals[N_READS];
-    cub::LoadDirectBlocked(
-        column,
-        make_cast_iterator<U>(in + loop.location() + in_offset),
-        vals,
-        args.reduction_stride - in_offset,
-        ReduceInit<Op, T>::value());
-    for (int i = 0; i < N_READS; i++) {
-      totals[i] = op(vals[i], totals[i]);
+  loop.next(thread_y, args.reduce_shape.data(), args.reduce_strides.data());
+  size_t total = args.non_col_reductions * args.reduction_size;
+  if (tile_x * BN + BN <= args.reduction_stride) {
+    if (args.reduction_stride % N_READS == 0) {
+      for (size_t r = thread_y; r < total; r += BM) {
+        T vals[N_READS];
+        cub::LoadDirectBlockedVectorized(thread_x, in + loop.location(), vals);
+        for (int i = 0; i < N_READS; i++) {
+          totals[i] = op(totals[i], __cast<U, T>(vals[i]));
+        }
+        loop.next(BM, args.reduce_shape.data(), args.reduce_strides.data());
+      }
+    } else {
+      for (size_t r = thread_y; r < total; r += BM) {
+        T vals[N_READS];
+        cub::LoadDirectBlocked(thread_x, in + loop.location(), vals);
+        for (int i = 0; i < N_READS; i++) {
+          totals[i] = op(totals[i], __cast<U, T>(vals[i]));
+        }
+        loop.next(BM, args.reduce_shape.data(), args.reduce_strides.data());
+      }
+    }
+  } else {
+    for (size_t r = thread_y; r < total; r += BM) {
+      T vals[N_READS];
+      cub::LoadDirectBlocked(
+          thread_x,
+          in + loop.location(),
+          vals,
+          args.reduction_stride - tile_x * BN,
+          __cast<T, U>(ReduceInit<Op, T>::value()));
+      for (int i = 0; i < N_READS; i++) {
+        totals[i] = op(totals[i], __cast<U, T>(vals[i]));
+      }
+      loop.next(BM, args.reduce_shape.data(), args.reduce_strides.data());
    }
-    loop.next(BM, args.reduce_shape.data(), args.reduce_strides.data());
  }

  // Do warp reduce for each output.
-  constexpr int n_outputs = BN / n_warps;
+  constexpr int n_outputs = BN / threads_per_row;
  static_assert(BM == 32 && n_outputs == N_READS);
  __shared__ U shared_vals[BM * BN];
-  size_t col = block.thread_index().y * BN + block.thread_index().x * N_READS;
+  short s_idx = thread_y * BN + thread_x * N_READS;
  for (int i = 0; i < N_READS; i++) {
-    shared_vals[col + i] = totals[i];
+    shared_vals[s_idx + i] = totals[i];
  }
  block.sync();
-  col = warp.thread_rank() * BN + warp.meta_group_rank() * n_outputs;
+  s_idx = warp.thread_rank() * BN + warp.meta_group_rank() * n_outputs;
  for (int i = 0; i < n_outputs; i++) {
-    totals[i] = cg::reduce(warp, shared_vals[col + i], op);
+    totals[i] = cg::reduce(warp, shared_vals[s_idx + i], op);
  }

  // Write result.
  if (warp.thread_rank() == 0) {
-    size_t out_offset = grid.block_index().x * BN;
    cub::StoreDirectBlocked(
        warp.meta_group_rank(),
-        out + out_idx * args.reduction_stride + out_offset,
+        out + tile_y * args.reduction_stride + tile_x * BN,
        totals,
-        args.reduction_stride - out_offset);
+        args.reduction_stride - tile_x * BN);
  }
 }

@@ -220,14 +186,55 @@ __global__ void col_reduce_looped(

 inline auto output_grid_for_col_reduce(
    const array& out,
-    const cu::ColReduceArgs& args) {
-  auto out_shape = out.shape();
-  auto out_strides = out.strides();
-  while (!out_shape.empty() && out_strides.back() < args.reduction_stride) {
-    out_shape.pop_back();
-    out_strides.pop_back();
+    const cu::ColReduceArgs& args,
+    int bn) {
+  int gx, gy = 1;
+  size_t n_inner_blocks = cuda::ceil_div(args.reduction_stride, bn);
+  size_t n_outer_blocks = out.size() / args.reduction_stride;
+  size_t n_blocks = n_outer_blocks * n_inner_blocks;
+  while (n_blocks / gy > INT32_MAX) {
+    gy *= 2;
  }
-  return get_2d_grid_dims(out_shape, out_strides);
+  gx = cuda::ceil_div(n_blocks, gy);
+
+  return dim3(gx, gy, 1);
+}
+
+// Host-side launcher for the cu::col_reduce_looped kernel: allocates `out`,
+// registers the arrays with the encoder and enqueues the kernel instantiated
+// for the input dtype, the reduce op and the number of reduction dims.
+void col_reduce_looped(
+    cu::CommandEncoder& encoder,
+    const array& in,
+    array& out,
+    Reduce::ReduceType reduce_type,
+    const std::vector<int>& axes,
+    const ReductionPlan& plan,
+    cu::ColReduceArgs args) {
+  // Give out the layout of in so that both are accessed as contiguously as
+  // possible.
+  allocate_same_layout(out, in, axes);
+
+  encoder.set_input_array(in);
+  encoder.set_output_array(out);
+  dispatch_all_types(in.dtype(), [&](auto type_tag) {
+    dispatch_reduce_ops(reduce_type, [&](auto reduce_type_tag) {
+      dispatch_reduce_ndim(args.reduce_ndim, [&](auto reduce_ndim) {
+        using T = cuda_type_t<MLX_GET_TYPE(type_tag)>;
+        using OP = MLX_GET_TYPE(reduce_type_tag);
+        using U = typename cu::ReduceResult<OP, T>::type;
+
+        constexpr int N_READS = 4;
+        constexpr int BM = 32;
+        constexpr int BN = 32;
+
+        // Cub doesn't like const pointers for vectorized loads. (sigh)
+        T* src = const_cast<T*>(in.data<T>());
+
+        dim3 num_blocks = output_grid_for_col_reduce(out, args, BN);
+        int threads_per_block = BM * BN / N_READS;
+        auto kernel =
+            cu::col_reduce_looped<T, U, OP, reduce_ndim(), BM, BN, N_READS>;
+        encoder.add_kernel_node(
+            kernel, num_blocks, threads_per_block, src, out.data<U>(), args);
+      });
+    });
+  });
 }

 void col_reduce(
@@ -237,42 +244,23 @@ void col_reduce(
    Reduce::ReduceType reduce_type,
    const std::vector<int>& axes,
    const ReductionPlan& plan) {
+  // Current col reduce options
+  //
+  // - col_reduce_looped
+  //
+  //   It is a general strided reduce. Each threadblock computes the output for
+  //   a subrow of the fast moving axis. For instance 32 elements.
+  //
+  // Notes: As in row reduce we opt to read as much in order as possible and
+  //        leave transpositions as they are (contrary to our Metal backend).
+  //
+  //        Moreover we need different kernels for short rows and tuning
+
+  // Make the args struct to help route to the best kernel
  cu::ColReduceArgs args(in, plan, axes);

-  encoder.launch_kernel([&](cudaStream_t stream) {
-    MLX_SWITCH_ALL_TYPES(in.dtype(), CTYPE, {
-      using InType = cuda_type_t<CTYPE>;
-      MLX_SWITCH_REDUCE_OPS(reduce_type, OP, {
-        using OutType = cu::ReduceResult<OP, InType>::type;
-        MLX_SWITCH_REDUCE_NDIM(args.reduce_ndim, NDIM, {
-          constexpr int N_READS = 4;
-          dim3 block_dims;
-          dim3 num_blocks = output_grid_for_col_reduce(out, args);
-          num_blocks.z = num_blocks.y;
-          num_blocks.y = num_blocks.x;
-          auto kernel =
-              cu::col_reduce_small<InType, OutType, OP, NDIM, N_READS>;
-          size_t total = args.non_col_reductions * args.reduction_size;
-          if (total < 32) {
-            size_t stride_blocks =
-                cuda::ceil_div(args.reduction_stride, N_READS);
-            block_dims.x = std::min(stride_blocks, 32ul);
-            block_dims.y = std::min(total, 8ul);
-            num_blocks.x = cuda::ceil_div(stride_blocks, block_dims.x);
-          } else {
-            constexpr int BM = 32;
-            constexpr int BN = 32;
-            block_dims.x = BM * BN / N_READS;
-            num_blocks.x = cuda::ceil_div(args.reduction_stride, BN);
-            kernel = cu::
-                col_reduce_looped<InType, OutType, OP, NDIM, BM, BN, N_READS>;
-          }
-          kernel<<<num_blocks, block_dims, 0, stream>>>(
-              in.data<InType>(), out.data<OutType>(), args);
-        });
-      });
-    });
-  });
+  // Fallback col reduce
+  col_reduce_looped(encoder, in, out, reduce_type, axes, plan, args);
 }

 } // namespace mlx::core
--- a/mlx/backend/cuda/reduce/init_reduce.cu
+++ b/mlx/backend/cuda/reduce/init_reduce.cu
@@ -0,0 +1,49 @@
+// Copyright © 2025 Apple Inc.
+
+#include "mlx/backend/cuda/device.h"
+#include "mlx/backend/cuda/reduce/reduce.cuh"
+
+#include <cooperative_groups.h>
+
+namespace mlx::core {
+
+namespace cu {
+
+namespace cg = cooperative_groups;
+
+// Writes ReduceInit<Op, T>::value() to the first `size` elements of `out`,
+// one element per thread of the grid.
+template <typename T, typename U, typename Op>
+__global__ void init_reduce(U* out, size_t size) {
+  const auto tid = cg::this_grid().thread_rank();
+  // Threads past the end of the output have nothing to do.
+  if (tid >= size) {
+    return;
+  }
+  out[tid] = ReduceInit<Op, T>::value();
+}
+
+} // namespace cu
+
+// Fills `out` with the init value of the reduction selected by reduce_type
+// for the dtype of `in`, allocating `out` first when it has no data yet.
+void init_reduce(
+    cu::CommandEncoder& encoder,
+    const array& in,
+    array& out,
+    Reduce::ReduceType reduce_type) {
+  // Allocate if needed
+  if (!out.data_shared_ptr()) {
+    out.set_data(allocator::malloc(out.nbytes()));
+  }
+
+  encoder.set_output_array(out);
+  dispatch_all_types(in.dtype(), [&](auto type_tag) {
+    dispatch_reduce_ops(reduce_type, [&](auto reduce_type_tag) {
+      using T = cuda_type_t<MLX_GET_TYPE(type_tag)>;
+      using OP = MLX_GET_TYPE(reduce_type_tag);
+      using U = typename cu::ReduceResult<OP, T>::type;
+
+      // Split the x extent of the 2D grid into blocks of at most 1024
+      // threads.
+      constexpr unsigned int max_threads = 1024;
+      dim3 num_blocks = get_2d_grid_dims(out.shape(), out.strides());
+      dim3 block_dims(
+          num_blocks.x < max_threads ? num_blocks.x : max_threads, 1, 1);
+      num_blocks.x = (num_blocks.x + max_threads - 1) / max_threads;
+
+      auto kernel = cu::init_reduce<T, U, OP>;
+      encoder.add_kernel_node(
+          kernel, num_blocks, block_dims, out.data<U>(), out.size());
+    });
+  });
+}
+
+} // namespace mlx::core
--- a/mlx/backend/cuda/reduce/reduce.cuh
+++ b/mlx/backend/cuda/reduce/reduce.cuh
@@ -1,5 +1,7 @@
 // Copyright © 2025 Apple Inc.

+#include <type_traits>
+
 #include "mlx/backend/common/reduce.h"
 #include "mlx/backend/cuda/device/cucomplex_math.cuh"
 #include "mlx/backend/cuda/kernel_utils.cuh"
@@ -9,51 +11,41 @@

 namespace mlx::core {

-// Dispatch dynamic ndim to constexpr.
-// The behavior follows get_kernel_reduce_ndim in metal/reduce.cpp file.
-#define MLX_SWITCH_REDUCE_NDIM(ndim, NDIM, ...) \
-  if (ndim == 1) {                              \
-    constexpr uint32_t NDIM = 1;                \
-    __VA_ARGS__;                                \
-  } else if (ndim == 2) {                       \
-    constexpr uint32_t NDIM = 2;                \
-    __VA_ARGS__;                                \
-  } else {                                      \
-    constexpr uint32_t NDIM = 5;                \
-    __VA_ARGS__;                                \
+template <typename F>
+void dispatch_reduce_ndim(int ndim, F&& f) {
+  if (ndim == 1) {
+    f(std::integral_constant<int, 1>{});
+  } else if (ndim == 2) {
+    f(std::integral_constant<int, 2>{});
+  } else {
+    f(std::integral_constant<int, 5>{});
  }
+}

-// Dispatch reduce ops to constexpr.
-#define MLX_SWITCH_REDUCE_OPS(REDUCE, OP, ...)           \
-  if (REDUCE == Reduce::ReduceType::And) {               \
-    using OP = cu::And;                                  \
-    __VA_ARGS__;                                         \
-  } else if (REDUCE == Reduce::ReduceType::Or) {         \
-    using OP = cu::Or;                                   \
-    __VA_ARGS__;                                         \
-  } else if (REDUCE == Reduce::ReduceType::Sum) {        \
-    using OP = cu::Sum;                                  \
-    __VA_ARGS__;                                         \
-  } else if (REDUCE == Reduce::ReduceType::Prod) {       \
-    using OP = cu::Prod;                                 \
-    __VA_ARGS__;                                         \
-  } else if (REDUCE == Reduce::ReduceType::Max) {        \
-    using OP = cu::Max;                                  \
-    __VA_ARGS__;                                         \
-  } else if (REDUCE == Reduce::ReduceType::Min) {        \
-    using OP = cu::Min;                                  \
-    __VA_ARGS__;                                         \
-  } else {                                               \
-    throw std::invalid_argument("Unknown reduce type."); \
+template <typename F>
+void dispatch_reduce_ops(Reduce::ReduceType reduce_type, F&& f) {
+  if (reduce_type == Reduce::ReduceType::And) {
+    f(type_identity<cu::And>{});
+  } else if (reduce_type == Reduce::ReduceType::Or) {
+    f(type_identity<cu::Or>{});
+  } else if (reduce_type == Reduce::ReduceType::Sum) {
+    f(type_identity<cu::Sum>{});
+  } else if (reduce_type == Reduce::ReduceType::Prod) {
+    f(type_identity<cu::Prod>{});
+  } else if (reduce_type == Reduce::ReduceType::Max) {
+    f(type_identity<cu::Max>{});
+  } else if (reduce_type == Reduce::ReduceType::Min) {
+    f(type_identity<cu::Min>{});
+  } else {
+    throw std::invalid_argument("Unknown reduce type.");
  }
+}

-void segmented_reduce(
+void all_reduce(
    cu::CommandEncoder& encoder,
    const array& in,
    array& out,
-    Reduce::ReduceType reduce_type,
-    const std::vector<int>& axes,
-    const ReductionPlan& plan);
+    Reduce::ReduceType reduce_type);

 void row_reduce(
    cu::CommandEncoder& encoder,
@@ -71,4 +63,10 @@ void col_reduce(
    const std::vector<int>& axes,
    const ReductionPlan& plan);

+void init_reduce(
+    cu::CommandEncoder& encoder,
+    const array& in,
+    array& out,
+    Reduce::ReduceType reduce_type);
+
 } // namespace mlx::core
--- a/mlx/backend/cuda/reduce/reduce_ops.cuh
+++ b/mlx/backend/cuda/reduce/reduce_ops.cuh
@@ -3,48 +3,89 @@
 #pragma once

 #include "mlx/backend/cuda/device/utils.cuh"
+#include "mlx/backend/cuda/reduce/reduce_utils.cuh"

 namespace mlx::core::cu {

 // Reduce ops.
 struct And {
-  __device__ bool operator()(bool a, bool b) {
+  __device__ __forceinline__ bool operator()(bool a, bool b) {
    return a && b;
  }
+
+  __device__ void atomic_update(bool* x, bool y) {
+    atomic_reduce<bool, And>(x, y);
+  }
 };

 struct Or {
-  __device__ bool operator()(bool a, bool b) {
+  __device__ __forceinline__ bool operator()(bool a, bool b) {
    return a || b;
  }
+
+  __device__ void atomic_update(bool* x, bool y) {
+    atomic_reduce<bool, Or>(x, y);
+  }
 };

 struct Sum {
  template <typename T>
-  __device__ T operator()(T a, T b) {
+  __device__ __forceinline__ T operator()(T a, T b) {
    return a + b;
  }
+
+  template <typename T>
+  __device__ void atomic_update(T* x, T y) {
+    atomic_reduce<T, Sum>(x, y);
+  }
+
+  __device__ void atomic_update(__nv_bfloat16* x, __nv_bfloat16 y) {
+    atomicAdd(x, y);
+  }
+
+  __device__ void atomic_update(int* x, int y) {
+    atomicAdd(x, y);
+  }
+
+  __device__ void atomic_update(float* x, float y) {
+    atomicAdd(x, y);
+  }
 };

 struct Prod {
  template <typename T>
-  __device__ T operator()(T a, T b) {
+  __device__ __forceinline__ T operator()(T a, T b) {
    return a * b;
  }
+
+  template <typename T>
+  __device__ void atomic_update(T* x, T y) {
+    atomic_reduce<T, Prod>(x, y);
+  }
 };

 struct Min {
  template <typename T>
-  __device__ T operator()(T a, T b) {
+  __device__ __forceinline__ T operator()(T a, T b) {
    return a < b ? a : b;
  }
+
+  template <typename T>
+  __device__ void atomic_update(T* x, T y) {
+    atomic_reduce<T, Min>(x, y);
+  }
 };

 struct Max {
  template <typename T>
-  __device__ T operator()(T a, T b) {
+  __device__ __forceinline__ T operator()(T a, T b) {
    return a > b ? a : b;
  }
+
+  template <typename T>
+  __device__ void atomic_update(T* x, T y) {
+    atomic_reduce<T, Max>(x, y);
+  }
 };

 // Traits to get the result type of reduce op.
@@ -120,7 +161,7 @@ template <typename T>
 struct ReduceInit<Prod, T> {
  static constexpr __host__ __device__ auto value() {
    if constexpr (cuda::std::is_same_v<T, cuComplex>) {
-      return T{1, 1};
+      return T{1, 0};
    } else {
      return typename ReduceResult<Prod, T>::type{1};
    }
--- a/mlx/backend/cuda/reduce/reduce_utils.cuh
+++ b/mlx/backend/cuda/reduce/reduce_utils.cuh
@@ -0,0 +1,158 @@
+// Copyright © 2025 Apple Inc.
+
+#pragma once
+
+#include <numeric>
+
+#include "mlx/backend/cuda/device/utils.cuh"
+
+#include <cooperative_groups.h>
+#include <cooperative_groups/reduce.h>
+
+namespace mlx::core {
+
+namespace cu {
+
+namespace cg = cooperative_groups;
+
+// Maps a size in bytes to an unsigned integer type of that size. Only sizes
+// 2, 4 and 8 are specialized; any other size has no definition.
+template <size_t N>
+struct uint_by_size;
+// 2 bytes
+template <>
+struct uint_by_size<2> {
+  using type = uint16_t;
+};
+// 4 bytes
+template <>
+struct uint_by_size<4> {
+  using type = uint32_t;
+};
+// 8 bytes
+template <>
+struct uint_by_size<8> {
+  using type = unsigned long long int;
+};
+
+// Atomically replaces *x with Op{}(*x, y) using a compare-and-swap loop.
+//
+// One-byte types are updated through the 2-byte aligned 16-bit word that
+// contains them; other types are reinterpreted as the unsigned integer of the
+// same size given by uint_by_size.
+template <typename T, typename Op>
+__device__ void atomic_reduce(T* x, T y) {
+  if constexpr (sizeof(T) == 1) {
+    using U = uint16_t;
+    // Round the address down to an even byte boundary.
+    U* x_int = (U*)((char*)x - ((size_t)x % 2));
+    // Bit offset (0 or 8) of the target byte inside that word.
+    int shift = ((char*)x - (char*)x_int) * 8;
+    int mask = 0xff << shift;
+    U old_val, new_val;
+    do {
+      old_val = *x_int;
+      // Apply the op to the target byte only and splice the result back in,
+      // keeping the neighbouring byte as it was read.
+      T result = Op{}(static_cast<T>((old_val >> shift) & 0xff), y);
+      new_val = (old_val & ~mask) | (result << shift);
+      // Retry when the word changed between the read and the swap.
+    } while (atomicCAS(x_int, old_val, new_val) != old_val);
+  } else {
+    using U = typename uint_by_size<sizeof(T)>::type;
+    U* x_int = (U*)(x);
+    U old_val, new_val;
+    do {
+      old_val = *x_int;
+      // Reinterpret the bits as T, apply the op, and reinterpret back.
+      T result = Op{}(*((T*)&old_val), y);
+      new_val = *((U*)&result);
+      // Retry when the value changed between the read and the swap.
+    } while (atomicCAS(x_int, old_val, new_val) != old_val);
+  }
+}
+
+// TODO: Should make a custom complex type
+//
+// Converts x from T to U. The generic version is a plain static_cast; the
+// specializations below provide the bool <-> cuComplex conversions.
+template <typename U, typename T>
+inline __device__ U __cast(T x) {
+  return static_cast<U>(x);
+}
+
+// A complex number is true when it is non-zero, that is when its real part
+// or its imaginary part is non-zero (so 1+0i and 0+1i are both true).
+template <>
+inline __device__ bool __cast<bool, cuComplex>(cuComplex x) {
+  return x.x != 0 || x.y != 0;
+}
+
+// true becomes a non-zero complex value and false becomes zero, so the
+// result converts back to the same bool through __cast<bool, cuComplex>.
+template <>
+inline __device__ cuComplex __cast<cuComplex, bool>(bool x) {
+  return x ? make_cuFloatComplex(1, 1) : make_cuFloatComplex(0, 0);
+}
+
+// Reduces the N values held by each thread across the thread block, in place.
+//
+// vals: per-thread inputs, overwritten with the reduced values.
+// smem: scratch space indexed up to meta_group_size() * N elements.
+// op:   binary reduction operator passed to cg::reduce.
+// init: value used to pad lanes that have no per-warp partial to read.
+//
+// NOTE(review): the second stage reads the partials with one lane per warp,
+// so this appears to assume at most one warp's worth of warps per block;
+// confirm against callers.
+template <typename T, int N, typename Block, typename Warp, typename Op>
+inline __device__ void
+block_reduce(Block block, Warp warp, T (&vals)[N], T* smem, Op op, T init) {
+  // First reduce in the current warp
+  for (int i = 0; i < N; i++) {
+    vals[i] = cg::reduce(warp, vals[i], op);
+  }
+
+  // Reduce across warps
+  if (warp.meta_group_size() > 1) {
+    // Lane 0 of every warp publishes that warp's partial results.
+    if (warp.thread_rank() == 0) {
+      for (int i = 0; i < N; i++) {
+        smem[warp.meta_group_rank() * N + i] = vals[i];
+      }
+    }
+    // Make the partials visible to all threads of the block.
+    block.sync();
+    // Lane k picks up the partials of warp k; the remaining lanes use init.
+    if (warp.thread_rank() < warp.meta_group_size()) {
+      for (int i = 0; i < N; i++) {
+        vals[i] = smem[warp.thread_rank() * N + i];
+      }
+    } else {
+      for (int i = 0; i < N; i++) {
+        vals[i] = init;
+      }
+    }
+    // Every warp reduces the gathered partials once more.
+    for (int i = 0; i < N; i++) {
+      vals[i] = cg::reduce(warp, vals[i], op);
+    }
+  }
+}
+
+} // namespace cu
+
+// Allocates the data of `out` with a memory layout that follows the axis
+// ordering of `in`.
+//
+// A row-contiguous `in` results in a plain allocation. Otherwise `out` must
+// have at least as many dims as `in` (std::runtime_error is thrown if not)
+// and its strides are chosen so that its axes are nested in memory in the
+// same order as the axes of `in` sorted by decreasing stride.
+inline void allocate_same_layout(
+    array& out,
+    const array& in,
+    const std::vector<int>& axes) {
+  if (in.flags().row_contiguous) {
+    out.set_data(allocator::malloc(out.nbytes()));
+    return;
+  }
+
+  if (out.ndim() < in.ndim()) {
+    throw std::runtime_error(
+        "Reduction without keepdims only supported for row-contiguous inputs");
+  }
+
+  const int ndim = static_cast<int>(in.ndim());
+
+  // perm[k] is the axis of in with the k-th largest stride.
+  std::vector<int> perm(ndim);
+  std::iota(perm.begin(), perm.end(), 0);
+  std::sort(perm.begin(), perm.end(), [&in](int a, int b) {
+    return in.strides(a) > in.strides(b);
+  });
+
+  // Walk the permuted axes from the innermost outwards: the innermost gets
+  // stride 1 and every other axis gets the extent times the stride of the
+  // axis nested directly inside it.
+  Strides final_strides(ndim);
+  for (int k = ndim - 1; k >= 0; k--) {
+    if (k == ndim - 1) {
+      final_strides[perm[k]] = 1;
+    } else {
+      final_strides[perm[k]] =
+          out.shape(perm[k + 1]) * final_strides[perm[k + 1]];
+    }
+  }
+
+  // Work out the contiguity of the new layout and allocate with it.
+  auto [data_size, is_row_contig, is_col_contig] =
+      check_contiguity(out.shape(), final_strides);
+  auto flags = in.flags();
+  flags.row_contiguous = is_row_contig;
+  flags.col_contiguous = is_col_contig;
+  flags.contiguous = true;
+  out.set_data(
+      allocator::malloc(out.nbytes()),
+      data_size,
+      final_strides,
+      flags,
+      allocator::free);
+}
+
+} // namespace mlx::core
--- a/mlx/backend/cuda/reduce/row_reduce.cu
+++ b/mlx/backend/cuda/reduce/row_reduce.cu
@@ -1,5 +1,7 @@
 // Copyright © 2025 Apple Inc.

+#include <numeric>
+
 #include "mlx/backend/cuda/device.h"
 #include "mlx/backend/cuda/device/cast_op.cuh"
 #include "mlx/backend/cuda/reduce/reduce.cuh"
@@ -55,84 +57,108 @@ struct RowReduceArgs {
      non_row_reductions *= reduce_shape[i];
    }
  }
+
+  // Rebuild shape/strides from in, ordered by decreasing stride and collapsed
+  void sort_access_pattern(const array& in, const std::vector<int>& axes) {
+    auto shape_vec = in.shape();
+    auto strides_vec = in.strides();
+    std::tie(shape_vec, strides_vec) = // presumably drops `axes`; see helper
+        shapes_without_reduction_axes(shape_vec, strides_vec, axes);
+    std::vector<int> indices(shape_vec.size());
+    std::iota(indices.begin(), indices.end(), 0);
+    std::sort(indices.begin(), indices.end(), [&](int left, int right) {
+      return strides_vec[left] > strides_vec[right]; // largest stride first
+    });
+    decltype(shape_vec) sorted_shape;
+    decltype(strides_vec) sorted_strides;
+    for (auto idx : indices) { // gather shape and strides in sorted order
+      sorted_shape.push_back(shape_vec[idx]);
+      sorted_strides.push_back(strides_vec[idx]);
+    }
+    std::tie(shape_vec, strides_vec) =
+        collapse_contiguous_dims(sorted_shape, sorted_strides);
+    shape = const_param(shape_vec); // overwrite this struct's members
+    strides = const_param(strides_vec);
+    ndim = shape_vec.size();
+  }
 };

-template <typename T, typename U, typename Op, int NDIM, int N_READS = 4>
-__global__ void row_reduce_small(
-    const T* in,
-    U* out,
-    size_t out_size,
-    const __grid_constant__ RowReduceArgs args) {
-  size_t out_idx = cg::this_grid().thread_rank();
-  if (out_idx >= out_size) {
-    return;
-  }
-
-  Op op;
-
-  U total_val = ReduceInit<Op, T>::value();
-  LoopedElemToLoc<NDIM, (NDIM > 2)> loop(args.reduce_ndim);
-
-  in += elem_to_loc(out_idx, args.shape.data(), args.strides.data(), args.ndim);
-
-  for (size_t n = 0; n < args.non_row_reductions; n++) {
-    for (int r = 0; r < cuda::ceil_div(args.row_size, N_READS); r++) {
-      U vals[N_READS];
-      cub::LoadDirectBlocked(
-          r,
-          make_cast_iterator<U>(in + loop.location()),
-          vals,
-          args.row_size,
-          ReduceInit<Op, T>::value());
-      total_val = op(total_val, cub::ThreadReduce(vals, op));
-    }
-    loop.next(args.reduce_shape.data(), args.reduce_strides.data());
-  }
-
-  out[out_idx] = total_val;
-}
-
-template <typename T, typename U, typename Op, int NDIM, int N_READS = 4>
-__global__ void row_reduce_small_warp(
-    const T* in,
-    U* out,
-    size_t out_size,
-    const __grid_constant__ RowReduceArgs args) {
+template <typename T, typename U, typename ReduceOp, int N = 4, int M = 1>
+__global__ void row_reduce_simple(T* in, U* out, size_t n_rows, int size) {
  auto grid = cg::this_grid();
  auto block = cg::this_thread_block();
  auto warp = cg::tiled_partition<WARP_SIZE>(block);

-  size_t out_idx = grid.thread_rank() / WARP_SIZE;
-  if (out_idx >= out_size) {
-    return;
+  const U init = cu::ReduceInit<ReduceOp, T>::value();
+  ReduceOp op;
+
+  T vals[M][N];
+  U accs[M];
+  for (int i = 0; i < M; i++) {
+    accs[i] = init;
  }

-  Op op;
+  const size_t start_row = // clamped so the M rows of a block stay in range
+      min(n_rows - M, static_cast<size_t>(grid.block_rank() * M));
+  const size_t full_blocks = size / (block.size() * N);
+  const size_t final_offset = full_blocks * (block.size() * N);
+  in += start_row * size;
+  out += start_row;

-  U total_val = ReduceInit<Op, T>::value();
-  LoopedElemToLoc<NDIM, (NDIM > 2)> loop(args.reduce_ndim);
-
-  in += elem_to_loc(out_idx, args.shape.data(), args.strides.data(), args.ndim);
-
-  for (size_t n = warp.thread_rank(); n < args.non_row_reductions;
-       n += WARP_SIZE) {
-    for (int r = 0; r < cuda::ceil_div(args.row_size, N_READS); r++) {
-      U vals[N_READS];
-      cub::LoadDirectBlocked(
-          r,
-          make_cast_iterator<U>(in + loop.location()),
-          vals,
-          args.row_size,
-          ReduceInit<Op, T>::value());
-      total_val = op(total_val, cub::ThreadReduce(vals, op));
+  if (size % N == 0) { // vectorized loads for the full block passes
+    for (size_t r = 0; r < full_blocks; r++) {
+      for (int k = 0; k < M; k++) {
+        cub::LoadDirectBlockedVectorized<T, N>(
+            block.thread_rank(),
+            in + k * size + r * (block.size() * N),
+            vals[k]);
+        for (int j = 0; j < N; j++) {
+          accs[k] = op(accs[k], __cast<U, T>(vals[k][j]));
+        }
+      }
+    }
+  } else { // same passes with element-wise loads
+    for (size_t r = 0; r < full_blocks; r++) {
+      for (int k = 0; k < M; k++) {
+        cub::LoadDirectBlocked(
+            block.thread_rank(),
+            in + k * size + r * (block.size() * N),
+            vals[k]);
+        for (int j = 0; j < N; j++) {
+          accs[k] = op(accs[k], __cast<U, T>(vals[k][j]));
+        }
+      }
    }
-    loop.next(WARP_SIZE, args.reduce_shape.data(), args.reduce_strides.data());
  }

-  total_val = cg::reduce(warp, total_val, op);
+  if (final_offset < size) { // row tail shorter than one block pass
+    for (int k = 0; k < M; k++) {
+      cub::LoadDirectBlocked(
+          block.thread_rank(),
+          in + k * size + final_offset,
+          vals[k],
+          size - final_offset, // valid items are counted from final_offset
+          __cast<T, U>(init)); // out-of-range slots are filled with init
+      for (int j = 0; j < N; j++) {
+        accs[k] = op(accs[k], __cast<U, T>(vals[k][j]));
+      }
+    }
+  }

-  if (warp.thread_rank() == 0) {
-    out[out_idx] = total_val;
+  __shared__ U shared_accumulators[32 * M];
+  block_reduce(block, warp, accs, shared_accumulators, op, init);
+
+  if (block.thread_rank() == 0) {
+    if (grid.block_rank() * M + M <= n_rows) {
+      for (int i = 0; i < M; i++) {
+        out[i] = accs[i];
+      }
+    } else { // clamped block: skip the rows that precede its nominal start
+      int offset = grid.block_rank() * M + M - n_rows;
+      for (int i = offset; i < M; i++) {
+        out[i] = accs[i];
+      }
+    }
  }
 }

@@ -141,55 +167,167 @@ template <
    typename U,
    typename Op,
    int NDIM,
-    int BLOCK_DIM_X,
+    int BLOCK_DIM,
    int N_READS = 4>
 __global__ void row_reduce_looped(
-    const T* in,
+    T* in,
    U* out,
    size_t out_size,
    const __grid_constant__ RowReduceArgs args) {
  auto grid = cg::this_grid();
  auto block = cg::this_thread_block();
+  auto warp = cg::tiled_partition<WARP_SIZE>(block);

-  size_t out_idx = grid.thread_rank() / BLOCK_DIM_X;
-  if (out_idx >= out_size) {
-    return;
-  }
+  size_t out_idx = grid.block_rank();

  Op op;

-  U total_val = ReduceInit<Op, T>::value();
+  U total[1];
+  U init = ReduceInit<Op, T>::value();
+  total[0] = init;
  LoopedElemToLoc<NDIM, (NDIM > 2)> loop(args.reduce_ndim);
+  size_t full_blocks = args.row_size / (BLOCK_DIM * N_READS);
+  size_t final_offset = full_blocks * BLOCK_DIM * N_READS;

  in += elem_to_loc(out_idx, args.shape.data(), args.strides.data(), args.ndim);

  for (size_t n = 0; n < args.non_row_reductions; n++) {
-    for (size_t r = 0; r < cuda::ceil_div(args.row_size, BLOCK_DIM_X * N_READS);
-         r++) {
-      U vals[N_READS];
-      cub::LoadDirectBlocked(
-          r * BLOCK_DIM_X + block.thread_index().x,
-          make_cast_iterator<U>(in + loop.location()),
-          vals,
-          args.row_size,
-          ReduceInit<Op, T>::value());
-      total_val = op(total_val, cub::ThreadReduce(vals, op));
+    for (size_t r = 0; r < full_blocks; r++) {
+      T vals[N_READS];
+      cub::LoadDirectBlockedVectorized<T, N_READS>(
+          block.thread_rank(),
+          in + loop.location() + r * BLOCK_DIM * N_READS,
+          vals);
+      for (int i = 0; i < N_READS; i++) {
+        total[0] = op(total[0], __cast<U, T>(vals[i]));
+      }
    }
+    if (final_offset < args.row_size) {
+      T vals[N_READS];
+      cub::LoadDirectBlocked(
+          block.thread_rank(),
+          in + loop.location() + final_offset,
+          vals,
+          args.row_size - final_offset,
+          __cast<T, U>(init));
+      for (int i = 0; i < N_READS; i++) {
+        total[0] = op(total[0], __cast<U, T>(vals[i]));
+      }
+    }
+    // TODO: Maybe block.sync() here?
    loop.next(args.reduce_shape.data(), args.reduce_strides.data());
  }

-  typedef cub::BlockReduce<U, BLOCK_DIM_X> BlockReduceT;
-  __shared__ typename BlockReduceT::TempStorage temp;
-
-  total_val = BlockReduceT(temp).Reduce(total_val, op);
+  __shared__ U shared_accumulators[32];
+  block_reduce(block, warp, total, shared_accumulators, op, init);

  if (block.thread_rank() == 0) {
-    out[out_idx] = total_val;
+    out[out_idx] = total[0];
  }
 }

 } // namespace cu

+void row_reduce_simple(
+    cu::CommandEncoder& encoder,
+    const array& in,
+    array& out,
+    Reduce::ReduceType reduce_type,
+    const std::vector<int>& axes,
+    const ReductionPlan& plan) {
+  constexpr int N_READS = 8; // passed to the kernel as its N parameter
+
+  // Allocate data for the output using in's layout to avoid elem_to_loc in the
+  // kernel.
+  allocate_same_layout(out, in, axes);
+
+  // TODO: If out.size() < 1024 which will be a common case then write this in
+  //       2 passes. Something like 32 * out.size() and then do a warp reduce.
+  encoder.set_input_array(in);
+  encoder.set_output_array(out);
+  dispatch_all_types(in.dtype(), [&](auto type_tag) {
+    dispatch_reduce_ops(reduce_type, [&](auto reduce_type_tag) {
+      using OP = MLX_GET_TYPE(reduce_type_tag);
+      using T = cuda_type_t<MLX_GET_TYPE(type_tag)>;
+      using U = typename cu::ReduceResult<OP, T>::type;
+
+      // Cub doesn't like const pointers for vectorized loads. (sigh)
+      T* indata = const_cast<T*>(in.data<T>());
+
+      // One thread per N_READS elements, capped at 1024, rounded up to a warp
+      size_t reductions = (plan.shape.back() + N_READS - 1) / N_READS;
+      dim3 grid = get_2d_grid_dims(out.shape(), out.strides());
+      int threads = std::min<size_t>(1024, reductions); // explicit type: portable
+      threads = ((threads + WARP_SIZE - 1) / WARP_SIZE) * WARP_SIZE;
+      dim3 block(threads, 1, 1);
+
+      // Pick the kernel: with grid.x >= 1024, reduce 2 rows per block (M = 2)
+      auto kernel = cu::row_reduce_simple<T, U, OP, N_READS>;
+      if (grid.x >= 1024) {
+        grid.x = (grid.x + 1) / 2;
+        kernel = cu::row_reduce_simple<T, U, OP, N_READS, 2>;
+      }
+
+      int size = plan.shape.back(); // length of the reduced row
+      encoder.add_kernel_node(
+          kernel, grid, block, indata, out.data<U>(), out.size(), size);
+    });
+  });
+}
+
+void row_reduce_looped(
+    cu::CommandEncoder& encoder,
+    const array& in,
+    array& out,
+    Reduce::ReduceType reduce_type,
+    const std::vector<int>& axes,
+    const ReductionPlan& plan,
+    cu::RowReduceArgs args) {
+  constexpr int N_READS = 8; // forwarded as the kernel's N_READS parameter
+
+  // Allocate data for the output using in's layout to access them as
+  // contiguously as possible.
+  allocate_same_layout(out, in, axes);
+
+  encoder.set_input_array(in);
+  encoder.set_output_array(out);
+  dispatch_all_types(in.dtype(), [&](auto type_tag) {
+    dispatch_reduce_ops(reduce_type, [&](auto reduce_type_tag) {
+      using OP = MLX_GET_TYPE(reduce_type_tag);
+      using T = cuda_type_t<MLX_GET_TYPE(type_tag)>;
+      using U = typename cu::ReduceResult<OP, T>::type;
+      // Cub doesn't like const pointers for vectorized loads. (sigh)
+      T* indata = const_cast<T*>(in.data<T>());
+
+      // Calculate the grid and block dims
+      args.sort_access_pattern(in, axes); // rewrites args.shape/strides/ndim
+      dim3 grid = get_2d_grid_dims(out.shape(), out.strides());
+      size_t reductions = (args.row_size + N_READS - 1) / N_READS;
+      int threads = std::min<size_t>(1024, reductions); // explicit type: portable
+      threads = ((threads + WARP_SIZE - 1) / WARP_SIZE) * WARP_SIZE;
+      dim3 block(threads, 1, 1);
+
+      // Pick the kernel; the default below is replaced by the dispatchers
+      auto kernel = cu::row_reduce_looped<T, U, OP, 1, 32, N_READS>;
+      dispatch_reduce_ndim(args.reduce_ndim, [&](auto reduce_ndim) {
+        dispatch_block_dim(threads, [&](auto threads_constant) {
+          kernel = cu::row_reduce_looped<
+              T,
+              U,
+              OP,
+              reduce_ndim.value,
+              threads_constant.value,
+              N_READS>;
+          block.x = threads_constant.value; // launch size matches BLOCK_DIM
+        });
+      });
+
+      encoder.add_kernel_node(
+          kernel, grid, block, indata, out.data<U>(), out.size(), args);
+    });
+  });
+}
+
 void row_reduce(
    cu::CommandEncoder& encoder,
    const array& in,
@@ -197,54 +335,35 @@ void row_reduce(
    Reduce::ReduceType reduce_type,
    const std::vector<int>& axes,
    const ReductionPlan& plan) {
+  // Current row reduction options
+  //
+  // - row_reduce_simple
+  //
+  //   That means that we are simply reducing across the fastest moving axis.
+  //   We are reducing 1 or 2 rows per threadblock depending on the size of
+  //   output.
+  //
+  // - row_reduce_looped
+  //
+  //   It is a general row reduction. We are computing 1 output per
+  //   threadblock. We read the fastest moving axis vectorized and loop over
+  //   the rest of the axes.
+  //
+  // Notes: We opt to read as much in order as possible and leave
+  //        transpositions as they are (contrary to our Metal backend).
+
+  // Simple row reduce means that we have 1 axis that we are reducing over and
+  // it has stride 1.
+  if (plan.shape.size() == 1) {
+    row_reduce_simple(encoder, in, out, reduce_type, axes, plan);
+    return;
+  }
+
+  // Make the args struct to help route to the best kernel
  cu::RowReduceArgs args(in, plan, axes);

-  encoder.launch_kernel([&](cudaStream_t stream) {
-    MLX_SWITCH_ALL_TYPES(in.dtype(), CTYPE, {
-      using InType = cuda_type_t<CTYPE>;
-      MLX_SWITCH_REDUCE_OPS(reduce_type, OP, {
-        using OutType = cu::ReduceResult<OP, InType>::type;
-        MLX_SWITCH_REDUCE_NDIM(args.reduce_ndim, NDIM, {
-          constexpr size_t N_READS = 4;
-          dim3 out_dims = get_2d_grid_dims(out.shape(), out.strides());
-          dim3 block_dims, num_blocks;
-          auto kernel =
-              cu::row_reduce_small<InType, OutType, OP, NDIM, N_READS>;
-          if (args.row_size <= 64) {
-            if ((args.non_row_reductions < 32 && args.row_size <= 8) ||
-                (args.non_row_reductions <= 8)) {
-              block_dims.x = std::min(out_dims.x, 1024u);
-              num_blocks.x = cuda::ceil_div(out_dims.x, block_dims.x);
-              num_blocks.y = out_dims.y;
-            } else {
-              block_dims.x = WARP_SIZE;
-              num_blocks.y = out_dims.x;
-              num_blocks.z = out_dims.y;
-              kernel =
-                  cu::row_reduce_small_warp<InType, OutType, OP, NDIM, N_READS>;
-            }
-          } else {
-            size_t num_threads = cuda::ceil_div(args.row_size, N_READS);
-            num_threads = cuda::ceil_div(num_threads, WARP_SIZE) * WARP_SIZE;
-            MLX_SWITCH_BLOCK_DIM(num_threads, BLOCK_DIM_X, {
-              num_blocks.y = out_dims.x;
-              num_blocks.z = out_dims.y;
-              block_dims.x = BLOCK_DIM_X;
-              kernel = cu::row_reduce_looped<
-                  InType,
-                  OutType,
-                  OP,
-                  NDIM,
-                  BLOCK_DIM_X,
-                  N_READS>;
-            });
-          }
-          kernel<<<num_blocks, block_dims, 0, stream>>>(
-              in.data<InType>(), out.data<OutType>(), out.size(), args);
-        });
-      });
-    });
-  });
+  // Fallback row reduce
+  row_reduce_looped(encoder, in, out, reduce_type, axes, plan, std::move(args));
 }

 } // namespace mlx::core
--- a/mlx/backend/cuda/reduce/segmented_reduce.cu
+++ b/mlx/backend/cuda/reduce/segmented_reduce.cu
@@ -1,84 +0,0 @@
-// Copyright © 2025 Apple Inc.
-
-#include "mlx/backend/cuda/device.h"
-#include "mlx/backend/cuda/device/cast_op.cuh"
-#include "mlx/backend/cuda/reduce/reduce.cuh"
-
-#include <thrust/device_ptr.h>
-#include <cub/device/device_reduce.cuh>
-#include <cub/device/device_segmented_reduce.cuh>
-
-namespace mlx::core {
-
-template <typename... Args>
-void cub_all_reduce(cu::CommandEncoder& encoder, Args&&... args) {
-  // Allocate temporary storage.
-  size_t size;
-  CHECK_CUDA_ERROR(cub::DeviceReduce::Reduce(nullptr, size, args...));
-  array temp(allocator::malloc(size), {static_cast<int>(size)}, uint8);
-  encoder.add_temporary(temp);
-  // Run op.
-  CHECK_CUDA_ERROR(cub::DeviceReduce::Reduce(temp.data<void>(), size, args...));
-}
-
-template <typename... Args>
-void cub_segmented_reduce(cu::CommandEncoder& encoder, Args&&... args) {
-  // Allocate temporary storage.
-  size_t size;
-  CHECK_CUDA_ERROR(cub::DeviceSegmentedReduce::Reduce(nullptr, size, args...));
-  array temp(allocator::malloc(size), {static_cast<int>(size)}, uint8);
-  encoder.add_temporary(temp);
-  // Run op.
-  CHECK_CUDA_ERROR(
-      cub::DeviceSegmentedReduce::Reduce(temp.data<void>(), size, args...));
-}
-
-struct MultiplyOp {
-  int factor;
-  __device__ int operator()(int i) {
-    return i * factor;
-  }
-};
-
-void segmented_reduce(
-    cu::CommandEncoder& encoder,
-    const array& in,
-    array& out,
-    Reduce::ReduceType reduce_type,
-    const std::vector<int>& axes,
-    const ReductionPlan& plan) {
-  encoder.launch_kernel([&](cudaStream_t stream) {
-    MLX_SWITCH_ALL_TYPES(in.dtype(), CTYPE, {
-      MLX_SWITCH_REDUCE_OPS(reduce_type, OP, {
-        using InType = cuda_type_t<CTYPE>;
-        using OutType = cu::ReduceResult<OP, InType>::type;
-        auto in_iter = cu::make_cast_iterator<OutType>(
-            thrust::device_pointer_cast(in.data<InType>()));
-        auto out_ptr = thrust::device_pointer_cast(out.data<OutType>());
-        auto init = cu::ReduceInit<OP, InType>::value();
-
-        if (plan.type == ContiguousAllReduce) {
-          cub_all_reduce(
-              encoder, in_iter, out_ptr, in.data_size(), OP(), init, stream);
-        } else if (plan.type == ContiguousReduce) {
-          auto offsets = thrust::make_transform_iterator(
-              thrust::make_counting_iterator(0), MultiplyOp{plan.shape.back()});
-          cub_segmented_reduce(
-              encoder,
-              in_iter,
-              out_ptr,
-              out.size(),
-              offsets,
-              offsets + 1,
-              OP(),
-              init,
-              stream);
-        } else {
-          throw std::runtime_error("Unsupported plan in segmented_reduce.");
-        }
-      });
-    });
-  });
-}
-
-} // namespace mlx::core
--- a/mlx/backend/cuda/rms_norm.cu
+++ b/mlx/backend/cuda/rms_norm.cu
@@ -224,20 +224,21 @@ void RMSNorm::eval_gpu(
  encoder.set_input_array(x);
  encoder.set_input_array(w);
  encoder.set_output_array(out);
-  encoder.launch_kernel([&](cudaStream_t stream) {
-    MLX_SWITCH_FLOAT_TYPES_CHECKED(out.dtype(), "rms_norm", CTYPE, {
-      using DataType = cuda_type_t<CTYPE>;
-      constexpr uint32_t N_READS = 4;
-      MLX_SWITCH_BLOCK_DIM(cuda::ceil_div(axis_size, N_READS), BLOCK_DIM, {
-        auto kernel = cu::rms_norm<DataType, BLOCK_DIM, N_READS>;
-        kernel<<<n_rows, BLOCK_DIM, 0, stream>>>(
-            x.data<DataType>(),
-            w.data<DataType>(),
-            out.data<DataType>(),
-            eps_,
-            axis_size,
-            w_stride);
-      });
+  dispatch_float_types(out.dtype(), "rms_norm", [&](auto type_tag) {
+    constexpr uint32_t N_READS = 4;
+    dispatch_block_dim(cuda::ceil_div(axis_size, N_READS), [&](auto block_dim) {
+      using DataType = cuda_type_t<MLX_GET_TYPE(type_tag)>;
+      auto kernel = cu::rms_norm<DataType, block_dim(), N_READS>;
+      encoder.add_kernel_node(
+          kernel,
+          n_rows,
+          block_dim(),
+          x.data<DataType>(),
+          w.data<DataType>(),
+          out.data<DataType>(),
+          eps_,
+          axis_size,
+          w_stride);
    });
  });
 }
@@ -252,20 +253,24 @@ void RMSNormVJP::eval_gpu(
  // Ensure row contiguity. We could relax this step by checking that the array
  // is contiguous (no broadcasts or holes) and that the input strides are the
  // same as the cotangent strides but for now this is simpler.
-  auto check_input = [&s](const array& x) -> std::pair<array, bool> {
+  auto check_input = [&s](const array& x, bool& copied) {
    if (x.flags().row_contiguous) {
-      return {x, false};
+      copied = false;
+      return x;
    }
+    copied = true;
    array x_copy(x.shape(), x.dtype(), nullptr, {});
    copy_gpu(x, x_copy, CopyType::General, s);
-    return {x_copy, true};
+    return x_copy;
  };
  bool donate_x = inputs[0].is_donatable();
  bool donate_g = inputs[2].is_donatable();
-  auto [x, copied] = check_input(inputs[0]);
+  bool copied;
+  auto x = check_input(inputs[0], copied);
  donate_x |= copied;
  const array& w = inputs[1];
-  auto [g, g_copied] = check_input(inputs[2]);
+  bool g_copied;
+  auto g = check_input(inputs[2], g_copied);
  donate_g |= g_copied;
  array& gx = outputs[0];
  array& gw = outputs[1];
@@ -303,31 +308,37 @@ void RMSNormVJP::eval_gpu(
      encoder.add_temporary(gw_temp);
    }
  }
-  gw.set_data(allocator::malloc(gw.nbytes()));

  encoder.set_input_array(x);
  encoder.set_input_array(w);
  encoder.set_input_array(g);
  encoder.set_output_array(gx);
  encoder.set_output_array(gw_temp);
-  encoder.launch_kernel([&, x = x, g = g](cudaStream_t stream) {
-    MLX_SWITCH_FLOAT_TYPES_CHECKED(gx.dtype(), "rms_norm_vjp", CTYPE, {
-      using DataType = cuda_type_t<CTYPE>;
+  dispatch_float_types(gx.dtype(), "rms_norm_vjp", [&](auto type_tag) {
+    dispatch_bool(has_w, [&](auto has_w_constant) {
      constexpr int N_READS = 4;
-      MLX_SWITCH_BOOL(has_w, HAS_W, {
-        MLX_SWITCH_BLOCK_DIM(cuda::ceil_div(axis_size, N_READS), BLOCK_DIM, {
-          auto kernel = cu::rms_norm_vjp<DataType, HAS_W, BLOCK_DIM, N_READS>;
-          kernel<<<n_rows, BLOCK_DIM, 0, stream>>>(
-              x.data<DataType>(),
-              w.data<DataType>(),
-              g.data<DataType>(),
-              gx.data<DataType>(),
-              gw_temp.data<DataType>(),
-              eps_,
-              axis_size,
-              w_stride);
-        });
-      });
+      dispatch_block_dim(
+          cuda::ceil_div(axis_size, N_READS), [&](auto block_dim) {
+            using DataType = cuda_type_t<MLX_GET_TYPE(type_tag)>;
+            constexpr int N_READS = 4;
+            auto kernel = cu::rms_norm_vjp<
+                DataType,
+                has_w_constant.value,
+                block_dim(),
+                N_READS>;
+            encoder.add_kernel_node(
+                kernel,
+                n_rows,
+                block_dim(),
+                x.data<DataType>(),
+                w.data<DataType>(),
+                g.data<DataType>(),
+                gx.data<DataType>(),
+                gw_temp.data<DataType>(),
+                eps_,
+                axis_size,
+                w_stride);
+          });
    });
  });

--- a/mlx/backend/cuda/rope.cu
+++ b/mlx/backend/cuda/rope.cu
@@ -308,73 +308,89 @@ void RoPE::eval_gpu(
  auto& encoder = cu::get_command_encoder(s);
  encoder.set_input_array(donated ? out : in);
  encoder.set_input_array(offset);
+  if (with_freqs) {
+    encoder.set_input_array(inputs[2]);
+  }
  encoder.set_output_array(out);
-  encoder.launch_kernel([&](cudaStream_t stream) {
-    MLX_SWITCH_FLOAT_TYPES_CHECKED(in.dtype(), "rope", CTYPE, {
-      using DataType = cuda_type_t<CTYPE>;
-      MLX_SWITCH_BOOL(traditional_, TRADITIONAL, {
-        MLX_SWITCH_BOOL(forward_, FORWARD, {
-          if (single && !with_freqs) {
-            auto kernel = cu::rope_single<DataType, TRADITIONAL, FORWARD>;
-            uint2 dims = make_uint2(dims_ / 2, in.size() / mat_size);
-            auto [grid, block] = get_grid_and_block(dims.x, dims.y, 1);
-            kernel<<<grid, block, 0, stream>>>(
-                (donated ? out : in).data<DataType>(),
-                out.data<DataType>(),
-                offset.data<int32_t>(),
-                scale_,
-                std::log2(base_),
-                mat_size,
-                dims);
-          } else if (single) {
-            auto kernel = cu::rope_single_freqs<DataType, TRADITIONAL, FORWARD>;
-            uint2 dims = make_uint2(dims_ / 2, in.size() / mat_size);
-            auto [grid, block] = get_grid_and_block(dims.x, dims.y, 1);
-            kernel<<<grid, block, 0, stream>>>(
-                (donated ? out : in).data<DataType>(),
-                out.data<DataType>(),
-                offset.data<int32_t>(),
-                inputs[2].data<float>(),
-                scale_,
-                mat_size,
-                dims,
-                inputs[2].strides(0));
-          } else if (with_freqs) {
-            auto kernel = cu::rope_freqs<DataType, TRADITIONAL, FORWARD>;
-            uint3 dims =
-                make_uint3(dims_ / 2, in.shape(-2), in.size() / mat_size);
-            dims.z = (dims.z + 3) / 4;
-            auto [grid, block] = get_grid_and_block(dims.x, dims.y, dims.z);
-            kernel<<<grid, block, 0, stream>>>(
-                (donated ? out : in).data<DataType>(),
-                out.data<DataType>(),
-                offset.data<int32_t>(),
-                inputs[2].data<float>(),
-                scale_,
-                std::log2(base_),
-                strides,
-                out_strides,
-                in.size() / mat_size,
-                dims,
-                inputs[2].strides(0));
-          } else {
-            auto kernel = cu::rope<DataType, TRADITIONAL, FORWARD>;
-            uint3 dims =
-                make_uint3(dims_ / 2, in.shape(-2), in.size() / mat_size);
-            dims.z = (dims.z + 3) / 4;
-            auto [grid, block] = get_grid_and_block(dims.x, dims.y, dims.z);
-            kernel<<<grid, block, 0, stream>>>(
-                (donated ? out : in).data<DataType>(),
-                out.data<DataType>(),
-                offset.data<int32_t>(),
-                scale_,
-                std::log2(base_),
-                strides,
-                out_strides,
-                in.size() / mat_size,
-                dims);
-          }
-        });
+  dispatch_float_types(out.dtype(), "rope", [&](auto type_tag) {
+    dispatch_bool(traditional_, [&](auto traditional) {
+      dispatch_bool(forward_, [&](auto forward) {
+        using DataType = cuda_type_t<MLX_GET_TYPE(type_tag)>;
+        if (single && !with_freqs) {
+          auto kernel =
+              cu::rope_single<DataType, traditional.value, forward.value>;
+          uint2 dims = make_uint2(dims_ / 2, in.size() / mat_size);
+          auto [grid, block] = get_grid_and_block(dims.x, dims.y, 1);
+          encoder.add_kernel_node(
+              kernel,
+              grid,
+              block,
+              (donated ? out : in).data<DataType>(),
+              out.data<DataType>(),
+              offset.data<int32_t>(),
+              scale_,
+              std::log2(base_),
+              mat_size,
+              dims);
+        } else if (single) {
+          auto kernel =
+              cu::rope_single_freqs<DataType, traditional.value, forward.value>;
+          uint2 dims = make_uint2(dims_ / 2, in.size() / mat_size);
+          auto [grid, block] = get_grid_and_block(dims.x, dims.y, 1);
+          encoder.add_kernel_node(
+              kernel,
+              grid,
+              block,
+              (donated ? out : in).data<DataType>(),
+              out.data<DataType>(),
+              offset.data<int32_t>(),
+              inputs[2].data<float>(),
+              scale_,
+              mat_size,
+              dims,
+              inputs[2].strides(0));
+        } else if (with_freqs) {
+          auto kernel =
+              cu::rope_freqs<DataType, traditional.value, forward.value>;
+          uint3 dims =
+              make_uint3(dims_ / 2, in.shape(-2), in.size() / mat_size);
+          dims.z = (dims.z + 3) / 4;
+          auto [grid, block] = get_grid_and_block(dims.x, dims.y, dims.z);
+          encoder.add_kernel_node(
+              kernel,
+              grid,
+              block,
+              (donated ? out : in).data<DataType>(),
+              out.data<DataType>(),
+              offset.data<int32_t>(),
+              inputs[2].data<float>(),
+              scale_,
+              std::log2(base_),
+              strides,
+              out_strides,
+              in.size() / mat_size,
+              dims,
+              inputs[2].strides(0));
+        } else {
+          auto kernel = cu::rope<DataType, traditional.value, forward.value>;
+          uint3 dims =
+              make_uint3(dims_ / 2, in.shape(-2), in.size() / mat_size);
+          dims.z = (dims.z + 3) / 4;
+          auto [grid, block] = get_grid_and_block(dims.x, dims.y, dims.z);
+          encoder.add_kernel_node(
+              kernel,
+              grid,
+              block,
+              (donated ? out : in).data<DataType>(),
+              out.data<DataType>(),
+              offset.data<int32_t>(),
+              scale_,
+              std::log2(base_),
+              strides,
+              out_strides,
+              in.size() / mat_size,
+              dims);
+        }
      });
    });
  });
--- a/mlx/backend/cuda/softmax.cu
+++ b/mlx/backend/cuda/softmax.cu
@@ -51,7 +51,7 @@ __global__ void softmax(const T* in, T* out, int axis_size) {
        make_cast_iterator<AccT>(in),
        vals,
        axis_size,
-        Limits<AccT>::finite_min());
+        Limits<AccT>::min());
    prevmax = maxval;
    maxval = max_op(maxval, cub::ThreadReduce(vals, max_op));
    // Online normalizer calculation for softmax:
@@ -79,7 +79,7 @@ __global__ void softmax(const T* in, T* out, int axis_size) {
  block.sync();
  maxval = warp.thread_rank() < warp.meta_group_size()
      ? local_max[warp.thread_rank()]
-      : Limits<AccT>::finite_min();
+      : Limits<AccT>::min();
  maxval = cg::reduce(warp, maxval, max_op);
  normalizer = normalizer * softmax_exp(prevmax - maxval);
  if (warp.thread_rank() == 0) {
@@ -141,18 +141,21 @@ void Softmax::eval_gpu(const std::vector<array>& inputs, array& out) {
  auto& encoder = cu::get_command_encoder(s);
  encoder.set_input_array(in);
  encoder.set_output_array(out);
-  encoder.launch_kernel([&](cudaStream_t stream) {
-    MLX_SWITCH_FLOAT_TYPES_CHECKED(out.dtype(), "softmax", CTYPE, {
-      using DataType = cuda_type_t<CTYPE>;
-      constexpr int N_READS = 4;
-      MLX_SWITCH_BLOCK_DIM(cuda::ceil_div(axis_size, N_READS), BLOCK_DIM, {
-        auto kernel = cu::softmax<DataType, DataType, BLOCK_DIM, N_READS>;
-        if (precise) {
-          kernel = cu::softmax<DataType, float, BLOCK_DIM, N_READS>;
-        }
-        kernel<<<n_rows, BLOCK_DIM, 0, stream>>>(
-            in.data<DataType>(), out.data<DataType>(), axis_size);
-      });
+  dispatch_float_types(out.dtype(), "softmax", [&](auto type_tag) {
+    constexpr int N_READS = 4;
+    dispatch_block_dim(cuda::ceil_div(axis_size, N_READS), [&](auto block_dim) {
+      using DataType = cuda_type_t<MLX_GET_TYPE(type_tag)>;
+      auto kernel = cu::softmax<DataType, DataType, block_dim(), N_READS>;
+      if (precise) {
+        kernel = cu::softmax<DataType, float, block_dim(), N_READS>;
+      }
+      encoder.add_kernel_node(
+          kernel,
+          n_rows,
+          block_dim(),
+          in.data<DataType>(),
+          out.data<DataType>(),
+          axis_size);
    });
  });
 }
--- a/mlx/backend/cuda/sort.cu
+++ b/mlx/backend/cuda/sort.cu
@@ -50,43 +50,21 @@ array swapaxes_in_eval(const array& in, int axis1, int axis2) {
  return out;
 }

-template <typename... Args>
-void segmented_sort_pairs(cu::CommandEncoder& encoder, Args&&... args) {
-  // Allocate temporary storage.
-  size_t size;
-  CHECK_CUDA_ERROR(
-      cub::DeviceSegmentedSort::StableSortPairs(nullptr, size, args...));
-  array temp(allocator::malloc(size), {static_cast<int>(size)}, uint8);
-  encoder.add_temporary(temp);
-  // Run op.
-  CHECK_CUDA_ERROR(cub::DeviceSegmentedSort::StableSortPairs(
-      temp.data<void>(), size, args...));
-}
+struct OffsetTransform { // functor behind the `offsets` transform iterator
+  int nsort;

-template <typename... Args>
-void segmented_sort(cu::CommandEncoder& encoder, Args&&... args) {
-  // Allocate temporary storage.
-  size_t size;
-  CHECK_CUDA_ERROR(
-      cub::DeviceSegmentedSort::StableSortKeys(nullptr, size, args...));
-  array temp(allocator::malloc(size), {static_cast<int>(size)}, uint8);
-  encoder.add_temporary(temp);
-  // Run op.
-  CHECK_CUDA_ERROR(cub::DeviceSegmentedSort::StableSortKeys(
-      temp.data<void>(), size, args...));
-}
+  int __device__ operator()(int i) const { // const: callable via const access
+    return i * nsort;
+  }
+};

 void gpu_sort(const Stream& s, array in, array& out_, int axis, bool argsort) {
  array out = out_;
  auto& encoder = cu::get_command_encoder(s);
-  encoder.set_input_array(in);
-  encoder.set_output_array(out);
-
  if (axis < 0) {
    axis += in.ndim();
  }
  int nsort = in.shape(axis);
-  int nsegments = in.data_size() / nsort;
  int last_dim = in.ndim() - 1;

  // If we are not sorting the innermost dimension of a contiguous array,
@@ -100,60 +78,103 @@ void gpu_sort(const Stream& s, array in, array& out_, int axis, bool argsort) {
    out = array(allocator::malloc(out.nbytes()), in.shape(), out.dtype());
    encoder.add_temporary(out);
  } else {
-    out.set_data(allocator::malloc(out.nbytes()));
+    out.set_data(
+        allocator::malloc(in.data_size() * out.itemsize()),
+        in.data_size(),
+        in.strides(),
+        in.flags());
  }

-  encoder.launch_kernel([&](cudaStream_t stream) {
-    MLX_SWITCH_ALL_TYPES(in.dtype(), CTYPE, {
-      if constexpr (!std::is_same_v<CTYPE, complex64_t>) {
-        using Type = cuda_type_t<CTYPE>;
-        auto offsets = thrust::make_transform_iterator(
-            thrust::make_counting_iterator(0),
-            [nsort] __device__(int i) { return i * nsort; });
-        if (argsort) {
-          // Indices in the sorted dimension.
-          array indices(
-              allocator::malloc(out.nbytes()), in.shape(), out.dtype());
-          encoder.add_temporary(indices);
-          thrust::transform(
-              cu::thrust_policy(stream),
-              thrust::counting_iterator<uint32_t>(0),
-              thrust::counting_iterator<uint32_t>(indices.data_size()),
-              thrust::device_pointer_cast(indices.data<uint32_t>()),
-              ModOp<uint32_t>{static_cast<uint32_t>(nsort)});
+  encoder.set_input_array(in);
+  encoder.set_output_array(out);
+  dispatch_all_types(in.dtype(), [&](auto type_tag) {
+    using CTYPE = MLX_GET_TYPE(type_tag);
+    auto& stream = encoder.stream();
+    if constexpr (!std::is_same_v<CTYPE, complex64_t>) {
+      using Type = cuda_type_t<CTYPE>;
+      auto offsets = thrust::make_transform_iterator(
+          thrust::make_counting_iterator(0), OffsetTransform{nsort});
+      if (argsort) {
+        // Indices in the sorted dimension.
+        array indices(allocator::malloc(out.nbytes()), in.shape(), out.dtype());
+        encoder.add_temporary(indices);

-          // In argsort though we don't need the result of sorted values, the
-          // API requires us to provide an array to store it.
-          array discard(allocator::malloc(in.nbytes()), in.shape(), in.dtype());
-          encoder.add_temporary(discard);
+        // In argsort though we don't need the result of sorted values, the
+        // API requires us to provide an array to store it.
+        array discard(allocator::malloc(in.nbytes()), in.shape(), in.dtype());
+        encoder.add_temporary(discard);

-          segmented_sort_pairs(
-              encoder,
-              in.data<Type>(),
-              discard.data<Type>(),
-              indices.data<uint32_t>(),
-              out.data<uint32_t>(),
-              in.data_size(),
-              nsegments,
-              offsets,
-              offsets + 1,
-              stream);
-        } else {
-          segmented_sort(
-              encoder,
-              in.data<Type>(),
-              out.data<Type>(),
-              in.data_size(),
-              nsegments,
-              offsets,
-              offsets + 1,
-              stream);
-        }
+        size_t size;
+        CHECK_CUDA_ERROR(cub::DeviceSegmentedSort::StableSortPairs(
+            nullptr,
+            size,
+            in.data<Type>(),
+            discard.data<Type>(),
+            indices.data<uint32_t>(),
+            out.data<uint32_t>(),
+            in.data_size(),
+            in.data_size() / nsort,
+            offsets,
+            offsets + 1,
+            stream));
+
+        array temp(allocator::malloc(size), {static_cast<int>(size)}, uint8);
+        encoder.add_temporary(temp);
+
+        // Start capturing after allocations
+        auto capture = encoder.capture_context();
+        thrust::transform(
+            cu::thrust_policy(stream),
+            thrust::counting_iterator<uint32_t>(0),
+            thrust::counting_iterator<uint32_t>(indices.data_size()),
+            thrust::device_pointer_cast(indices.data<uint32_t>()),
+            ModOp<uint32_t>{static_cast<uint32_t>(nsort)});
+
+        CHECK_CUDA_ERROR(cub::DeviceSegmentedSort::StableSortPairs(
+            temp.data<void>(),
+            size,
+            in.data<Type>(),
+            discard.data<Type>(),
+            indices.data<uint32_t>(),
+            out.data<uint32_t>(),
+            in.data_size(),
+            in.data_size() / nsort,
+            offsets,
+            offsets + 1,
+            stream));
      } else {
-        throw std::runtime_error(
-            "CUDA backend does not support sorting complex numbers");
+        size_t size;
+        CHECK_CUDA_ERROR(cub::DeviceSegmentedSort::StableSortKeys(
+            nullptr,
+            size,
+            in.data<Type>(),
+            out.data<Type>(),
+            in.data_size(),
+            in.data_size() / nsort,
+            offsets,
+            offsets + 1,
+            stream));
+
+        array temp(allocator::malloc(size), {static_cast<int>(size)}, uint8);
+        encoder.add_temporary(temp);
+
+        // Start capturing after allocations
+        auto capture = encoder.capture_context();
+        CHECK_CUDA_ERROR(cub::DeviceSegmentedSort::StableSortKeys(
+            temp.data<void>(),
+            size,
+            in.data<Type>(),
+            out.data<Type>(),
+            in.data_size(),
+            in.data_size() / nsort,
+            offsets,
+            offsets + 1,
+            stream));
      }
-    });
+    } else {
+      throw std::runtime_error(
+          "CUDA backend does not support sorting complex numbers");
+    }
  });

  if (!is_segmented_sort) {
@@ -177,4 +198,14 @@ void Sort::eval_gpu(const std::vector<array>& inputs, array& out) {
  gpu_sort(stream(), inputs[0], out, axis_, false);
 }

+void ArgPartition::eval_gpu(const std::vector<array>& inputs, array& out) {
+  nvtx3::scoped_range r("ArgPartition::eval_gpu"); // Labels this call with an NVTX range.
+  gpu_sort(stream(), inputs[0], out, axis_, true); // argsort=true; only axis_ is forwarded, so this runs the argsort path.
+}
+
+void Partition::eval_gpu(const std::vector<array>& inputs, array& out) {
+  nvtx3::scoped_range r("Partition::eval_gpu"); // Labels this call with an NVTX range.
+  gpu_sort(stream(), inputs[0], out, axis_, false); // argsort=false; only axis_ is forwarded, so this runs the sort path.
+}
+
 } // namespace mlx::core
--- a/mlx/backend/cuda/ternary.cu
+++ b/mlx/backend/cuda/ternary.cu
@@ -91,68 +91,80 @@ void ternary_op_gpu_inplace(
  encoder.set_input_array(b);
  encoder.set_input_array(c);
  encoder.set_output_array(out);
-  encoder.launch_kernel([&](cudaStream_t stream) {
-    MLX_SWITCH_ALL_TYPES(out.dtype(), CTYPE, {
-      using DType = cuda_type_t<CTYPE>;
+  dispatch_all_types(out.dtype(), [&](auto type_tag) {
+    using DType = cuda_type_t<MLX_GET_TYPE(type_tag)>;

-      auto topt = get_ternary_op_type(a, b, c);
-      if (topt == TernaryOpType::General) {
-        auto [shape, strides] = collapse_contiguous_dims(a, b, c, out);
-        auto& a_strides = strides[0];
-        auto& b_strides = strides[1];
-        auto& c_strides = strides[2];
-        bool large = a.data_size() > INT32_MAX || b.data_size() > INT32_MAX ||
-            c.data_size() > INT32_MAX || out.data_size() > INT32_MAX;
-        MLX_SWITCH_BOOL(large, LARGE, {
-          using IdxT = std::conditional_t<LARGE, int64_t, int32_t>;
-          int ndim = shape.size();
-          if (ndim <= 3) {
-            MLX_SWITCH_1_2_3(ndim, NDIM, {
-              auto kernel = cu::ternary_g_nd<Op, DType, IdxT, NDIM>;
+    auto topt = get_ternary_op_type(a, b, c);
+    if (topt == TernaryOpType::General) {
+      dispatch_bool(
+          a.data_size() > INT32_MAX || b.data_size() > INT32_MAX ||
+              c.data_size() > INT32_MAX || out.data_size() > INT32_MAX,
+          [&](auto large) {
+            using IdxT = std::conditional_t<large(), int64_t, int32_t>;
+            Shape shape;
+            std::vector<Strides> strides;
+            std::tie(shape, strides) = collapse_contiguous_dims(a, b, c, out);
+            auto& a_strides = strides[0];
+            auto& b_strides = strides[1];
+            auto& c_strides = strides[2];
+            int ndim = shape.size();
+            if (ndim <= 3) {
+              dispatch_1_2_3(ndim, [&](auto dims_constant) {
+                auto kernel =
+                    cu::ternary_g_nd<Op, DType, IdxT, dims_constant()>;
+                auto [num_blocks, block_dims] =
+                    get_launch_args(kernel, out, large());
+                encoder.add_kernel_node(
+                    kernel,
+                    num_blocks,
+                    block_dims,
+                    a.data<bool>(),
+                    b.data<DType>(),
+                    c.data<DType>(),
+                    out.data<DType>(),
+                    out.size(),
+                    const_param<dims_constant()>(shape),
+                    const_param<dims_constant()>(a_strides),
+                    const_param<dims_constant()>(b_strides),
+                    const_param<dims_constant()>(c_strides));
+              });
+            } else {
+              auto kernel = cu::ternary_g<Op, DType, IdxT>;
              auto [num_blocks, block_dims] =
-                  get_launch_args(kernel, out, large);
-              kernel<<<num_blocks, block_dims, 0, stream>>>(
+                  get_launch_args(kernel, out, large());
+              encoder.add_kernel_node(
+                  kernel,
+                  num_blocks,
+                  block_dims,
                  a.data<bool>(),
                  b.data<DType>(),
                  c.data<DType>(),
                  out.data<DType>(),
-                  out.size(),
-                  const_param<NDIM>(shape),
-                  const_param<NDIM>(a_strides),
-                  const_param<NDIM>(b_strides),
-                  const_param<NDIM>(c_strides));
-            });
-          } else {
-            auto kernel = cu::ternary_g<Op, DType, IdxT>;
-            auto [num_blocks, block_dims] = get_launch_args(kernel, out, large);
-            kernel<<<num_blocks, block_dims, 0, stream>>>(
-                a.data<bool>(),
-                b.data<DType>(),
-                c.data<DType>(),
-                out.data<DType>(),
-                out.data_size(),
-                const_param(shape),
-                const_param(a_strides),
-                const_param(b_strides),
-                const_param(c_strides),
-                ndim);
-          }
-        });
-      } else {
-        MLX_SWITCH_BOOL(out.data_size() > UINT32_MAX, LARGE, {
-          using IdxT = std::conditional_t<LARGE, int64_t, uint32_t>;
-          auto kernel = cu::ternary_v<Op, DType, IdxT>;
-          auto [num_blocks, block_dims] = get_launch_args(
-              kernel, out.data_size(), out.shape(), out.strides(), LARGE);
-          kernel<<<num_blocks, block_dims, 0, stream>>>(
-              a.data<bool>(),
-              b.data<DType>(),
-              c.data<DType>(),
-              out.data<DType>(),
-              out.data_size());
-        });
-      }
-    });
+                  out.data_size(),
+                  const_param(shape),
+                  const_param(a_strides),
+                  const_param(b_strides),
+                  const_param(c_strides),
+                  ndim);
+            }
+          });
+    } else {
+      dispatch_bool(out.data_size() > INT32_MAX, [&](auto large) {
+        using IdxT = std::conditional_t<large(), int64_t, uint32_t>;
+        auto kernel = cu::ternary_v<Op, DType, IdxT>;
+        auto [num_blocks, block_dims] = get_launch_args(
+            kernel, out.data_size(), out.shape(), out.strides(), large());
+        encoder.add_kernel_node(
+            kernel,
+            num_blocks,
+            block_dims,
+            a.data<bool>(),
+            b.data<DType>(),
+            c.data<DType>(),
+            out.data<DType>(),
+            out.data_size());
+      });
+    }
  });
 }

--- a/mlx/backend/cuda/unary.cu
+++ b/mlx/backend/cuda/unary.cu
@@ -9,49 +9,70 @@
 #include "mlx/dtype_utils.h"
 #include "mlx/primitives.h"

+#include <cooperative_groups.h>
 #include <nvtx3/nvtx3.hpp>
-#include <thrust/device_ptr.h>
-#include <thrust/transform.h>

 namespace mlx::core {

 namespace cu {

+namespace cg = cooperative_groups;
+
+// Element-wise kernel over linearly indexed buffers. Each thread takes its
+// grid-wide rank i and, when i < size, writes out[i] = Op{}(in[i]).
+template <typename Op, typename In, typename Out, typename IdxT>
+__global__ void unary_v(const In* in, Out* out, IdxT size) {
+  IdxT tid = cg::this_grid().thread_rank();
+  // Threads ranked past the end of the data have nothing to do.
+  if (!(tid < size)) {
+    return;
+  }
+  out[tid] = Op{}(in[tid]);
+}
+
+// Element-wise kernel for inputs addressed through shape/strides. Each thread
+// with grid-wide rank i < size obtains the input location for i from
+// elem_to_loc_4d(i, shape, strides, ndim) and writes Op{}(in[loc]) to out[i].
+template <typename Op, typename In, typename Out, typename IdxT>
+__global__ void unary_g(
+    const In* in,
+    Out* out,
+    IdxT size,
+    const __grid_constant__ Shape shape,
+    const __grid_constant__ Strides strides,
+    int ndim) {
+  IdxT tid = cg::this_grid().thread_rank();
+  if (!(tid < size)) {
+    return;
+  }
+  // Only the input goes through the shape/stride lookup; the output is
+  // written at the linear index.
+  auto src = elem_to_loc_4d(tid, shape.data(), strides.data(), ndim);
+  out[tid] = Op{}(in[src]);
+}
+
 template <typename Op, typename In, typename Out>
 constexpr bool supports_unary_op() {
  if (std::is_same_v<Op, Abs> || std::is_same_v<Op, Negative> ||
-      std::is_same_v<Op, Sign>) {
+      std::is_same_v<Op, Sign> || std::is_same_v<Op, Square>) {
    return std::is_same_v<In, Out>;
  }
-  if (std::is_same_v<Op, ArcCos> || std::is_same_v<Op, ArcCosh> ||
-      std::is_same_v<Op, ArcSin> || std::is_same_v<Op, ArcSinh> ||
-      std::is_same_v<Op, ArcTan> || std::is_same_v<Op, ArcTanh> ||
-      std::is_same_v<Op, Erf> || std::is_same_v<Op, ErfInv> ||
-      std::is_same_v<Op, Expm1> || std::is_same_v<Op, Sigmoid> ||
-      std::is_same_v<Op, Sqrt> || std::is_same_v<Op, Rsqrt>) {
+  if (std::is_same_v<Op, ArcCosh> || std::is_same_v<Op, ArcSinh> ||
+      std::is_same_v<Op, ArcTanh> || std::is_same_v<Op, Erf> ||
+      std::is_same_v<Op, ErfInv> || std::is_same_v<Op, Expm1> ||
+      std::is_same_v<Op, Sigmoid>) {
    return std::is_same_v<In, Out> && is_floating_v<In>;
  }
-  if (std::is_same_v<Op, Log> || std::is_same_v<Op, Log2> ||
-      std::is_same_v<Op, Log10> || std::is_same_v<Op, Log1p>) {
-    return std::is_same_v<In, Out> && is_inexact_v<In>;
-  }
  if (std::is_same_v<Op, BitwiseInvert>) {
    return std::is_same_v<In, Out> && std::is_integral_v<In> &&
        !std::is_same_v<In, bool>;
  }
-  if (std::is_same_v<Op, Ceil> || std::is_same_v<Op, Floor> ||
-      std::is_same_v<Op, Square>) {
+  if (std::is_same_v<Op, Ceil> || std::is_same_v<Op, Floor>) {
    return std::is_same_v<In, Out> && !std::is_same_v<In, complex64_t>;
  }
  if (std::is_same_v<Op, Conjugate>) {
    return std::is_same_v<In, Out> && std::is_same_v<In, complex64_t>;
  }
-  if (std::is_same_v<Op, Cos> || std::is_same_v<Op, Cosh> ||
-      std::is_same_v<Op, Exp> || std::is_same_v<Op, Round> ||
-      std::is_same_v<Op, Sin> || std::is_same_v<Op, Sinh> ||
-      std::is_same_v<Op, Tan> || std::is_same_v<Op, Tanh>) {
-    return std::is_same_v<In, Out> &&
-        (is_floating_v<In> || std::is_same_v<In, complex64_t>);
+  if (std::is_same_v<Op, ArcCos> || std::is_same_v<Op, ArcSin> ||
+      std::is_same_v<Op, ArcTan> || std::is_same_v<Op, Cos> ||
+      std::is_same_v<Op, Cosh> || std::is_same_v<Op, Exp> ||
+      std::is_same_v<Op, Log> || std::is_same_v<Op, Log2> ||
+      std::is_same_v<Op, Log10> || std::is_same_v<Op, Log1p> ||
+      std::is_same_v<Op, Round> || std::is_same_v<Op, Rsqrt> ||
+      std::is_same_v<Op, Sqrt> || std::is_same_v<Op, Sin> ||
+      std::is_same_v<Op, Sinh> || std::is_same_v<Op, Tan> ||
+      std::is_same_v<Op, Tanh>) {
+    return std::is_same_v<In, Out> && is_inexact_v<In>;
  }
  if (std::is_same_v<Op, Imag> || std::is_same_v<Op, Real>) {
    return std::is_same_v<In, complex64_t> && std::is_same_v<Out, float>;
@@ -74,36 +95,61 @@ void unary_op_gpu_inplace(
  if (in.size() == 0) {
    return;
  }
+  bool contig = in.flags().contiguous;
+  bool large;
+  if (!contig) {
+    large = in.data_size() > INT32_MAX || out.size() > INT32_MAX;
+  } else {
+    large = in.data_size() > UINT32_MAX;
+  }

  auto& encoder = cu::get_command_encoder(s);
  encoder.set_input_array(in);
  encoder.set_output_array(out);
-  encoder.launch_kernel([&](cudaStream_t stream) {
-    MLX_SWITCH_ALL_TYPES(in.dtype(), CTYPE_IN, {
-      MLX_SWITCH_ALL_TYPES(out.dtype(), CTYPE_OUT, {
-        if constexpr (cu::supports_unary_op<Op, CTYPE_IN, CTYPE_OUT>()) {
+  dispatch_all_types(in.dtype(), [&](auto in_type_tag) {
+    dispatch_all_types(out.dtype(), [&](auto out_type_tag) {
+      using CTYPE_IN = MLX_GET_TYPE(in_type_tag);
+      using CTYPE_OUT = MLX_GET_TYPE(out_type_tag);
+      if constexpr (cu::supports_unary_op<Op, CTYPE_IN, CTYPE_OUT>()) {
+        dispatch_bool(large, [&](auto large) {
+          using IdxT = std::conditional_t<large(), int64_t, int32_t>;
          using InType = cuda_type_t<CTYPE_IN>;
          using OutType = cuda_type_t<CTYPE_OUT>;
-          auto policy = cu::thrust_policy(stream);
-          auto in_ptr = thrust::device_pointer_cast(in.data<InType>());
-          auto out_ptr = thrust::device_pointer_cast(out.data<OutType>());
-          if (in.flags().contiguous) {
-            thrust::transform(
-                policy, in_ptr, in_ptr + in.data_size(), out_ptr, Op());
+          using IdxT = std::conditional_t<large(), int64_t, int32_t>;
+          if (contig) {
+            auto kernel = cu::unary_v<Op, InType, OutType, IdxT>;
+            auto [num_blocks, block_dims] = get_launch_args(
+                kernel, out.data_size(), out.shape(), out.strides(), large);
+            encoder.add_kernel_node(
+                kernel,
+                num_blocks,
+                block_dims,
+                in.data<InType>(),
+                out.data<OutType>(),
+                out.data_size());
          } else {
            auto [shape, strides] = collapse_contiguous_dims(in);
-            auto [in_begin, in_end] = cu::make_general_iterators<int64_t>(
-                in_ptr, in.size(), shape, strides);
-            thrust::transform(policy, in_begin, in_end, out_ptr, Op());
+            auto kernel = cu::unary_g<Op, InType, OutType, IdxT>;
+            auto [num_blocks, block_dims] = get_launch_args(kernel, out, large);
+            encoder.add_kernel_node(
+                kernel,
+                num_blocks,
+                block_dims,
+                in.data<InType>(),
+                out.data<OutType>(),
+                out.data_size(),
+                const_param(shape),
+                const_param(strides),
+                shape.size());
          }
-        } else {
-          throw std::runtime_error(fmt::format(
-              "Can not do unary op {} on input of {} with output of {}.",
-              op,
-              dtype_to_string(in.dtype()),
-              dtype_to_string(out.dtype())));
-        }
-      });
+        });
+      } else {
+        throw std::runtime_error(fmt::format(
+            "Can not do unary op {} on input of {} with output of {}.",
+            op,
+            dtype_to_string(in.dtype()),
+            dtype_to_string(out.dtype())));
+      }
    });
  });
 }
--- a/mlx/backend/cuda/utils.cpp
+++ b/mlx/backend/cuda/utils.cpp
@@ -24,23 +24,47 @@ void check_cuda_error(const char* name, cudaError_t err) {
  }
 }

+// Throw std::runtime_error if a CUDA driver API call did not succeed.
+//
+// `name` identifies the failed call in the message (CHECK_CUDA_ERROR passes
+// the stringified command) and `err` is the CUresult that call returned.
+void check_cuda_error(const char* name, CUresult err) {
+  if (err != CUDA_SUCCESS) {
+    // cuGetErrorString() sets the output pointer to NULL when it does not
+    // recognize the error code, so a default assigned before the call would
+    // be overwritten. Apply the fallback after the call instead, so that a
+    // null C string is never handed to fmt::format.
+    const char* err_str = nullptr;
+    if (cuGetErrorString(err, &err_str) != CUDA_SUCCESS || !err_str) {
+      err_str = "Unknown error";
+    }
+    throw std::runtime_error(fmt::format("{} failed: {}", name, err_str));
+  }
+}
+
 const char* dtype_to_cuda_type(const Dtype& dtype) { // Returns the type name spelled as a string literal for dtype.
-  if (dtype == float16) {
-    return "__half";
+  switch (dtype) {
+    case bool_:
+      return "bool";
+    case int8:
+      return "int8_t";
+    case int16:
+      return "int16_t";
+    case int32:
+      return "int32_t";
+    case int64:
+      return "int64_t";
+    case uint8:
+      return "uint8_t";
+    case uint16:
+      return "uint16_t";
+    case uint32:
+      return "uint32_t";
+    case uint64:
+      return "uint64_t";
+    case float16:
+      return "__half";
+    case bfloat16:
+      return "__nv_bfloat16";
+    case float32:
+      return "float";
+    case float64:
+      return "double";
+    case complex64:
+      return "cuComplex";
+    default: // Any dtype not listed above.
+      return "unknown"; // Placeholder text, not a usable type name; never null.
   }
-  if (dtype == bfloat16) {
-    return "__nv_bfloat16";
-  }
-  if (dtype == complex64) {
-    return "cuComplex";
-  }
-#define SPECIALIZE_DtypeToString(CPP_TYPE, DTYPE) \
-  if (dtype == DTYPE) {                           \
-    return #CPP_TYPE;                             \
-  }
-  MLX_FORALL_DTYPES(SPECIALIZE_DtypeToString)
-#undef SPECIALIZE_DtypeToString
-  return nullptr;
 }

 } // namespace mlx::core
--- a/mlx/backend/cuda/utils.h
+++ b/mlx/backend/cuda/utils.h
@@ -4,6 +4,7 @@

 #pragma once

+#include <cuda.h>
 #include <cuda_runtime.h>

 namespace mlx::core {
@@ -33,6 +34,7 @@ class CudaStream {

 // Throw exception if the cuda API does not succeed.
 void check_cuda_error(const char* name, cudaError_t err);
+void check_cuda_error(const char* name, CUresult err);

 // The macro version that prints the command that failed.
 #define CHECK_CUDA_ERROR(cmd) check_cuda_error(#cmd, (cmd))
--- a/mlx/backend/cuda/worker.cpp
+++ b/mlx/backend/cuda/worker.cpp
@@ -80,7 +80,9 @@ void Worker::thread_fn() {
      }
      worker_tasks_.erase(worker_tasks_.begin(), end);
    }
-    for (auto& task : tasks) {
+    // Make sure tasks are cleared before the next wait
+    for (int i = 0; i < tasks.size(); ++i) {
+      auto task = std::move(tasks[i]);
      task();
    }
    worker_event_.wait(batch + 1);
--- a/mlx/backend/metal/kernels/layer_norm.metal
+++ b/mlx/backend/metal/kernels/layer_norm.metal
@@ -31,6 +31,7 @@ inline void threadgroup_sum(
  for (int i = 0; i < N; i++) {
    x[i] = simd_sum(x[i]);
  }
+  threadgroup_barrier(mem_flags::mem_threadgroup);
  if (simd_lane_id == 0) {
    for (int i = 0; i < N; i++) {
      xs[N * simd_group_id + i] = x[i];
--- a/mlx/compile.cpp
+++ b/mlx/compile.cpp
@@ -245,6 +245,30 @@ void merge(array& dst, array& src, ParentsMap& parents_map) {
  }
 }

+// Split the parents of `x` in two. A new array `y` is created that repeats
+// the operation producing `x` (same shape, dtype, primitive and inputs).
+// Every parent whose id is in `divider` is rewired to take `y` as its input
+// and is moved from `x`'s entry in `parents_map` to `y`'s entry; parents not
+// in `divider` continue to refer to `x`. Returns `y`.
+array split_one(
+    const array& x,
+    ParentsMap& parents_map,
+    const std::unordered_set<uintptr_t>& divider) {
+  array y(x.shape(), x.dtype(), x.primitive_ptr(), x.inputs());
+
+  auto& x_parents = parents_map[x.id()];
+  auto& y_parents = parents_map[y.id()];
+
+  for (auto it = x_parents.begin(); it != x_parents.end();) {
+    if (divider.find(it->first.id()) != divider.end()) {
+      // Replace the input slot recorded for this parent (`it->second`) with
+      // `y`, then transfer the parent record from `x` to `y`.
+      it->first.inputs()[it->second] = y;
+      y_parents.emplace_back(std::move(*it));
+      it = x_parents.erase(it);
+    } else {
+      ++it;
+    }
+  }
+
+  // Return the local by name; wrapping it in std::move would only prevent
+  // copy elision.
+  return y;
+}
+
 template <typename T, typename... U>
 std::uintptr_t get_function_address(const std::function<T(U...)>& fun) {
  using FunType = T (*)(U...);
@@ -669,10 +693,16 @@ void compile_fuse(
      }

      // Arrays with a mix of parents outside the compilable section
-      // are not fusable
+      // are not fusable except for broadcast which we can split to avoid
+      // stopping fusion
      if (!all_parents_in) {
-        // Possible input
-        input_set.insert(a.id());
+        if (a.has_primitive() && is_broadcast(a.primitive())) {
+          array b = split_one(a, parents_map, cache);
+          recurse(b, depth, s, shape);
+        } else {
+          // Possible input
+          input_set.insert(a.id());
+        }
        return;
      }

--- a/mlx/dtype_utils.cpp
+++ b/mlx/dtype_utils.cpp
@@ -5,16 +5,38 @@
 namespace mlx::core {

 const char* dtype_to_string(Dtype arg) { // Returns the dtype's name as a string literal.
-  if (arg == bool_) {
-    return "bool";
+  switch (arg) {
+    case bool_:
+      return "bool";
+    case int8:
+      return "int8";
+    case int16:
+      return "int16";
+    case int32:
+      return "int32";
+    case int64:
+      return "int64";
+    case uint8:
+      return "uint8";
+    case uint16:
+      return "uint16";
+    case uint32:
+      return "uint32";
+    case uint64:
+      return "uint64";
+    case float16:
+      return "float16";
+    case bfloat16:
+      return "bfloat16";
+    case float32:
+      return "float32";
+    case float64:
+      return "float64";
+    case complex64:
+      return "complex64";
+    default: // Any dtype not listed above.
+      return "unknown"; // Placeholder text so callers always get a printable string.
   }
-#define SPECIALIZE_DtypeToString(CPP_TYPE, DTYPE) \
-  if (DTYPE == arg) {                             \
-    return #DTYPE;                                \
-  }
-  MLX_FORALL_DTYPES(SPECIALIZE_DtypeToString)
-#undef SPECIALIZE_DtypeToString
-  return "(unknown)";
 }

 } // namespace mlx::core
--- a/mlx/dtype_utils.h
+++ b/mlx/dtype_utils.h
@@ -1,207 +1,106 @@
 // Copyright © 2025 Apple Inc.
-// Copyright © Meta Platforms, Inc. and affiliates.
-//
-// This source code is licensed under the BSD-style license found in
-// https://github.com/pytorch/executorch/blob/main/LICENSE
-//
-// Forked from
-// https://github.com/pytorch/executorch/blob/main/runtime/core/exec_aten/util/scalar_type_util.h

 #pragma once

-#include "mlx/dtype.h"
+#include <sstream>

-#include <fmt/format.h>
+#include "mlx/dtype.h"
+#include "mlx/utils.h"

 namespace mlx::core {

 // Return string representation of dtype.
 const char* dtype_to_string(Dtype arg);

-// Macros that iterate across different subsets of Dtypes.
-//
-// For all of these macros, the final `_` parameter is the name of another macro
-// that takes two parameters: the name of a C type, and the name of the
-// corresponding Dtype enumerator.
-//
-// Note that these macros should use fully-qualified namespaces (starting with
-// `::`) to ensure that they can be called safely in any arbitrary namespace.
-#define MLX_FORALL_INT_TYPES(_) \
-  _(uint8_t, uint8)             \
-  _(uint16_t, uint16)           \
-  _(uint32_t, uint32)           \
-  _(uint64_t, uint64)           \
-  _(int8_t, int8)               \
-  _(int16_t, int16)             \
-  _(int32_t, int32)             \
-  _(int64_t, int64)
+#define MLX_INTERNAL_DTYPE_SWITCH_CASE(DTYPE, TYPE) \
+  case DTYPE:                                       \
+    f(type_identity<TYPE>{});                       \
+    break

-#define MLX_FORALL_FLOAT_TYPES(_) \
-  _(float16_t, float16)           \
-  _(float, float32)               \
-  _(double, float64)              \
-  _(bfloat16_t, bfloat16)
+#define MLX_INTERNAL_DTYPE_SWITCH_INTS()            \
+  MLX_INTERNAL_DTYPE_SWITCH_CASE(int8, int8_t);     \
+  MLX_INTERNAL_DTYPE_SWITCH_CASE(int16, int16_t);   \
+  MLX_INTERNAL_DTYPE_SWITCH_CASE(int32, int32_t);   \
+  MLX_INTERNAL_DTYPE_SWITCH_CASE(int64, int64_t);   \
+  MLX_INTERNAL_DTYPE_SWITCH_CASE(uint8, uint8_t);   \
+  MLX_INTERNAL_DTYPE_SWITCH_CASE(uint16, uint16_t); \
+  MLX_INTERNAL_DTYPE_SWITCH_CASE(uint32, uint32_t); \
+  MLX_INTERNAL_DTYPE_SWITCH_CASE(uint64, uint64_t)

-// Calls the provided macro on every Dtype, providing the C type and the
-// Dtype name to each call.
-//
-// @param _ A macro that takes two parameters: the name of a C type, and the
-//          name of the corresponding Dtype enumerator.
-#define MLX_FORALL_DTYPES(_) \
-  MLX_FORALL_INT_TYPES(_)    \
-  MLX_FORALL_FLOAT_TYPES(_)  \
-  _(bool, bool_)             \
-  _(complex64_t, complex64)
+#define MLX_INTERNAL_DTYPE_SWITCH_FLOATS()              \
+  MLX_INTERNAL_DTYPE_SWITCH_CASE(float16, float16_t);   \
+  MLX_INTERNAL_DTYPE_SWITCH_CASE(bfloat16, bfloat16_t); \
+  MLX_INTERNAL_DTYPE_SWITCH_CASE(float32, float);       \
+  MLX_INTERNAL_DTYPE_SWITCH_CASE(float64, double)

-// Maps Dtypes to C++ types.
-template <Dtype::Val N>
-struct DtypeToCppType;
-
-#define SPECIALIZE_DtypeToCppType(CPP_TYPE, DTYPE) \
-  template <>                                      \
-  struct DtypeToCppType<Dtype::Val::DTYPE> {       \
-    using type = CPP_TYPE;                         \
-  };
-
-MLX_FORALL_DTYPES(SPECIALIZE_DtypeToCppType)
-
-#undef SPECIALIZE_DtypeToCppType
-
-// Maps C++ types to Dtypes.
+// Tag type carrying T so a type can be passed to a generic lambda by value.
+// C++20 has std::type_identity (and templated lambdas make this unnecessary).
 template <typename T>
-struct CppTypeToDtype;
+struct type_identity {
+  using type = T; // Read back by MLX_GET_TYPE via decltype(x)::type.
+};

-#define SPECIALIZE_CppTypeToDtype(CPP_TYPE, DTYPE) \
-  template <>                                      \
-  struct CppTypeToDtype<CPP_TYPE>                  \
-      : std::integral_constant<Dtype::Val, Dtype::Val::DTYPE> {};
+#define MLX_GET_TYPE(x) typename decltype(x)::type
+#define MLX_GET_VALUE(x) decltype(x)::value

-MLX_FORALL_DTYPES(SPECIALIZE_CppTypeToDtype)
-
-#undef SPECIALIZE_CppTypeToDtype
-
-// Helper macros for switch case macros (see below)
-//
-// These macros are not meant to be used directly. They provide an easy way to
-// generate a switch statement that can handle subsets of Dtypes supported.
-
-#define MLX_INTERNAL_SWITCH_CASE(enum_type, CTYPE_ALIAS, ...)         \
-  case enum_type: {                                                   \
-    using CTYPE_ALIAS = ::mlx::core::DtypeToCppType<enum_type>::type; \
-    __VA_ARGS__;                                                      \
-    break;                                                            \
+template <typename F> // F: callable accepting a type_identity<T> tag.
+void dispatch_all_types(Dtype dt, F&& f) { // Calls f with the tag for the C++ type matching dt.
+  switch (dt) { // No default case: a dt outside the listed cases calls nothing.
+    MLX_INTERNAL_DTYPE_SWITCH_CASE(bool_, bool); // The case macros call the parameter by the name `f`.
+    MLX_INTERNAL_DTYPE_SWITCH_INTS();
+    MLX_INTERNAL_DTYPE_SWITCH_FLOATS();
+    MLX_INTERNAL_DTYPE_SWITCH_CASE(complex64, complex64_t);
   }
+}

-#define MLX_INTERNAL_SWITCH_CHECKED(TYPE, NAME, ...)                  \
-  switch (TYPE) {                                                     \
-    __VA_ARGS__                                                       \
-    default:                                                          \
-      throw std::invalid_argument(fmt::format(                        \
-          "Unhandled dtype %s for %s", dtype_to_string(TYPE), NAME)); \
+// Invoke `f` with a type_identity tag for the C++ integer type matching `dt`.
+//
+// Throws std::invalid_argument, with `tag` at the start of the message, when
+// `dt` is not one of the signed or unsigned integer dtypes.
+template <typename F>
+void dispatch_int_types(Dtype dt, std::string_view tag, F&& f) {
+  // The case macros refer to the callable as `f`, so that name must be kept.
+  switch (dt) {
+    MLX_INTERNAL_DTYPE_SWITCH_INTS();
+    default: {
+      std::ostringstream err;
+      err << tag << " Only integer types supported but " << dt;
+      err << " was provided";
+      throw std::invalid_argument(err.str());
+    }
   }
+}

-#define MLX_INTERNAL_SWITCH_CASE_INT_TYPES(CTYPE_ALIAS, ...)     \
-  MLX_INTERNAL_SWITCH_CASE(                                      \
-      ::mlx::core::Dtype::Val::uint8, CTYPE_ALIAS, __VA_ARGS__)  \
-  MLX_INTERNAL_SWITCH_CASE(                                      \
-      ::mlx::core::Dtype::Val::uint16, CTYPE_ALIAS, __VA_ARGS__) \
-  MLX_INTERNAL_SWITCH_CASE(                                      \
-      ::mlx::core::Dtype::Val::uint32, CTYPE_ALIAS, __VA_ARGS__) \
-  MLX_INTERNAL_SWITCH_CASE(                                      \
-      ::mlx::core::Dtype::Val::uint64, CTYPE_ALIAS, __VA_ARGS__) \
-  MLX_INTERNAL_SWITCH_CASE(                                      \
-      ::mlx::core::Dtype::Val::int8, CTYPE_ALIAS, __VA_ARGS__)   \
-  MLX_INTERNAL_SWITCH_CASE(                                      \
-      ::mlx::core::Dtype::Val::int16, CTYPE_ALIAS, __VA_ARGS__)  \
-  MLX_INTERNAL_SWITCH_CASE(                                      \
-      ::mlx::core::Dtype::Val::int32, CTYPE_ALIAS, __VA_ARGS__)  \
-  MLX_INTERNAL_SWITCH_CASE(                                      \
-      ::mlx::core::Dtype::Val::int64, CTYPE_ALIAS, __VA_ARGS__)
+// Invoke `f` with a type_identity tag for the C++ floating point type
+// matching `dt` (float16, bfloat16, float32 or float64).
+//
+// Throws std::invalid_argument, with `tag` at the start of the message, for
+// any other dtype.
+template <typename F>
+void dispatch_float_types(Dtype dt, std::string_view tag, F&& f) {
+  // The case macros refer to the callable as `f`, so that name must be kept.
+  switch (dt) {
+    MLX_INTERNAL_DTYPE_SWITCH_FLOATS();
+    default: {
+      std::ostringstream err;
+      err << tag << " Only float types supported but " << dt;
+      err << " was provided";
+      throw std::invalid_argument(err.str());
+    }
+  }
+}

-#define MLX_INTERNAL_SWITCH_CASE_FLOAT_TYPES(CTYPE_ALIAS, ...)    \
-  MLX_INTERNAL_SWITCH_CASE(                                       \
-      ::mlx::core::Dtype::Val::float16, CTYPE_ALIAS, __VA_ARGS__) \
-  MLX_INTERNAL_SWITCH_CASE(                                       \
-      ::mlx::core::Dtype::Val::float32, CTYPE_ALIAS, __VA_ARGS__) \
-  MLX_INTERNAL_SWITCH_CASE(                                       \
-      ::mlx::core::Dtype::Val::float64, CTYPE_ALIAS, __VA_ARGS__) \
-  MLX_INTERNAL_SWITCH_CASE(                                       \
-      ::mlx::core::Dtype::Val::bfloat16, CTYPE_ALIAS, __VA_ARGS__)
+// Dispatch `f` on an integer or floating point dtype.
+//
+// The case labels are generated by MLX_INTERNAL_DTYPE_SWITCH_INTS() and
+// MLX_INTERNAL_DTYPE_SWITCH_FLOATS(), both defined outside this hunk.
+// Presumably each case invokes `f` with a type tag for the matching C++ type
+// (callers recover the type with MLX_GET_TYPE) -- confirm against the macro
+// definitions.
+//
+// Any dtype without a case label throws std::invalid_argument; the message
+// starts with `tag` and names the offending dtype.
+template <typename F>
+void dispatch_int_float_types(Dtype dt, std::string_view tag, F&& f) {
+  switch (dt) {
+    MLX_INTERNAL_DTYPE_SWITCH_INTS();
+    MLX_INTERNAL_DTYPE_SWITCH_FLOATS();
+    default:
+      std::ostringstream msg;
+      msg << tag << " Only integer and float types supported but " << dt
+          << " was provided";
+      throw std::invalid_argument(msg.str());
+  }
+}

-#define MLX_INTERNAL_SWITCH_CASE_INT_FLOAT_TYPES(CTYPE_ALIAS, ...) \
-  MLX_INTERNAL_SWITCH_CASE_INT_TYPES(CTYPE_ALIAS, __VA_ARGS__)     \
-  MLX_INTERNAL_SWITCH_CASE_FLOAT_TYPES(CTYPE_ALIAS, __VA_ARGS__)
-
-#define MLX_INTERNAL_SWITCH_CASE_REAL_TYPES(CTYPE_ALIAS, ...)        \
-  MLX_INTERNAL_SWITCH_CASE_INT_FLOAT_TYPES(CTYPE_ALIAS, __VA_ARGS__) \
-  MLX_INTERNAL_SWITCH_CASE(                                          \
-      ::mlx::core::Dtype::Val::bool_, CTYPE_ALIAS, __VA_ARGS__)
-
-#define MLX_INTERNAL_SWITCH_CASE_COMPLEX_TYPES(CTYPE_ALIAS, ...) \
-  MLX_INTERNAL_SWITCH_CASE(                                      \
-      ::mlx::core::Dtype::Val::complex64, CTYPE_ALIAS, __VA_ARGS__)
-
-#define MLX_INTERNAL_SWITCH_CASE_ALL_TYPES(CTYPE_ALIAS, ...)    \
-  MLX_INTERNAL_SWITCH_CASE_REAL_TYPES(CTYPE_ALIAS, __VA_ARGS__) \
-  MLX_INTERNAL_SWITCH_CASE_COMPLEX_TYPES(CTYPE_ALIAS, __VA_ARGS__)
-
-// Switch case macros
-//
-// These macros provide an easy way to generate switch statements that apply a
-// common lambda function to subsets of Dtypes supported by MLX.
-// The lambda function can type specialize to the ctype associated with the
-// Dtype being handled through an alias passed as the CTYPE_ALIAS argument.
-//
-// Arguments:
-//   - ADDITIONAL: Additional Dtype case to add
-//   - TYPE: The Dtype to handle through the switch statement
-//   - NAME: A name for this operation which will be used in error messages
-//   - CTYPE_ALIAS: A typedef for the ctype associated with the Dtype.
-//   - ...: A statement to be applied to each Dtype case
-//
-// An example usage is:
-//
-// MLX_SWITCH_ALL_TYPES(input.dtype(), CTYPE, {
-//   output.data<CTYPE>[0] = input.data<CTYPE>[0];
-// });
-//
-// Note that these can be nested as well:
-//
-// MLX_SWITCH_ALL_TYPES(input.dtype(), CTYPE_IN, {
-//   MLX_SWITCH_ALL_TYPES(output.dtype(), CTYPE_OUT, {
-//     output.data<CTYPE_OUT>[0] = input.data<CTYPE_IN>[0];
-//   });
-// });
-//
-// These macros are adapted from Dispatch.h in the ATen library. The primary
-// difference is that the CTYPE_ALIAS argument is exposed to users, which is
-// used to alias the ctype associated with the Dtype that is being handled.
-
-#define MLX_SWITCH_ALL_TYPES(TYPE, CTYPE_ALIAS, ...) \
-  switch (TYPE) { MLX_INTERNAL_SWITCH_CASE_ALL_TYPES(CTYPE_ALIAS, __VA_ARGS__) }
-
-#define MLX_SWITCH_INT_TYPES_CHECKED(TYPE, NAME, CTYPE_ALIAS, ...) \
-  MLX_INTERNAL_SWITCH_CHECKED(                                     \
-      TYPE,                                                        \
-      NAME,                                                        \
-      MLX_INTERNAL_SWITCH_CASE_INT_TYPES(CTYPE_ALIAS, __VA_ARGS__))
-
-#define MLX_SWITCH_FLOAT_TYPES_CHECKED(TYPE, NAME, CTYPE_ALIAS, ...) \
-  MLX_INTERNAL_SWITCH_CHECKED(                                       \
-      TYPE,                                                          \
-      NAME,                                                          \
-      MLX_INTERNAL_SWITCH_CASE_FLOAT_TYPES(CTYPE_ALIAS, __VA_ARGS__))
-
-#define MLX_SWITCH_INT_FLOAT_TYPES_CHECKED(TYPE, NAME, CTYPE_ALIAS, ...) \
-  MLX_INTERNAL_SWITCH_CHECKED(                                           \
-      TYPE,                                                              \
-      NAME,                                                              \
-      MLX_INTERNAL_SWITCH_CASE_INT_FLOAT_TYPES(CTYPE_ALIAS, __VA_ARGS__))
-
-#define MLX_SWITCH_REAL_TYPES_CHECKED(TYPE, NAME, CTYPE_ALIAS, ...) \
-  MLX_INTERNAL_SWITCH_CHECKED(                                      \
-      TYPE,                                                         \
-      NAME,                                                         \
-      MLX_INTERNAL_SWITCH_CASE_REAL_TYPES(CTYPE_ALIAS, __VA_ARGS__))
+// Dispatch `f` on a real dtype: bool_, or anything covered by the integer and
+// floating point case macros.
+//
+// The case labels are generated by MLX_INTERNAL_DTYPE_SWITCH_CASE(bool_, bool),
+// MLX_INTERNAL_DTYPE_SWITCH_INTS() and MLX_INTERNAL_DTYPE_SWITCH_FLOATS(), all
+// defined outside this hunk. Presumably each case invokes `f` with a type tag
+// for the matching C++ type (callers recover the type with MLX_GET_TYPE) --
+// confirm against the macro definitions.
+//
+// Any dtype without a case label throws std::invalid_argument; the message
+// starts with `tag` and names the offending dtype.
+template <typename F>
+void dispatch_real_types(Dtype dt, std::string_view tag, F&& f) {
+  switch (dt) {
+    MLX_INTERNAL_DTYPE_SWITCH_CASE(bool_, bool);
+    MLX_INTERNAL_DTYPE_SWITCH_INTS();
+    MLX_INTERNAL_DTYPE_SWITCH_FLOATS();
+    default:
+      std::ostringstream msg;
+      msg << tag << " Only real numbers supported but " << dt
+          << " was provided";
+      throw std::invalid_argument(msg.str());
+  }
+}

 } // namespace mlx::core
--- a/mlx/linalg.cpp
+++ b/mlx/linalg.cpp
@@ -688,7 +688,7 @@ array solve(const array& a, const array& b, StreamOrDevice s /* = {} */) {
    perm = expand_dims(perm, -1, s);
    take_axis -= 1;
  }
-  auto pb = take_along_axis(b, perm, take_axis);
+  auto pb = take_along_axis(b, perm, take_axis, s);
  auto y = solve_triangular(luf[1], pb, /* upper = */ false, s);
  return solve_triangular(luf[2], y, /* upper = */ true, s);
 }
--- a/mlx/utils.cpp
+++ b/mlx/utils.cpp
@@ -253,7 +253,9 @@ std::ostream& operator<<(std::ostream& os, const Dtype::Kind& k) {

 std::ostream& operator<<(std::ostream& os, array a) {
  a.eval();
-  MLX_SWITCH_ALL_TYPES(a.dtype(), CTYPE, print_array<CTYPE>(os, a));
+  dispatch_all_types(a.dtype(), [&](auto type_tag) {
+    print_array<MLX_GET_TYPE(type_tag)>(os, a);
+  });
  return os;
 }

@@ -321,8 +323,9 @@ void set_iinfo_limits(int64_t& min, uint64_t& max) {
 }

 iinfo::iinfo(Dtype dtype) : dtype(dtype) {
-  MLX_SWITCH_INT_TYPES_CHECKED(
-      dtype, "[iinfo]", CTYPE, set_iinfo_limits<CTYPE>(min, max));
+  dispatch_int_types(dtype, "[iinfo]", [&](auto type_tag) {
+    set_iinfo_limits<MLX_GET_TYPE(type_tag)>(min, max);
+  });
 }

 } // namespace mlx::core
--- a/mlx/version.h
+++ b/mlx/version.h
@@ -4,7 +4,7 @@

 #define MLX_VERSION_MAJOR 0
 #define MLX_VERSION_MINOR 26
-#define MLX_VERSION_PATCH 1
+#define MLX_VERSION_PATCH 2
 #define MLX_VERSION_NUMERIC \
  (100000 * MLX_VERSION_MAJOR + 1000 * MLX_VERSION_MINOR + MLX_VERSION_PATCH)

--- a/python/mlx/extension.py
+++ b/python/mlx/extension.py
@@ -53,11 +53,7 @@ class CMakeBuild(build_ext):
        # Set CMAKE_BUILD_PARALLEL_LEVEL to control the parallel build level
        # across all generators.
        if "CMAKE_BUILD_PARALLEL_LEVEL" not in os.environ:
-            # self.parallel is a Python 3 only way to set parallel jobs by hand
-            # using -j in the build_ext call, not supported by pip or PyPA-build.
-            if hasattr(self, "parallel") and self.parallel:
-                # CMake 3.12+ only.
-                build_args += [f"-j{self.parallel}"]
+            build_args += [f"-j{os.cpu_count()}"]

        build_temp = Path(self.build_temp) / ext.name
        if not build_temp.exists():
--- a/python/mlx/nn/layers/activations.py
+++ b/python/mlx/nn/layers/activations.py
@@ -546,7 +546,7 @@ class GELU(Module):

    See :func:`gelu`, :func:`gelu_approx` and :func:`gelu_fast_approx` for the
    functional equivalents and information regarding error bounds.
-    
+

    Args:
        approx ('none' | 'precise' | 'fast'): Which approximation to gelu to use if any.
@@ -554,20 +554,19 @@ class GELU(Module):

    def __init__(self, approx="none"):
        super().__init__()
-
-        if approx == "none":
-            self._act = gelu
-        elif approx == "precise" or approx == "tanh":
-            self._act = gelu_approx
-        elif approx == "fast":
-            self._act = gelu_fast_approx
-        else:
+        self._approx = approx
+        allowed = ["none", "precise", "tanh", "fast"]
+        if approx not in allowed:
            raise ValueError(
-                f"The approximation should be in ['none', 'precise', 'tanh', 'fast'] but '{approx}' was given"
+                f"The approximation should be in {allowed} but '{approx}' was given"
            )

    def __call__(self, x):
-        return self._act(x)
+        if self._approx == "none":
+            return gelu(x)
+        elif self._approx in ["precise", "tanh"]:
+            return gelu_approx(x)
+        return gelu_fast_approx(x)


@_make_activation_module(tanh)
--- a/python/mlx/nn/layers/base.py
+++ b/python/mlx/nn/layers/base.py
@@ -114,6 +114,12 @@ class Module(dict):
            super(Module, self).__setattr__(key, val)
            self.pop(key, None)

+    def __delattr__(self, name):
+        if (val := self.get(name, None)) is not None:
+            del self[name]
+        else:
+            super().__delattr__(name)
+
    def load_weights(
        self,
        file_or_weights: Union[str, List[Tuple[str, mx.array]]],
@@ -404,7 +410,7 @@ class Module(dict):
                            dst[k] = new_value
                        elif isinstance(current_value, (dict, list)):
                            apply(current_value, new_value)
-                        elif strict:
+                        elif strict and new_value != {}:
                            raise ValueError(
                                f"Received invalid type: {type(new_value).__name__}."
                            )
@@ -413,14 +419,14 @@ class Module(dict):
                            f'Module does not have sub-module named "{k}".'
                        )
            elif isinstance(modules, list):
-                for i in range(len(dst)):
+                for i in range(len(modules)):
                    current_value = dst[i]
                    new_value = modules[i]
                    if self.is_module(current_value) and self.is_module(new_value):
                        dst[i] = new_value
                    elif isinstance(current_value, (dict, list)):
                        apply(current_value, new_value)
-                    elif strict:
+                    elif strict and new_value != {}:
                        raise ValueError(
                            f"Received invalid type: {type(new_value).__name__}."
                        )
--- a/python/scripts/repair_cuda.sh
+++ b/python/scripts/repair_cuda.sh
@@ -0,0 +1,28 @@
+#!/bin/bash
+
+# Repair the CUDA wheel found in dist/ with auditwheel, then extend the RPATH
+# of the core extension so the cuBLAS and NVRTC libraries excluded from the
+# wheel are searched for under ../nvidia/ relative to the extension itself.
+
+# Abort on the first failing command, unset variable, or failed pipeline so a
+# failed repair can never be followed by patching or re-zipping a stale wheel.
+set -euo pipefail
+
+# The exclude patterns are quoted so they reach auditwheel verbatim instead of
+# being expanded by the shell against files in the current directory.
+auditwheel repair dist/* \
+  --plat manylinux_2_35_x86_64 \
+  --exclude 'libcublas*' \
+  --exclude 'libnvrtc*'
+
+cd wheelhouse
+repaired_wheel=$(find . -name "*.whl" -print -quit)
+unzip -q "${repaired_wheel}"
+core_so=$(find mlx -name "core*.so" -print -quit)
+rpath=$(patchelf --print-rpath "${core_so}")
+# \$ORIGIN must stay literal: it is written into the RPATH as-is.
+rpath="${rpath}:\$ORIGIN/../nvidia/cublas/lib:\$ORIGIN/../nvidia/cuda_nvrtc/lib"
+patchelf --force-rpath --set-rpath "${rpath}" "${core_so}"
+
+# Re-zip the repaired wheel
+zip -r -q "${repaired_wheel}" .
--- a/python/src/convert.cpp
+++ b/python/src/convert.cpp
@@ -205,6 +205,8 @@ nb::object to_scalar(mx::array& a) {
      return nb::cast(static_cast<float>(a.item<mx::bfloat16_t>()));
    case mx::complex64:
      return nb::cast(a.item<std::complex<float>>());
+    case mx::float64:
+      return nb::cast(a.item<double>());
    default:
      throw nb::type_error("type cannot be converted to Python scalar.");
  }
--- a/python/src/fast.cpp
+++ b/python/src/fast.cpp
@@ -175,11 +175,12 @@ void init_fast(nb::module_& parent_module) {
        * `Grouped Query Attention <https://arxiv.org/abs/2305.13245>`_
        * `Multi-Query Attention <https://arxiv.org/abs/1911.02150>`_

-        Note: The softmax operation is performed in ``float32`` regardless of
-        the input precision.
+        .. note::

-        Note: For Grouped Query Attention and Multi-Query Attention, the ``k``
-        and ``v`` inputs should not be pre-tiled to match ``q``.
+          * The softmax operation is performed in ``float32`` regardless of
+            the input precision.
+          * For Grouped Query Attention and Multi-Query Attention, the ``k``
+            and ``v`` inputs should not be pre-tiled to match ``q``.

        In the following the dimensions are given by:

@@ -195,13 +196,30 @@ void init_fast(nb::module_& parent_module) {
            k (array): Keys with shape ``[B, N_kv, T_kv, D]``.
            v (array): Values with shape ``[B, N_kv, T_kv, D]``.
            scale (float): Scale for queries (typically ``1.0 / sqrt(q.shape(-1)``)
-            mask (Union[None, str, array], optional): A causal, boolean or additive
-               mask to apply to the query-key scores. The mask can have at most 4
-               dimensions and must be broadcast-compatible with the shape
-               ``[B, N, T_q, T_kv]``. If an additive mask is given its type must
-               promote to the promoted type of ``q``, ``k``, and ``v``.
+            mask (Union[None, str, array], optional): The mask to apply to the
+               query-key scores. The mask can be an array or a string indicating
+               the mask type. The only supported string type is ``"causal"``. If
+               the mask is an array it can be a boolean or additive mask. The mask
+               can have at most 4 dimensions and must be broadcast-compatible with
+               the shape ``[B, N, T_q, T_kv]``. If an additive mask is given its
+               type must promote to the promoted type of ``q``, ``k``, and ``v``.
        Returns:
            array: The output array.
+
+        Example:
+
+          .. code-block:: python
+
+            B = 2
+            N_q = N_kv = 32
+            T_q = T_kv = 1000
+            D = 128
+
+            q = mx.random.normal(shape=(B, N_q, T_q, D))
+            k = mx.random.normal(shape=(B, N_kv, T_kv, D))
+            v = mx.random.normal(shape=(B, N_kv, T_kv, D))
+            scale = D ** -0.5
+            out = mx.fast.scaled_dot_product_attention(q, k, v, scale=scale, mask="causal")
      )pbdoc");

  m.def(
--- a/python/tests/cuda_skip.py
+++ b/python/tests/cuda_skip.py
@@ -1,43 +1,15 @@
 cuda_skip = {
-    "TestArray.test_api",
-    "TestAutograd.test_update_state",
-    "TestBF16.test_arg_reduction_ops",
-    "TestBF16.test_reduction_ops",
-    "TestBlas.test_complex_gemm",
-    "TestCompile.test_compile_dynamic_dims",
-    "TestEinsum.test_ellipses",
-    "TestEinsum.test_opt_einsum_test_cases",
    "TestLoad.test_load_f8_e4m3",
-    "TestMemory.test_memory_info",
-    "TestLayers.test_group_norm",
-    "TestLayers.test_pooling",
    "TestLayers.test_quantized_embedding",
-    "TestLayers.test_sin_pe",
-    "TestLayers.test_upsample",
-    "TestOps.test_array_equal",
-    "TestOps.test_complex_ops",
    "TestOps.test_dynamic_slicing",
-    "TestOps.test_softmax",
-    "TestOps.test_sort",
-    "TestOps.test_tile",
-    "TestReduce.test_axis_permutation_sums",
    "TestReduce.test_dtypes",
-    "TestReduce.test_expand_sums",
-    "TestReduce.test_many_reduction_axes",
-    "TestUpsample.test_torch_upsample",
-    # DivMod NYI
-    "TestOps.test_divmod",
-    "TestEval.test_multi_output_eval_during_transform",
-    # Partition NYI
-    "TestAutograd.test_topk_grad",
-    "TestOps.test_argpartition",
-    "TestOps.test_partition",
    # Block masked matmul NYI
    "TestBlas.test_block_masked_matmul",
    # Gather matmul NYI
    "TestBlas.test_gather_matmul",
    "TestBlas.test_gather_matmul_grad",
    # Scan NYI
+    "TestArray.test_api",
    "TestAutograd.test_cumprod_grad",
    "TestOps.test_scans",
    "TestOps.test_logcumsumexp",
--- a/python/tests/mlx_tests.py
+++ b/python/tests/mlx_tests.py
@@ -1,6 +1,10 @@
 # Copyright © 2023 Apple Inc.

 import os
+
+# Use regular fp32 precision for tests
+os.environ["MLX_ENABLE_TF32"] = "0"
+
 import platform
 import unittest
 from typing import Any, Callable, List, Tuple, Union
--- a/python/tests/test_compile.py
+++ b/python/tests/test_compile.py
@@ -2,8 +2,10 @@

 import gc
 import io
+import math
 import unittest
 from functools import partial
+from io import StringIO

 import mlx.core as mx
 import mlx_tests
@@ -979,6 +981,39 @@ class TestCompile(mlx_tests.MLXTestCase):

        self.assertEqual(mem_pre, mem_post)

+    def test_double_constant(self):
+        with mx.stream(mx.cpu):
+            x = mx.array(1.0, dtype=mx.float64)
+
+            def fun(x):
+                return (x + math.pi) * 2.0
+
+            y = fun(x).item()
+            y_compiled = mx.compile(fun)(x).item()
+            self.assertEqual(y, y_compiled)
+
+    def test_shared_broadcast(self):
+        def fun(x, y, z):
+            yy = mx.broadcast_to(y, z.shape)
+            return (x + yy * z), yy.sum()
+
+        a = mx.random.normal((10, 10))
+        b = mx.array(0.1)
+        c = mx.random.normal((10, 10))
+        mx.eval(a, b, c)
+        fc = mx.compile(fun)
+        d = fc(a, b, c)
+
+        s = StringIO()
+        mx.export_to_dot(s, a=a, b=b, c=c, d1=d[0], d2=d[1])
+        s.seek(0)
+        s = s.read()
+
+        self.assertTrue("CompiledBroadcastMultiplyAdd" in s)
+        d_hat = fun(a, b, c)
+        self.assertTrue(mx.allclose(d[0], d_hat[0]))
+        self.assertTrue(mx.allclose(d[1], d_hat[1]))
+

 if __name__ == "__main__":
    mlx_tests.MLXTestRunner()
--- a/python/tests/test_load.py
+++ b/python/tests/test_load.py
@@ -391,9 +391,11 @@ class TestLoad(mlx_tests.MLXTestCase):
        scale = mx.array(2.0)
        y = mx.load(save_file)
        mx.eval(y)
+        mx.synchronize()
        load_only = mx.get_peak_memory()
        y = mx.load(save_file) * scale
        mx.eval(y)
+        mx.synchronize()
        load_with_binary = mx.get_peak_memory()

        self.assertEqual(load_only, load_with_binary)
--- a/python/tests/test_nn.py
+++ b/python/tests/test_nn.py
@@ -259,6 +259,26 @@ class TestBase(mlx_tests.MLXTestCase):
        with self.assertRaises(ValueError):
            m = m.update_modules({"list": ["hi"]})

+        # Allow updating a strict subset
+        m = nn.Sequential(nn.Linear(3, 3), nn.Linear(3, 3))
+        m.update_modules({"layers": [{}, nn.Linear(3, 4)]})
+        self.assertEqual(m.layers[1].weight.shape, (4, 3))
+
+        # Using leaf_modules in the update should always work
+        class MyModel(nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.stuff = [nn.Linear(2, 2), 0, nn.Linear(2, 2)]
+                self.more_stuff = {"hi": nn.Linear(2, 2), "bye": 0}
+
+        m = MyModel()
+        m.update_modules(m.leaf_modules())
+
+    def test_parameter_deletion(self):
+        m = nn.Linear(32, 32)
+        del m.weight
+        self.assertFalse(hasattr(m, "weight"))
+

 class TestLayers(mlx_tests.MLXTestCase):
    def test_identity(self):
--- a/setup.py
+++ b/setup.py
@@ -97,11 +97,7 @@ class CMakeBuild(build_ext):
        # Set CMAKE_BUILD_PARALLEL_LEVEL to control the parallel build level
        # across all generators.
        if "CMAKE_BUILD_PARALLEL_LEVEL" not in os.environ:
-            # self.parallel is a Python 3 only way to set parallel jobs by hand
-            # using -j in the build_ext call, not supported by pip or PyPA-build.
-            if hasattr(self, "parallel") and self.parallel:
-                # CMake 3.12+ only.
-                build_args += [f"-j{self.parallel}"]
+            build_args += [f"-j{os.cpu_count()}"]

        build_temp = Path(self.build_temp) / ext.name
        if not build_temp.exists():
@@ -174,20 +170,26 @@ if __name__ == "__main__":
    )
    package_dir = {"": "python"}
    package_data = {"mlx": ["lib/*", "include/*", "share/*"], "mlx.core": ["*.pyi"]}
+    install_requires = []
+    build_cuda = "MLX_BUILD_CUDA=ON" in os.environ.get("CMAKE_ARGS", "")
+    if build_cuda:
+        install_requires = ["nvidia-cublas-cu12", "nvidia-cuda-nvrtc-cu12"]

    setup(
-        name="mlx",
+        name="mlx-cuda" if build_cuda else "mlx",
        version=get_version(),
        author="MLX Contributors",
        author_email="mlx@group.apple.com",
        description="A framework for machine learning on Apple silicon.",
        long_description=long_description,
        long_description_content_type="text/markdown",
+        license="MIT",
        url="https://github.com/ml-explore/mlx",
        packages=packages,
        package_dir=package_dir,
        package_data=package_data,
        include_package_data=True,
+        install_requires=install_requires,
        extras_require={
            "dev": [
                "nanobind==2.4.0",
Author	SHA1	Message	Date
Awni Hannun	a4fcc893cd	auto build linux release (#2341 )	2025-07-07 09:29:23 -07:00
Cheng	9d10239af7	[CUDA] Do vectorized store/load in binary ops (#2330 )	2025-07-07 08:44:14 -07:00
Cheng	19facd4b20	Build with all cpu cores by default (#2336 )	2025-07-07 06:06:45 -07:00
Angelos Katharopoulos	f5299f72cd	Fix layernorm race condition (#2340 )	2025-07-07 06:06:01 -07:00
Cheng	0e0d9ac522	[CUDA] Add MLX_CUDA_GRAPH_CACHE_SIZE env for setting graph cache size (#2329 )	2025-07-05 08:33:29 -07:00
Awni Hannun	8917022deb	fix graphs for older cuda (#2328 )	2025-07-02 19:37:58 -07:00
Awni Hannun	ec0d5db67b	[CUDA] Switch to CUDA graphs (#2317 ) * cuda graph prototype fix signal bug + start to add dependencies capture more capture more ops remaining ops fix reduce and rope deps add concurrent context try update, but not working cosistent topology order use node api use node api directly to reduce overhead fix bug use kernels in unary cache graph format fix synchronization format * comment	2025-07-02 15:59:13 -07:00
Cheng	e76e9b87f0	Fix compilation error from integral_constant (#2326 )	2025-07-02 06:04:38 -07:00
Awni Hannun	cfb6a244ea	allow parameters to be deleted (#2325 )	2025-07-01 21:27:23 -07:00
Awni Hannun	58f3860306	patch bump (#2324 )	2025-07-01 12:12:16 -07:00
Awni Hannun	dd4f53db63	use fp32 for testing, add more complex ops (#2322 )	2025-07-01 07:30:00 -07:00
Angelos Katharopoulos	3d5e17e507	MLX_SWITCH macros to templates (#2320 )	2025-07-01 01:33:44 -07:00
Awni Hannun	33bf1a244b	Fix module update in strict mode (#2321 ) * fix module update in strict mode * allow GELU to be pickled	2025-06-29 11:12:29 -07:00
Angelos Katharopoulos	772f471ff2	[CUDA] Fix reductions (#2314 )	2025-06-27 12:59:20 -07:00
Angelos Katharopoulos	2c11d10f8d	Split broadcast so it is always fused in compile (#2318 )	2025-06-26 22:08:18 -07:00
Angelos Katharopoulos	656ed7f780	Fix get 2d grid dims (#2316 )	2025-06-25 13:03:09 -07:00
Awni Hannun	81bb9a2a9e	Compile float64 functions on CPU (#2311 )	2025-06-24 10:18:52 -07:00
Angelos Katharopoulos	5adf185f86	Fix `update_modules()` when providing a subset (#2308 )	2025-06-20 17:19:46 -07:00
Awni Hannun	c9a9180584	Cuda perf tuning (#2307 ) * perf tuning * fix adding inputs arrays in matmul / srot * format * fix	2025-06-20 14:50:57 -07:00
Awni Hannun	76831ed83d	Build CUDA release in Circle (#2306 ) * cuda release * add license	2025-06-19 15:26:36 -07:00
Angelos Katharopoulos	b3d7b85376	Make ptx cache settable by environment variable (#2304 )	2025-06-17 23:55:56 -07:00
Awni Hannun	cad5c0241c	[CUDA] synch properly waits for all tasks to finish and clear (#2303 ) * cuda synch properly waits for all tasks to finish and clear * fix copy	2025-06-17 12:03:25 -07:00
Awni Hannun	b8022c578a	divmod, partition, sort fixes (#2302 )	2025-06-16 18:49:32 -07:00