Add gradient for the scales and biases in gather_qmm

Improve the gradient of gather_qmm as well
Disable the test for CUDA
2025-12-16 01:49:05 +08:00 · 2025-07-05 00:58:17 -07:00 · 2025-07-04 20:23:58 -07:00 · 2025-07-04 19:17:45 -07:00 · 2025-07-04 18:36:20 -07:00 · 2025-07-04 13:16:54 -07:00
53 changed files with 1325 additions and 1798 deletions
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -41,7 +41,7 @@ jobs:
            pip install --upgrade pip
            pip install --upgrade cmake
            pip install -r docs/requirements.txt
-            pip install . -v
+            CMAKE_BUILD_PARALLEL_LEVEL=`sysctl -n hw.ncpu` pip install . -v
      - when:
          condition:
            not: << parameters.upload-docs >>
@@ -97,8 +97,10 @@ jobs:
          name: Install Python package
          command: |
            CMAKE_ARGS="-DMLX_BUILD_METAL=OFF" \
+              CMAKE_BUILD_PARALLEL_LEVEL=`nproc` \
              python3 setup.py build_ext --inplace
            CMAKE_ARGS="-DMLX_BUILD_METAL=OFF" \
+              CMAKE_BUILD_PARALLEL_LEVEL=`nproc` \
              python3 setup.py develop
      - run:
          name: Generate package stubs
@@ -155,7 +157,8 @@ jobs:
          name: Install Python package
          command: |
            source env/bin/activate
-            DEBUG=1 CMAKE_ARGS="-DCMAKE_COMPILE_WARNING_AS_ERROR=ON" \
+            DEBUG=1 CMAKE_BUILD_PARALLEL_LEVEL=`sysctl -n hw.ncpu` \
+            CMAKE_ARGS="-DCMAKE_COMPILE_WARNING_AS_ERROR=ON" \
              pip install -e . -v
      - run:
          name: Generate package stubs
@@ -205,7 +208,8 @@ jobs:
          name: Run Python tests with JIT
          command: |
            source env/bin/activate
-            CMAKE_ARGS="-DMLX_METAL_JIT=ON" \
+            CMAKE_BUILD_PARALLEL_LEVEL=`sysctl -n hw.ncpu` \
+              CMAKE_ARGS="-DMLX_METAL_JIT=ON" \
              pip install -e . -v
            LOW_MEMORY=1 DEVICE=gpu METAL_DEVICE_WRAPPER_TYPE=1 \
              METAL_DEBUG_ERROR_MODE=0 \
@@ -224,7 +228,8 @@ jobs:
            sudo apt-get install libblas-dev liblapack-dev liblapacke-dev
            python -m venv env
            source env/bin/activate
-            CMAKE_ARGS="-DMLX_BUILD_CUDA=ON -DCMAKE_CUDA_COMPILER=`which nvcc`" \
+            CMAKE_BUILD_PARALLEL_LEVEL=`nproc` \
+              CMAKE_ARGS="-DMLX_BUILD_CUDA=ON -DCMAKE_CUDA_COMPILER=`which nvcc`" \
              pip install -e ".[dev]"
      - run:
          name: Run Python tests
@@ -273,6 +278,7 @@ jobs:
          command: |
            source env/bin/activate
            env -u MACOSX_DEPLOYMENT_TARGET DEV_RELEASE=1 \
+              CMAKE_BUILD_PARALLEL_LEVEL=`sysctl -n hw.ncpu` \
              pip install . -v
      - run:
          name: Generate package stubs
@@ -284,7 +290,9 @@ jobs:
          name: Build Python package
          command: |
            source env/bin/activate
-            << parameters.build_env >> python -m build -w
+            << parameters.build_env >> \
+              CMAKE_BUILD_PARALLEL_LEVEL=`sysctl -n hw.ncpu` \
+              python -m build -w
      - when:
          condition: << parameters.build_env >>
          steps:
@@ -332,10 +340,14 @@ jobs:
            pip install patchelf
            pip install build
            pip install twine
-            << parameters.extra_env >> pip install . -v
+            << parameters.extra_env >> \
+              CMAKE_BUILD_PARALLEL_LEVEL=`nproc` \
+              pip install . -v
            pip install typing_extensions
            python setup.py generate_stubs
-            << parameters.extra_env >> python -m build --wheel
+            << parameters.extra_env >> \
+              CMAKE_BUILD_PARALLEL_LEVEL=`nproc` \
+              python -m build --wheel
            auditwheel show dist/*
            auditwheel repair dist/* --plat manylinux_2_31_x86_64
      - run:
@@ -371,10 +383,12 @@ jobs:
            pip install build
            pip install twine
            << parameters.extra_env >> \
+              CMAKE_BUILD_PARALLEL_LEVEL=`nproc` \
              CMAKE_ARGS="-DMLX_BUILD_CUDA=ON -DCMAKE_CUDA_COMPILER=`which nvcc`" \
              pip install ".[dev]" -v
            python setup.py generate_stubs
            << parameters.extra_env >> \
+              CMAKE_BUILD_PARALLEL_LEVEL=`nproc` \
              CMAKE_ARGS="-DMLX_BUILD_CUDA=ON -DCMAKE_CUDA_COMPILER=`which nvcc`" \
              python -m build --wheel
            bash python/scripts/repair_cuda.sh
@@ -492,16 +506,6 @@ workflows:
            branches:
              ignore: /.*/
          upload-docs: true
-      - build_linux_release:
-          filters:
-            tags:
-              only: /^v.*/
-            branches:
-              ignore: /.*/
-          matrix:
-            parameters:
-              python_version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
-              extra_env: ["PYPI_RELEASE=1"]

  prb:
    when:
--- a/docs/src/install.rst
+++ b/docs/src/install.rst
@@ -88,20 +88,20 @@ Then simply build and install MLX using pip:

 .. code-block:: shell

-  pip install .
+  CMAKE_BUILD_PARALLEL_LEVEL=8 pip install .

 For developing, install the package with development dependencies, and use an
 editable install:

 .. code-block:: shell

-  pip install -e ".[dev]"
+  CMAKE_BUILD_PARALLEL_LEVEL=8 pip install -e ".[dev]"

 Once the development dependencies are installed, you can build faster with:

 .. code-block:: shell

- python setup.py build_ext --inplace
+ CMAKE_BUILD_PARALLEL_LEVEL=8 python setup.py build_ext --inplace

 Run the tests with:

@@ -262,7 +262,7 @@ When building either the Python or C++ APIs make sure to pass the cmake flag

 .. code-block:: shell

-  CMAKE_ARGS="-DMLX_BUILD_CUDA=ON" pip install -e ".[dev]"
+  CMAKE_BUILD_PARALLEL_LEVEL=8 CMAKE_ARGS="-DMLX_BUILD_CUDA=ON" pip install -e ".[dev]"

 To build the C++ package run:

--- a/mlx/backend/common/matmul.h
+++ b/mlx/backend/common/matmul.h
@@ -12,11 +12,16 @@ namespace mlx::core {
 inline std::tuple<Shape, Strides, Strides> collapse_batches(
    const array& a,
    const array& b) {
-  if (a.ndim() == 2) {
-    return {{1}, {0}, {0}};
+  // Get and check the shape for the batched dims
+  Shape A_bshape{a.shape().begin(), a.shape().end() - 2};
+  Shape B_bshape{b.shape().begin(), b.shape().end() - 2};
+  if (A_bshape != B_bshape) {
+    std::ostringstream msg;
+    msg << "[matmul] Got matrices with incorrectly broadcasted shapes: " << "A "
+        << a.shape() << ", B " << b.shape() << ".";
+    throw std::runtime_error(msg.str());
  }

-  Shape A_bshape{a.shape().begin(), a.shape().end() - 2};
  Strides A_bstride{a.strides().begin(), a.strides().end() - 2};
  Strides B_bstride{b.strides().begin(), b.strides().end() - 2};

@@ -37,11 +42,17 @@ inline std::tuple<Shape, Strides, Strides> collapse_batches(

 inline std::tuple<Shape, Strides, Strides, Strides>
 collapse_batches(const array& a, const array& b, const array& c) {
-  if (a.ndim() == 2) {
-    return {{1}, {0}, {0}, {0}};
+  // Get and check the shape for the batched dims
+  Shape A_bshape{a.shape().begin(), a.shape().end() - 2};
+  Shape B_bshape{b.shape().begin(), b.shape().end() - 2};
+  Shape C_bshape{c.shape().begin(), c.shape().end() - 2};
+  if (A_bshape != B_bshape || A_bshape != C_bshape) {
+    std::ostringstream msg;
+    msg << "[addmm] Got matrices with incorrectly broadcasted shapes: " << "A "
+        << a.shape() << ", B " << b.shape() << ", C " << c.shape() << ".";
+    throw std::runtime_error(msg.str());
  }

-  Shape A_bshape{a.shape().begin(), a.shape().end() - 2};
  Strides A_bstride{a.strides().begin(), a.strides().end() - 2};
  Strides B_bstride{b.strides().begin(), b.strides().end() - 2};
  Strides C_bstride{c.strides().begin(), c.strides().end() - 2};
--- a/mlx/backend/cuda/arg_reduce.cu
+++ b/mlx/backend/cuda/arg_reduce.cu
@@ -1,7 +1,6 @@
 // Copyright © 2025 Apple Inc.
 #include "mlx/backend/common/utils.h"
 #include "mlx/backend/cuda/device.h"
-#include "mlx/backend/cuda/device/fp16_math.cuh"
 #include "mlx/backend/cuda/iterators/strided_iterator.cuh"
 #include "mlx/backend/cuda/kernel_utils.cuh"
 #include "mlx/dtype_utils.h"
@@ -152,29 +151,30 @@ void ArgReduce::eval_gpu(const std::vector<array>& inputs, array& out) {
  auto& encoder = cu::get_command_encoder(s);
  encoder.set_input_array(in);
  encoder.set_output_array(out);
-  dispatch_real_types(in.dtype(), "ArgReduce", [&](auto type_tag) {
-    using T = cuda_type_t<MLX_GET_TYPE(type_tag)>;
-    constexpr uint32_t N_READS = 4;
-    dispatch_block_dim(cuda::ceil_div(axis_size, N_READS), [&](auto block_dim) {
-      dim3 num_blocks = get_2d_grid_dims(out.shape(), out.strides());
-      auto kernel =
-          cu::arg_reduce_general<T, cu::ArgMax<T>, block_dim(), N_READS>;
-      if (reduce_type_ == ArgReduce::ArgMin) {
-        kernel = cu::arg_reduce_general<T, cu::ArgMin<T>, block_dim(), N_READS>;
-      }
-      encoder.add_kernel_node(
-          kernel,
-          num_blocks,
-          block_dim(),
-          in.data<T>(),
-          out.data<uint32_t>(),
-          out.size(),
-          const_param(shape),
-          const_param(in_strides),
-          const_param(out_strides),
-          ndim,
-          axis_stride,
-          axis_size);
+  encoder.launch_kernel([&](cudaStream_t stream) {
+    dispatch_real_types(in.dtype(), "ArgReduce", [&](auto type_tag) {
+      using T = cuda_type_t<MLX_GET_TYPE(type_tag)>;
+      constexpr uint32_t N_READS = 4;
+      dispatch_block_dim(
+          cuda::ceil_div(axis_size, N_READS), [&](auto block_dim) {
+            dim3 num_blocks = get_2d_grid_dims(out.shape(), out.strides());
+            auto kernel =
+                cu::arg_reduce_general<T, cu::ArgMax<T>, block_dim(), N_READS>;
+            if (reduce_type_ == ArgReduce::ArgMin) {
+              kernel = cu::
+                  arg_reduce_general<T, cu::ArgMin<T>, block_dim(), N_READS>;
+            }
+            kernel<<<num_blocks, block_dim(), 0, stream>>>(
+                in.data<T>(),
+                out.data<uint32_t>(),
+                out.size(),
+                const_param(shape),
+                const_param(in_strides),
+                const_param(out_strides),
+                ndim,
+                axis_stride,
+                axis_size);
+          });
    });
  });
 }
--- a/mlx/backend/cuda/binary.cu
+++ b/mlx/backend/cuda/binary.cu
@@ -17,106 +17,35 @@ namespace cu {

 namespace cg = cooperative_groups;

-template <typename Op, typename In, typename Out, typename IdxT, int N_READS>
+template <typename Op, typename In, typename Out, typename IdxT>
 __global__ void binary_ss(const In* a, const In* b, Out* out, IdxT size) {
  IdxT index = cg::this_grid().thread_rank();
-  int remaining = size - index * N_READS;
-  if (remaining <= 0) {
-    return;
-  }
-
-  if (remaining < N_READS) {
-    for (int i = 0; i < remaining; ++i) {
-      IdxT offset = index * N_READS + i;
-      out[offset] = Op{}(a[0], b[0]);
-    }
-  } else {
-    AlignedVector<Out, N_READS> out_vec;
-#pragma unroll
-    for (int i = 0; i < N_READS; ++i) {
-      out_vec.val[i] = Op{}(a[0], b[0]);
-    }
-
-    store_vector<N_READS>(out, index, out_vec);
+  if (index < size) {
+    out[index] = Op{}(a[0], b[0]);
  }
 }

-template <typename Op, typename In, typename Out, typename IdxT, int N_READS>
+template <typename Op, typename In, typename Out, typename IdxT>
 __global__ void binary_sv(const In* a, const In* b, Out* out, IdxT size) {
  IdxT index = cg::this_grid().thread_rank();
-  int remaining = size - index * N_READS;
-  if (remaining <= 0) {
-    return;
-  }
-
-  if (remaining < N_READS) {
-    for (int i = 0; i < remaining; ++i) {
-      IdxT offset = index * N_READS + i;
-      out[offset] = Op{}(a[0], b[offset]);
-    }
-  } else {
-    auto b_vec = load_vector<N_READS>(b, index);
-
-    AlignedVector<Out, N_READS> out_vec;
-#pragma unroll
-    for (int i = 0; i < N_READS; ++i) {
-      out_vec.val[i] = Op{}(a[0], b_vec.val[i]);
-    }
-
-    store_vector<N_READS>(out, index, out_vec);
+  if (index < size) {
+    out[index] = Op{}(a[0], b[index]);
  }
 }

-template <typename Op, typename In, typename Out, typename IdxT, int N_READS>
+template <typename Op, typename In, typename Out, typename IdxT>
 __global__ void binary_vs(const In* a, const In* b, Out* out, IdxT size) {
  IdxT index = cg::this_grid().thread_rank();
-  int remaining = size - index * N_READS;
-  if (remaining <= 0) {
-    return;
-  }
-
-  if (remaining < N_READS) {
-    for (int i = 0; i < remaining; ++i) {
-      IdxT offset = index * N_READS + i;
-      out[offset] = Op{}(a[offset], b[0]);
-    }
-  } else {
-    auto a_vec = load_vector<N_READS>(a, index);
-
-    AlignedVector<Out, N_READS> out_vec;
-#pragma unroll
-    for (int i = 0; i < N_READS; ++i) {
-      out_vec.val[i] = Op{}(a_vec.val[i], b[0]);
-    }
-
-    store_vector<N_READS>(out, index, out_vec);
+  if (index < size) {
+    out[index] = Op{}(a[index], b[0]);
  }
 }

-template <typename Op, typename In, typename Out, typename IdxT, int N_READS>
+template <typename Op, typename In, typename Out, typename IdxT>
 __global__ void binary_vv(const In* a, const In* b, Out* out, IdxT size) {
  IdxT index = cg::this_grid().thread_rank();
-  int remaining = size - index * N_READS;
-  if (remaining <= 0) {
-    return;
-  }
-
-  if (remaining < N_READS) {
-    for (int i = 0; i < remaining; ++i) {
-      IdxT offset = index * N_READS + i;
-      out[offset] = Op{}(a[offset], b[offset]);
-    }
-  } else {
-    auto a_vec = load_vector<N_READS>(a, index);
-    auto b_vec = load_vector<N_READS>(b, index);
-
-    AlignedVector<Out, N_READS> out_vec;
-#pragma unroll
-    for (int i = 0; i < N_READS; ++i) {
-      out_vec.val[i] = Op{}(a_vec.val[i], b_vec.val[i]);
-    }
-
-    store_vector<N_READS>(out, index, out_vec);
+  if (index < size) {
+    out[index] = Op{}(a[index], b[index]);
  }
 }

@@ -210,99 +139,90 @@ void binary_op_gpu_inplace(
  encoder.set_input_array(a);
  encoder.set_input_array(b);
  encoder.set_output_array(out);
-  dispatch_all_types(a.dtype(), [&](auto in_type_tag) {
-    dispatch_all_types(out.dtype(), [&](auto out_type_tag) {
-      using CTYPE_IN = MLX_GET_TYPE(in_type_tag);
-      using CTYPE_OUT = MLX_GET_TYPE(out_type_tag);
-      if constexpr (cu::supports_binary_op<Op, CTYPE_IN, CTYPE_OUT>()) {
-        using InType = cuda_type_t<CTYPE_IN>;
-        using OutType = cuda_type_t<CTYPE_OUT>;
-        auto bopt = get_binary_op_type(a, b);
-        if (bopt == BinaryOpType::General) {
-          dispatch_bool(
-              a.data_size() > INT32_MAX || b.data_size() > INT32_MAX ||
-                  out.data_size() > INT32_MAX,
-              [&](auto large) {
-                using IdxT = std::conditional_t<large(), int64_t, int32_t>;
-                Shape shape;
-                std::vector<Strides> strides;
-                std::tie(shape, strides) = collapse_contiguous_dims(a, b, out);
-                auto& a_strides = strides[0];
-                auto& b_strides = strides[1];
-                int ndim = shape.size();
-                if (ndim <= 3) {
-                  dispatch_1_2_3(ndim, [&](auto dims_constant) {
-                    auto kernel = cu::
-                        binary_g_nd<Op, InType, OutType, IdxT, dims_constant()>;
+  encoder.launch_kernel([&](cudaStream_t stream) {
+    dispatch_all_types(a.dtype(), [&](auto in_type_tag) {
+      dispatch_all_types(out.dtype(), [&](auto out_type_tag) {
+        using CTYPE_IN = MLX_GET_TYPE(in_type_tag);
+        using CTYPE_OUT = MLX_GET_TYPE(out_type_tag);
+        if constexpr (cu::supports_binary_op<Op, CTYPE_IN, CTYPE_OUT>()) {
+          using InType = cuda_type_t<CTYPE_IN>;
+          using OutType = cuda_type_t<CTYPE_OUT>;
+          auto bopt = get_binary_op_type(a, b);
+          if (bopt == BinaryOpType::General) {
+            dispatch_bool(
+                a.data_size() > INT32_MAX || b.data_size() > INT32_MAX ||
+                    out.data_size() > INT32_MAX,
+                [&](auto large) {
+                  using IdxT = std::conditional_t<large(), int64_t, int32_t>;
+                  Shape shape;
+                  std::vector<Strides> strides;
+                  std::tie(shape, strides) =
+                      collapse_contiguous_dims(a, b, out);
+                  auto& a_strides = strides[0];
+                  auto& b_strides = strides[1];
+                  int ndim = shape.size();
+                  if (ndim <= 3) {
+                    dispatch_1_2_3(ndim, [&](auto dims_constant) {
+                      auto kernel = cu::binary_g_nd<
+                          Op,
+                          InType,
+                          OutType,
+                          IdxT,
+                          dims_constant()>;
+                      auto [num_blocks, block_dims] =
+                          get_launch_args(kernel, out, large());
+                      kernel<<<num_blocks, block_dims, 0, stream>>>(
+                          a.data<InType>(),
+                          b.data<InType>(),
+                          out.data<OutType>(),
+                          out.size(),
+                          const_param<dims_constant()>(shape),
+                          const_param<dims_constant()>(a_strides),
+                          const_param<dims_constant()>(b_strides));
+                    });
+                  } else {
+                    auto kernel = cu::binary_g<Op, InType, OutType, IdxT>;
                    auto [num_blocks, block_dims] =
                        get_launch_args(kernel, out, large());
-                    encoder.add_kernel_node(
-                        kernel,
-                        num_blocks,
-                        block_dims,
+                    kernel<<<num_blocks, block_dims, 0, stream>>>(
                        a.data<InType>(),
                        b.data<InType>(),
                        out.data<OutType>(),
                        out.size(),
-                        const_param<dims_constant()>(shape),
-                        const_param<dims_constant()>(a_strides),
-                        const_param<dims_constant()>(b_strides));
-                  });
-                } else {
-                  auto kernel = cu::binary_g<Op, InType, OutType, IdxT>;
-                  auto [num_blocks, block_dims] =
-                      get_launch_args(kernel, out, large());
-                  encoder.add_kernel_node(
-                      kernel,
-                      num_blocks,
-                      block_dims,
-                      a.data<InType>(),
-                      b.data<InType>(),
-                      out.data<OutType>(),
-                      out.size(),
-                      const_param(shape),
-                      const_param(a_strides),
-                      const_param(b_strides),
-                      ndim);
-                }
-              });
+                        const_param(shape),
+                        const_param(a_strides),
+                        const_param(b_strides),
+                        ndim);
+                  }
+                });
+          } else {
+            dispatch_bool(out.data_size() > INT32_MAX, [&](auto large) {
+              using IdxT = std::conditional_t<large(), int64_t, uint32_t>;
+              auto kernel = cu::binary_ss<Op, InType, OutType, IdxT>;
+              if (bopt == BinaryOpType::ScalarVector) {
+                kernel = cu::binary_sv<Op, InType, OutType, IdxT>;
+              } else if (bopt == BinaryOpType::VectorScalar) {
+                kernel = cu::binary_vs<Op, InType, OutType, IdxT>;
+              } else if (bopt == BinaryOpType::VectorVector) {
+                kernel = cu::binary_vv<Op, InType, OutType, IdxT>;
+              }
+              auto [num_blocks, block_dims] = get_launch_args(
+                  kernel, out.data_size(), out.shape(), out.strides(), large());
+              kernel<<<num_blocks, block_dims, 0, stream>>>(
+                  a.data<InType>(),
+                  b.data<InType>(),
+                  out.data<OutType>(),
+                  out.data_size());
+            });
+          }
        } else {
-          dispatch_bool(out.data_size() > INT32_MAX, [&](auto large) {
-            using IdxT = std::conditional_t<large(), int64_t, uint32_t>;
-            // TODO: Choose optimized value based on type size.
-            constexpr int N_READS = 4;
-            auto kernel = cu::binary_ss<Op, InType, OutType, IdxT, N_READS>;
-            if (bopt == BinaryOpType::ScalarVector) {
-              kernel = cu::binary_sv<Op, InType, OutType, IdxT, N_READS>;
-            } else if (bopt == BinaryOpType::VectorScalar) {
-              kernel = cu::binary_vs<Op, InType, OutType, IdxT, N_READS>;
-            } else if (bopt == BinaryOpType::VectorVector) {
-              kernel = cu::binary_vv<Op, InType, OutType, IdxT, N_READS>;
-            }
-            auto [num_blocks, block_dims] = get_launch_args(
-                kernel,
-                out.data_size(),
-                out.shape(),
-                out.strides(),
-                large(),
-                N_READS);
-            encoder.add_kernel_node(
-                kernel,
-                num_blocks,
-                block_dims,
-                a.data<InType>(),
-                b.data<InType>(),
-                out.data<OutType>(),
-                out.data_size());
-          });
+          throw std::runtime_error(fmt::format(
+              "Can not do binary op {} on inputs of {} with result of {}.",
+              op,
+              dtype_to_string(a.dtype()),
+              dtype_to_string(out.dtype())));
        }
-      } else {
-        throw std::runtime_error(fmt::format(
-            "Can not do binary op {} on inputs of {} with result of {}.",
-            op,
-            dtype_to_string(a.dtype()),
-            dtype_to_string(out.dtype())));
-      }
+      });
    });
  });
 }
--- a/mlx/backend/cuda/binary_two.cu
+++ b/mlx/backend/cuda/binary_two.cu
@@ -137,101 +137,98 @@ void binary_op_gpu_inplace(
  encoder.set_input_array(b);
  encoder.set_output_array(out_a);
  encoder.set_output_array(out_b);
-  dispatch_all_types(a.dtype(), [&](auto in_type_tag) {
-    dispatch_all_types(out_a.dtype(), [&](auto out_type_tag) {
-      using CTYPE_IN = MLX_GET_TYPE(in_type_tag);
-      using CTYPE_OUT = MLX_GET_TYPE(out_type_tag);
-      if constexpr (cu::supports_binary_op<Op, CTYPE_IN, CTYPE_OUT>()) {
-        using InType = cuda_type_t<CTYPE_IN>;
-        using OutType = cuda_type_t<CTYPE_OUT>;
+  encoder.launch_kernel([&](cudaStream_t stream) {
+    dispatch_all_types(a.dtype(), [&](auto in_type_tag) {
+      dispatch_all_types(out_a.dtype(), [&](auto out_type_tag) {
+        using CTYPE_IN = MLX_GET_TYPE(in_type_tag);
+        using CTYPE_OUT = MLX_GET_TYPE(out_type_tag);
+        if constexpr (cu::supports_binary_op<Op, CTYPE_IN, CTYPE_OUT>()) {
+          using InType = cuda_type_t<CTYPE_IN>;
+          using OutType = cuda_type_t<CTYPE_OUT>;

-        auto bopt = get_binary_op_type(a, b);
-        if (bopt == BinaryOpType::General) {
-          dispatch_bool(
-              a.data_size() > INT32_MAX || b.data_size() > INT32_MAX ||
-                  out_a.data_size() > INT32_MAX,
-              [&](auto large) {
-                using IdxT = std::conditional_t<large(), int64_t, int32_t>;
-                Shape shape;
-                std::vector<Strides> strides;
-                std::tie(shape, strides) =
-                    collapse_contiguous_dims(a, b, out_a);
-                auto& a_strides = strides[0];
-                auto& b_strides = strides[1];
-                int ndim = shape.size();
-                if (ndim <= 3) {
-                  dispatch_1_2_3(ndim, [&](auto dims_constant) {
-                    auto kernel = cu::
-                        binary_g_nd<Op, InType, OutType, IdxT, dims_constant()>;
+          auto bopt = get_binary_op_type(a, b);
+          if (bopt == BinaryOpType::General) {
+            dispatch_bool(
+                a.data_size() > INT32_MAX || b.data_size() > INT32_MAX ||
+                    out_a.data_size() > INT32_MAX,
+                [&](auto large) {
+                  using IdxT = std::conditional_t<large(), int64_t, int32_t>;
+                  Shape shape;
+                  std::vector<Strides> strides;
+                  std::tie(shape, strides) =
+                      collapse_contiguous_dims(a, b, out_a);
+                  auto& a_strides = strides[0];
+                  auto& b_strides = strides[1];
+                  int ndim = shape.size();
+                  if (ndim <= 3) {
+                    dispatch_1_2_3(ndim, [&](auto dims_constant) {
+                      auto kernel = cu::binary_g_nd<
+                          Op,
+                          InType,
+                          OutType,
+                          IdxT,
+                          dims_constant()>;
+                      auto [num_blocks, block_dims] =
+                          get_launch_args(kernel, out_a, large());
+                      kernel<<<num_blocks, block_dims, 0, stream>>>(
+                          a.data<InType>(),
+                          b.data<InType>(),
+                          out_a.data<OutType>(),
+                          out_b.data<OutType>(),
+                          out_a.size(),
+                          const_param<dims_constant()>(shape),
+                          const_param<dims_constant()>(a_strides),
+                          const_param<dims_constant()>(b_strides));
+                    });
+                  } else {
+                    auto kernel = cu::binary_g<Op, InType, OutType, IdxT>;
                    auto [num_blocks, block_dims] =
                        get_launch_args(kernel, out_a, large());
-                    encoder.add_kernel_node(
-                        kernel,
-                        num_blocks,
-                        block_dims,
+                    kernel<<<num_blocks, block_dims, 0, stream>>>(
                        a.data<InType>(),
                        b.data<InType>(),
                        out_a.data<OutType>(),
                        out_b.data<OutType>(),
                        out_a.size(),
-                        const_param<dims_constant()>(shape),
-                        const_param<dims_constant()>(a_strides),
-                        const_param<dims_constant()>(b_strides));
-                  });
-                } else {
-                  auto kernel = cu::binary_g<Op, InType, OutType, IdxT>;
-                  auto [num_blocks, block_dims] =
-                      get_launch_args(kernel, out_a, large());
-                  encoder.add_kernel_node(
-                      kernel,
-                      num_blocks,
-                      block_dims,
-                      a.data<InType>(),
-                      b.data<InType>(),
-                      out_a.data<OutType>(),
-                      out_b.data<OutType>(),
-                      out_a.size(),
-                      const_param(shape),
-                      const_param(a_strides),
-                      const_param(b_strides),
-                      ndim);
-                }
-              });
+                        const_param(shape),
+                        const_param(a_strides),
+                        const_param(b_strides),
+                        ndim);
+                  }
+                });
+          } else {
+            dispatch_bool(out_a.data_size() > INT32_MAX, [&](auto large) {
+              using IdxT = std::conditional_t<large(), int64_t, uint32_t>;
+              auto kernel = cu::binary_ss<Op, InType, OutType, IdxT>;
+              if (bopt == BinaryOpType::ScalarVector) {
+                kernel = cu::binary_sv<Op, InType, OutType, IdxT>;
+              } else if (bopt == BinaryOpType::VectorScalar) {
+                kernel = cu::binary_vs<Op, InType, OutType, IdxT>;
+              } else if (bopt == BinaryOpType::VectorVector) {
+                kernel = cu::binary_vv<Op, InType, OutType, IdxT>;
+              }
+              auto [num_blocks, block_dims] = get_launch_args(
+                  kernel,
+                  out_a.data_size(),
+                  out_a.shape(),
+                  out_a.strides(),
+                  large());
+              kernel<<<num_blocks, block_dims, 0, stream>>>(
+                  a.data<InType>(),
+                  b.data<InType>(),
+                  out_a.data<OutType>(),
+                  out_b.data<OutType>(),
+                  out_a.data_size());
+            });
+          }
        } else {
-          dispatch_bool(out_a.data_size() > INT32_MAX, [&](auto large) {
-            using IdxT = std::conditional_t<large(), int64_t, uint32_t>;
-            auto kernel = cu::binary_ss<Op, InType, OutType, IdxT>;
-            if (bopt == BinaryOpType::ScalarVector) {
-              kernel = cu::binary_sv<Op, InType, OutType, IdxT>;
-            } else if (bopt == BinaryOpType::VectorScalar) {
-              kernel = cu::binary_vs<Op, InType, OutType, IdxT>;
-            } else if (bopt == BinaryOpType::VectorVector) {
-              kernel = cu::binary_vv<Op, InType, OutType, IdxT>;
-            }
-            auto [num_blocks, block_dims] = get_launch_args(
-                kernel,
-                out_a.data_size(),
-                out_a.shape(),
-                out_a.strides(),
-                large());
-            encoder.add_kernel_node(
-                kernel,
-                num_blocks,
-                block_dims,
-                a.data<InType>(),
-                b.data<InType>(),
-                out_a.data<OutType>(),
-                out_b.data<OutType>(),
-                out_a.data_size());
-          });
+          throw std::runtime_error(fmt::format(
+              "Can not do binary op {} on inputs of {} with result of {}.",
+              op,
+              dtype_to_string(a.dtype()),
+              dtype_to_string(out_a.dtype())));
        }
-      } else {
-        throw std::runtime_error(fmt::format(
-            "Can not do binary op {} on inputs of {} with result of {}.",
-            op,
-            dtype_to_string(a.dtype()),
-            dtype_to_string(out_a.dtype())));
-      }
+      });
    });
  });
 }
--- a/mlx/backend/cuda/compiled.cpp
+++ b/mlx/backend/cuda/compiled.cpp
@@ -3,7 +3,6 @@
 #include "mlx/backend/common/compiled.h"
 #include "mlx/backend/cuda/device.h"
 #include "mlx/backend/cuda/jit_module.h"
-#include "mlx/backend/cuda/kernel_utils.cuh"
 #include "mlx/graph_utils.h"
 #include "mlx/primitives.h"

@@ -179,7 +178,6 @@ void Compiled::eval_gpu(
  // Whether to use large index.
  bool large = compiled_use_large_index(inputs, outputs, contiguous);

-  cu::KernelArgs args;
  // Put inputs.
  int strides_index = 1;
  for (size_t i = 0; i < inputs.size(); ++i) {
@@ -187,26 +185,26 @@ void Compiled::eval_gpu(
      continue;
    }
    const auto& x = inputs[i];
-    args.append(x);
+    mod.append_arg(x);
    if (!contiguous && !is_scalar(x)) {
-      args.append_ptr(strides_vec[strides_index++].data());
+      mod.append_arg(strides_vec[strides_index++]);
    }
  }

  // Put outputs.
  compiled_allocate_outputs(inputs, outputs, is_constant_, contiguous);
  for (auto& x : outputs) {
-    args.append(x);
+    mod.append_arg(x);
  }

  // Put shape and size.
  if (!contiguous) {
-    args.append_ptr(shape.data());
+    mod.append_arg(shape);
  }
  if (large) {
-    args.append<int64_t>(outputs[0].data_size());
+    mod.append_arg<int64_t>(outputs[0].data_size());
  } else {
-    args.append<uint32_t>(outputs[0].data_size());
+    mod.append_arg<uint32_t>(outputs[0].data_size());
  }

  // Launch kernel.
@@ -224,10 +222,9 @@ void Compiled::eval_gpu(
  for (const auto& out : outputs) {
    encoder.set_output_array(out);
  }
-
-  auto kernel = mod.get_kernel(kernel_name);
-  auto [num_blocks, block_dims] = get_launch_args(kernel, outputs[0], large);
-  encoder.add_kernel_node(kernel, num_blocks, block_dims, args.args());
+  encoder.launch_kernel([&](cudaStream_t stream) {
+    mod.launch_kernel(stream, kernel_name, outputs[0], large);
+  });
 }

 } // namespace mlx::core
--- a/mlx/backend/cuda/copy/copy_contiguous.cu
+++ b/mlx/backend/cuda/copy/copy_contiguous.cu
@@ -35,25 +35,24 @@ void copy_contiguous(
    array& out,
    int64_t in_offset,
    int64_t out_offset) {
-  dispatch_all_types(in.dtype(), [&](auto in_type_tag) {
-    dispatch_all_types(out.dtype(), [&](auto out_type_tag) {
-      dispatch_bool(out.data_size() > UINT32_MAX, [&](auto large) {
-        using InType = cuda_type_t<MLX_GET_TYPE(in_type_tag)>;
-        using OutType = cuda_type_t<MLX_GET_TYPE(out_type_tag)>;
-        using IdxT = std::conditional_t<large(), int64_t, uint32_t>;
-        auto kernel = cu::copy_s<InType, OutType, IdxT>;
-        if (ctype == CopyType::Vector) {
-          kernel = cu::copy_v<InType, OutType, IdxT>;
-        }
-        auto [num_blocks, block_dims] = get_launch_args(
-            kernel, out.data_size(), out.shape(), out.strides(), large());
-        encoder.add_kernel_node(
-            kernel,
-            num_blocks,
-            block_dims,
-            in.data<InType>() + in_offset,
-            out.data<OutType>() + out_offset,
-            out.data_size());
+  encoder.launch_kernel([&](cudaStream_t stream) {
+    dispatch_all_types(in.dtype(), [&](auto in_type_tag) {
+      dispatch_all_types(out.dtype(), [&](auto out_type_tag) {
+        dispatch_bool(out.data_size() > INT32_MAX, [&](auto large) {
+          using InType = cuda_type_t<MLX_GET_TYPE(in_type_tag)>;
+          using OutType = cuda_type_t<MLX_GET_TYPE(out_type_tag)>;
+          using IdxT = std::conditional_t<large(), int64_t, uint32_t>;
+          auto kernel = cu::copy_s<InType, OutType, IdxT>;
+          if (ctype == CopyType::Vector) {
+            kernel = cu::copy_v<InType, OutType, IdxT>;
+          }
+          auto [num_blocks, block_dims] = get_launch_args(
+              kernel, out.data_size(), out.shape(), out.strides(), large());
+          kernel<<<num_blocks, block_dims, 0, stream>>>(
+              in.data<InType>() + in_offset,
+              out.data<OutType>() + out_offset,
+              out.data_size());
+        });
      });
    });
  });
--- a/mlx/backend/cuda/copy/copy_general.cu
+++ b/mlx/backend/cuda/copy/copy_general.cu
@@ -55,54 +55,50 @@ void copy_general(
    const Shape& shape,
    const Strides& strides_in,
    const Strides& strides_out) {
-  dispatch_all_types(in.dtype(), [&](auto in_type_tag) {
-    dispatch_all_types(out.dtype(), [&](auto out_type_tag) {
-      dispatch_bool(
-          in.data_size() > INT32_MAX || out.data_size() > INT32_MAX,
-          [&](auto large) {
-            using InType = cuda_type_t<MLX_GET_TYPE(in_type_tag)>;
-            using OutType = cuda_type_t<MLX_GET_TYPE(out_type_tag)>;
-            using IdxT = std::conditional_t<large(), int64_t, int32_t>;
-            const InType* in_ptr = in.data<InType>() + offset_in;
-            OutType* out_ptr = out.data<OutType>() + offset_out;
-            int ndim = shape.size();
-            size_t data_size = 1;
-            for (auto& s : shape)
-              data_size *= s;
-            if (ndim <= 3) {
-              dispatch_1_2_3(ndim, [&](auto ndim_constant) {
-                auto kernel =
-                    cu::copy_gg_nd<InType, OutType, IdxT, ndim_constant()>;
+  encoder.launch_kernel([&](cudaStream_t stream) {
+    dispatch_all_types(in.dtype(), [&](auto in_type_tag) {
+      dispatch_all_types(out.dtype(), [&](auto out_type_tag) {
+        dispatch_bool(
+            in.data_size() > INT32_MAX || out.data_size() > INT32_MAX,
+            [&](auto large) {
+              using InType = cuda_type_t<MLX_GET_TYPE(in_type_tag)>;
+              using OutType = cuda_type_t<MLX_GET_TYPE(out_type_tag)>;
+              using IdxT = std::conditional_t<large(), int64_t, int32_t>;
+              const InType* in_ptr = in.data<InType>() + offset_in;
+              OutType* out_ptr = out.data<OutType>() + offset_out;
+              int ndim = shape.size();
+              size_t data_size = 1;
+              for (auto& s : shape)
+                data_size *= s;
+              if (ndim <= 3) {
+                dispatch_1_2_3(ndim, [&](auto ndim_constant) {
+                  auto kernel =
+                      cu::copy_gg_nd<InType, OutType, IdxT, ndim_constant()>;
+                  auto [num_blocks, block_dims] = get_launch_args(
+                      kernel, data_size, shape, out.strides(), large());
+                  kernel<<<num_blocks, block_dims, 0, stream>>>(
+                      in_ptr,
+                      out_ptr,
+                      data_size,
+                      const_param<ndim_constant()>(shape),
+                      const_param<ndim_constant()>(strides_in),
+                      const_param<ndim_constant()>(strides_out));
+                });
+              } else { // ndim >= 4
+                auto kernel = cu::copy_gg<InType, OutType, IdxT>;
                auto [num_blocks, block_dims] = get_launch_args(
                    kernel, data_size, shape, out.strides(), large());
-                encoder.add_kernel_node(
-                    kernel,
-                    num_blocks,
-                    block_dims,
+                kernel<<<num_blocks, block_dims, 0, stream>>>(
                    in_ptr,
                    out_ptr,
                    data_size,
-                    const_param<ndim_constant()>(shape),
-                    const_param<ndim_constant()>(strides_in),
-                    const_param<ndim_constant()>(strides_out));
-              });
-            } else { // ndim >= 4
-              auto kernel = cu::copy_gg<InType, OutType, IdxT>;
-              auto [num_blocks, block_dims] = get_launch_args(
-                  kernel, data_size, shape, out.strides(), large());
-              encoder.add_kernel_node(
-                  kernel,
-                  num_blocks,
-                  block_dims,
-                  in_ptr,
-                  out_ptr,
-                  data_size,
-                  const_param(shape),
-                  const_param(strides_in),
-                  const_param(strides_out),
-                  ndim);
-            }
-          });
+                    const_param(shape),
+                    const_param(strides_in),
+                    const_param(strides_out),
+                    ndim);
+              }
+            });
+      });
    });
  });
 }
--- a/mlx/backend/cuda/copy/copy_general_dynamic.cu
+++ b/mlx/backend/cuda/copy/copy_general_dynamic.cu
@@ -61,55 +61,54 @@ void copy_general_dynamic(
    const Strides& strides_out,
    const array& dynamic_offset_in,
    const array& dynamic_offset_out) {
-  dispatch_all_types(in.dtype(), [&](auto in_type_tag) {
-    dispatch_all_types(out.dtype(), [&](auto out_type_tag) {
-      dispatch_bool(
-          in.data_size() > INT32_MAX || out.data_size() > INT32_MAX,
-          [&](auto large) {
-            using InType = cuda_type_t<MLX_GET_TYPE(in_type_tag)>;
-            using OutType = cuda_type_t<MLX_GET_TYPE(out_type_tag)>;
-            using IdxT = std::conditional_t<large(), int64_t, int32_t>;
-            const InType* in_ptr = in.data<InType>() + offset_in;
-            OutType* out_ptr = out.data<OutType>() + offset_out;
-            int ndim = shape.size();
-            if (ndim <= 3) {
-              dispatch_1_2_3(ndim, [&](auto dims_constant) {
-                auto kernel = cu::
-                    copy_gg_dynamic_nd<InType, OutType, IdxT, dims_constant()>;
+  encoder.launch_kernel([&](cudaStream_t stream) {
+    dispatch_all_types(in.dtype(), [&](auto in_type_tag) {
+      dispatch_all_types(out.dtype(), [&](auto out_type_tag) {
+        dispatch_bool(
+            in.data_size() > INT32_MAX || out.data_size() > INT32_MAX,
+            [&](auto large) {
+              using InType = cuda_type_t<MLX_GET_TYPE(in_type_tag)>;
+              using OutType = cuda_type_t<MLX_GET_TYPE(out_type_tag)>;
+              using IdxT = std::conditional_t<large(), int64_t, int32_t>;
+              const InType* in_ptr = in.data<InType>() + offset_in;
+              OutType* out_ptr = out.data<OutType>() + offset_out;
+              int ndim = shape.size();
+              if (ndim <= 3) {
+                dispatch_1_2_3(ndim, [&](auto dims_constant) {
+                  auto kernel = cu::copy_gg_dynamic_nd<
+                      InType,
+                      OutType,
+                      IdxT,
+                      dims_constant()>;
+                  auto [num_blocks, block_dims] =
+                      get_launch_args(kernel, out, large());
+                  kernel<<<num_blocks, block_dims, 0, stream>>>(
+                      in_ptr,
+                      out_ptr,
+                      out.size(),
+                      const_param<dims_constant()>(shape),
+                      const_param<dims_constant()>(strides_in),
+                      const_param<dims_constant()>(strides_out),
+                      dynamic_offset_in.data<int64_t>(),
+                      dynamic_offset_out.data<int64_t>());
+                });
+              } else { // ndim >= 4
+                auto kernel = cu::copy_gg_dynamic<InType, OutType, IdxT>;
                auto [num_blocks, block_dims] =
                    get_launch_args(kernel, out, large());
-                encoder.add_kernel_node(
-                    kernel,
-                    num_blocks,
-                    block_dims,
+                kernel<<<num_blocks, block_dims, 0, stream>>>(
                    in_ptr,
                    out_ptr,
                    out.size(),
-                    const_param<dims_constant()>(shape),
-                    const_param<dims_constant()>(strides_in),
-                    const_param<dims_constant()>(strides_out),
+                    const_param(shape),
+                    const_param(strides_in),
+                    const_param(strides_out),
+                    ndim,
                    dynamic_offset_in.data<int64_t>(),
                    dynamic_offset_out.data<int64_t>());
-              });
-            } else { // ndim >= 4
-              auto kernel = cu::copy_gg_dynamic<InType, OutType, IdxT>;
-              auto [num_blocks, block_dims] =
-                  get_launch_args(kernel, out, large());
-              encoder.add_kernel_node(
-                  kernel,
-                  num_blocks,
-                  block_dims,
-                  in_ptr,
-                  out_ptr,
-                  out.size(),
-                  const_param(shape),
-                  const_param(strides_in),
-                  const_param(strides_out),
-                  ndim,
-                  dynamic_offset_in.data<int64_t>(),
-                  dynamic_offset_out.data<int64_t>());
-            }
-          });
+              }
+            });
+      });
    });
  });
 }
--- a/mlx/backend/cuda/copy/copy_general_input.cu
+++ b/mlx/backend/cuda/copy/copy_general_input.cu
@@ -50,49 +50,45 @@ void copy_general_input(
    int64_t offset_out,
    const Shape& shape,
    const Strides& strides_in) {
-  dispatch_all_types(in.dtype(), [&](auto in_type_tag) {
-    dispatch_all_types(out.dtype(), [&](auto out_type_tag) {
-      dispatch_bool(
-          in.data_size() > INT32_MAX || out.data_size() > INT32_MAX,
-          [&](auto large) {
-            using InType = cuda_type_t<MLX_GET_TYPE(in_type_tag)>;
-            using OutType = cuda_type_t<MLX_GET_TYPE(out_type_tag)>;
-            using IdxT = std::conditional_t<large(), int64_t, int32_t>;
-            const InType* in_ptr = in.data<InType>() + offset_in;
-            OutType* out_ptr = out.data<OutType>() + offset_out;
-            int ndim = shape.size();
-            if (ndim <= 3) {
-              dispatch_1_2_3(ndim, [&](auto dims_constant) {
-                auto kernel =
-                    cu::copy_g_nd<InType, OutType, IdxT, dims_constant()>;
+  encoder.launch_kernel([&](cudaStream_t stream) {
+    dispatch_all_types(in.dtype(), [&](auto in_type_tag) {
+      dispatch_all_types(out.dtype(), [&](auto out_type_tag) {
+        dispatch_bool(
+            in.data_size() > INT32_MAX || out.data_size() > INT32_MAX,
+            [&](auto large) {
+              using InType = cuda_type_t<MLX_GET_TYPE(in_type_tag)>;
+              using OutType = cuda_type_t<MLX_GET_TYPE(out_type_tag)>;
+              using IdxT = std::conditional_t<large(), int64_t, int32_t>;
+              const InType* in_ptr = in.data<InType>() + offset_in;
+              OutType* out_ptr = out.data<OutType>() + offset_out;
+              int ndim = shape.size();
+              if (ndim <= 3) {
+                dispatch_1_2_3(ndim, [&](auto dims_constant) {
+                  auto kernel =
+                      cu::copy_g_nd<InType, OutType, IdxT, dims_constant()>;
+                  auto [num_blocks, block_dims] =
+                      get_launch_args(kernel, out, large());
+                  kernel<<<num_blocks, block_dims, 0, stream>>>(
+                      in_ptr,
+                      out_ptr,
+                      out.size(),
+                      const_param<dims_constant()>(shape),
+                      const_param<dims_constant()>(strides_in));
+                });
+              } else { // ndim >= 4
+                auto kernel = cu::copy_g<InType, OutType, IdxT>;
                auto [num_blocks, block_dims] =
                    get_launch_args(kernel, out, large());
-                encoder.add_kernel_node(
-                    kernel,
-                    num_blocks,
-                    block_dims,
+                kernel<<<num_blocks, block_dims, 0, stream>>>(
                    in_ptr,
                    out_ptr,
                    out.size(),
-                    const_param<dims_constant()>(shape),
-                    const_param<dims_constant()>(strides_in));
-              });
-            } else { // ndim >= 4
-              auto kernel = cu::copy_g<InType, OutType, IdxT>;
-              auto [num_blocks, block_dims] =
-                  get_launch_args(kernel, out, large());
-              encoder.add_kernel_node(
-                  kernel,
-                  num_blocks,
-                  block_dims,
-                  in_ptr,
-                  out_ptr,
-                  out.size(),
-                  const_param(shape),
-                  const_param(strides_in),
-                  ndim);
-            }
-          });
+                    const_param(shape),
+                    const_param(strides_in),
+                    ndim);
+              }
+            });
+      });
    });
  });
 }
--- a/mlx/backend/cuda/device.cpp
+++ b/mlx/backend/cuda/device.cpp
@@ -2,27 +2,37 @@

 #include "mlx/backend/cuda/device.h"
 #include "mlx/backend/cuda/worker.h"
-#include "mlx/utils.h"
+#include "mlx/backend/metal/metal.h"

 #include <fmt/format.h>
 #include <nvtx3/nvtx3.hpp>
 #include <future>
-#include <unordered_set>

 namespace mlx::core {

-// Can be tuned with MLX_MAX_OPS_PER_BUFFER
-// This should be less than 255
-constexpr int default_max_nodes_per_graph = 20;
+namespace cu {

-int cuda_graph_cache_size() {
-  static int cache_size = []() {
-    return env::get_var("MLX_CUDA_GRAPH_CACHE_SIZE", 100);
-  }();
-  return cache_size;
+DeviceStream::DeviceStream(Device& device) : device_(device), stream_(device) {}
+
+void DeviceStream::synchronize() {
+  CHECK_CUDA_ERROR(cudaStreamSynchronize(stream_)); // Surface async errors.
 }

-namespace cu {
+cudaStream_t DeviceStream::schedule_cuda_stream() {
+  // TODO: Return a stream that maximizes parallelism.
+  return stream_;
+}
+
+cudaStream_t DeviceStream::last_cuda_stream() {
+  return stream_;
+}
+
+CommandEncoder& DeviceStream::get_encoder() {
+  if (!encoder_) {
+    encoder_ = std::make_unique<CommandEncoder>(*this);
+  }
+  return *encoder_;
+}

 Device::Device(int device) : device_(device) {
  CHECK_CUDA_ERROR(cudaDeviceGetAttribute(
@@ -57,261 +67,49 @@ void Device::make_current() {
  }
 }

-CommandEncoder::CaptureContext::CaptureContext(CommandEncoder& enc) : enc(enc) {
-  CHECK_CUDA_ERROR(cudaGraphCreate(&graph, 0));
-  CHECK_CUDA_ERROR(
-      cudaStreamBeginCapture(enc.stream(), cudaStreamCaptureModeGlobal));
-}
-
-CommandEncoder::CaptureContext::~CaptureContext() {
-  CHECK_CUDA_ERROR(cudaStreamEndCapture(enc.stream(), &graph));
-  size_t num_nodes;
-  CHECK_CUDA_ERROR(cudaGraphGetNodes(graph, NULL, &num_nodes));
-  if (num_nodes == 1) {
-    cudaGraphNode_t captured_node;
-    CHECK_CUDA_ERROR(cudaGraphGetNodes(graph, &captured_node, &num_nodes));
-    CUDA_KERNEL_NODE_PARAMS params;
-    CHECK_CUDA_ERROR(cuGraphKernelNodeGetParams(captured_node, &params));
-    cudaGraphNode_t node;
-    CHECK_CUDA_ERROR(cuGraphAddKernelNode(&node, enc.graph_, NULL, 0, &params));
-    enc.insert_graph_dependencies(GraphNode{node, 'K'});
-  } else {
-    cudaGraphNode_t node;
-    CHECK_CUDA_ERROR(
-        cudaGraphAddChildGraphNode(&node, enc.graph_, NULL, 0, graph));
-    enc.insert_graph_dependencies(GraphNode{node, 'G'});
-  }
-  CHECK_CUDA_ERROR(cudaGraphDestroy(graph));
-}
-
-CommandEncoder::ConcurrentContext::ConcurrentContext(CommandEncoder& enc)
-    : enc(enc) {
-  enc.in_concurrent_ = true;
-}
-
-CommandEncoder::ConcurrentContext::~ConcurrentContext() {
-  enc.in_concurrent_ = false;
-
-  // Use an empty graph node for synchronization
-  CommandEncoder::GraphNode empty{NULL, 'E', std::to_string(enc.node_count_++)};
-  enc.empty_node_count_++;
-  CHECK_CUDA_ERROR(cudaGraphAddEmptyNode(&empty.node, enc.graph_, NULL, 0));
-
-  // Insert the concurrent -> empty node dependencies
-  for (auto& from : enc.concurrent_nodes_) {
-    enc.from_nodes_.push_back(from.node);
-    enc.to_nodes_.push_back(empty.node);
-    enc.graph_key_ += from.id;
-    enc.graph_key_ += from.node_type;
-    enc.graph_key_ += empty.id;
-    enc.graph_key_ += empty.node_type;
-  }
-
-  // Insert the input -> concurrent node dependencies without updating output
-  // nodes
-  auto outputs = std::move(enc.active_outputs_);
-  enc.insert_graph_dependencies(std::move(enc.concurrent_nodes_));
-
-  // Update output node to be the empty node
-  for (auto o : outputs) {
-    enc.node_map_.emplace(o, empty).first->second = empty;
-  }
-}
-
-void CommandEncoder::insert_graph_dependencies(GraphNode node) {
-  if (node.node_type == 'G') {
-    graph_node_count_++;
-  }
-  node.id = std::to_string(node_count_++);
-  if (in_concurrent_) {
-    concurrent_nodes_.push_back(std::move(node));
-  } else {
-    std::vector<GraphNode> nodes;
-    nodes.push_back(std::move(node));
-    insert_graph_dependencies(std::move(nodes));
-  }
-}
-
-void CommandEncoder::insert_graph_dependencies(std::vector<GraphNode> nodes) {
-  std::vector<GraphNode> deps;
-  {
-    // Dependencies must be added in the same order to produce a consistent
-    // topology
-    std::unordered_set<cudaGraphNode_t> set_deps;
-    for (auto d : active_deps_) {
-      if (auto it = node_map_.find(d); it != node_map_.end()) {
-        auto [_, inserted] = set_deps.insert(it->second.node);
-        if (inserted) {
-          deps.push_back(it->second);
-        }
-      }
-    }
-  }
-  active_deps_.clear();
-
-  for (auto o : active_outputs_) {
-    for (auto& node : nodes) {
-      node_map_.emplace(o, node).first->second = node;
-    }
-  }
-  active_outputs_.clear();
-
-  for (auto& from : deps) {
-    for (auto& to : nodes) {
-      from_nodes_.push_back(from.node);
-      to_nodes_.push_back(to.node);
-      graph_key_ += from.id;
-      graph_key_ += from.node_type;
-      graph_key_ += to.id;
-      graph_key_ += to.node_type;
-    }
-  }
-}
-
-CommandEncoder& Device::get_command_encoder(Stream s) {
-  auto it = encoders_.find(s.index);
-  if (it == encoders_.end()) {
-    it = encoders_.try_emplace(s.index, *this).first;
+DeviceStream& Device::get_stream(Stream s) {
+  auto it = streams_.find(s.index);
+  if (it == streams_.end()) {
+    it = streams_.try_emplace(s.index, *this).first;
  }
  return it->second;
 }

-CommandEncoder::CommandEncoder(Device& d) : stream_(d) {
-  CHECK_CUDA_ERROR(cudaGraphCreate(&graph_, 0));
-}
-
-void clear_graphs(std::unordered_map<std::string, cudaGraphExec_t>& graphs) {
-  for (auto& [_, graph_exec] : graphs) {
-    CHECK_CUDA_ERROR(cudaGraphExecDestroy(graph_exec));
-  }
-  graphs.clear();
-}
-
-CommandEncoder::~CommandEncoder() {
-  clear_graphs(graph_cache_);
-}
+CommandEncoder::CommandEncoder(DeviceStream& s)
+    : device_(s.device()), stream_(s) {}

 void CommandEncoder::add_completed_handler(std::function<void()> task) {
  worker_.add_task(std::move(task));
 }

-void CommandEncoder::set_input_array(const array& arr) {
-  auto id = reinterpret_cast<std::uintptr_t>(arr.buffer().ptr());
-  active_deps_.push_back(id);
-}
+void CommandEncoder::end_encoding() {
+  if (!temporaries_.empty()) {
+    add_completed_handler([temporaries = std::move(temporaries_)]() {});
+  }

-void CommandEncoder::set_output_array(const array& arr) {
-  auto id = reinterpret_cast<std::uintptr_t>(arr.buffer().ptr());
-  active_deps_.push_back(id);
-  active_outputs_.push_back(id);
-}
+  // There is no kernel running, run completion handlers immediately.
+  if (!has_gpu_work_) {
+    worker_.consume_in_this_thread();
+    return;
+  }
+  has_gpu_work_ = false;

-void CommandEncoder::maybe_commit() {
-  if (node_count_ >= env::max_ops_per_buffer(default_max_nodes_per_graph)) {
+  // Put completion handlers in a batch.
+  worker_.end_batch();
+
+  // Signaling kernel completion is expensive, delay until enough batches.
+  // TODO: This number is arbitrarily picked, profile for a better strategy.
+  if (worker_.uncommited_batches() > 8) {
    commit();
  }
 }

-void CommandEncoder::add_kernel_node(
-    void* func,
-    dim3 grid_dim,
-    dim3 block_dim,
-    void** params) {
-  cudaKernelNodeParams kernel_params = {0};
-  kernel_params.func = func;
-  kernel_params.gridDim = grid_dim;
-  kernel_params.blockDim = block_dim;
-  kernel_params.kernelParams = params;
-  cudaGraphNode_t node;
-  CHECK_CUDA_ERROR(
-      cudaGraphAddKernelNode(&node, graph_, NULL, 0, &kernel_params));
-  insert_graph_dependencies(GraphNode{node, 'K'});
-}
-
-void CommandEncoder::add_kernel_node(
-    CUfunction func,
-    dim3 grid_dim,
-    dim3 block_dim,
-    void** params) {
-  CUDA_KERNEL_NODE_PARAMS kernel_params = {0};
-  kernel_params.func = func;
-  kernel_params.gridDimX = grid_dim.x;
-  kernel_params.gridDimY = grid_dim.y;
-  kernel_params.gridDimZ = grid_dim.z;
-  kernel_params.blockDimX = block_dim.x;
-  kernel_params.blockDimY = block_dim.y;
-  kernel_params.blockDimZ = block_dim.z;
-  kernel_params.kernelParams = params;
-  CUgraphNode node;
-  CHECK_CUDA_ERROR(
-      cuGraphAddKernelNode(&node, graph_, NULL, 0, &kernel_params));
-  insert_graph_dependencies(GraphNode{node, 'K'});
-}
-
 void CommandEncoder::commit() {
-  if (!temporaries_.empty()) {
-    add_completed_handler([temporaries = std::move(temporaries_)]() {});
-  }
-  if (node_count_ > 0) {
-    if (!from_nodes_.empty()) {
-      CHECK_CUDA_ERROR(cudaGraphAddDependencies(
-          graph_, from_nodes_.data(), to_nodes_.data(), from_nodes_.size()));
-    }
-
-    graph_key_ += ".";
-    graph_key_ += std::to_string(node_count_);
-    graph_key_ += ".";
-    graph_key_ += std::to_string(graph_node_count_);
-    graph_key_ += ".";
-    graph_key_ += std::to_string(empty_node_count_);
-
-    cudaGraphExec_t& graph_exec = graph_cache_[graph_key_];
-
-    if (graph_exec != nullptr) {
-      cudaGraphExecUpdateResult update_result;
-#if CUDART_VERSION >= 12000
-      cudaGraphExecUpdateResultInfo info;
-      cudaGraphExecUpdate(graph_exec, graph_, &info);
-      update_result = info.result;
-#else
-      cudaGraphNode_t error_node;
-      cudaGraphExecUpdate(graph_exec, graph_, &error_node, &update_result);
-#endif // CUDART_VERSION >= 12000
-      if (update_result != cudaGraphExecUpdateSuccess) {
-        cudaGetLastError(); // reset error
-        CHECK_CUDA_ERROR(cudaGraphExecDestroy(graph_exec));
-        graph_exec = nullptr;
-      }
-    }
-    if (graph_exec == nullptr) {
-      CHECK_CUDA_ERROR(
-          cudaGraphInstantiate(&graph_exec, graph_, NULL, NULL, 0));
-    }
-    CHECK_CUDA_ERROR(cudaGraphLaunch(graph_exec, stream_));
-
-    // TODO smarter cache policy
-    if (graph_cache_.size() > cuda_graph_cache_size()) {
-      clear_graphs(graph_cache_);
-    }
-
-    // Reset state
-    node_count_ = 0;
-    graph_node_count_ = 0;
-    from_nodes_.clear();
-    to_nodes_.clear();
-    graph_key_.clear();
-    node_map_.clear();
-    CHECK_CUDA_ERROR(cudaGraphDestroy(graph_));
-    CHECK_CUDA_ERROR(cudaGraphCreate(&graph_, 0));
-  }
-
-  // Put completion handlers in a batch.
-  worker_.end_batch();
-  worker_.commit(stream_);
+  worker_.commit(stream_.last_cuda_stream());
 }

 void CommandEncoder::synchronize() {
-  cudaStreamSynchronize(stream_);
+  stream().synchronize();
  auto p = std::make_shared<std::promise<void>>();
  std::future<void> f = p->get_future();
  add_completed_handler([p = std::move(p)]() { p->set_value(); });
@@ -329,8 +127,12 @@ Device& device(mlx::core::Device device) {
  return it->second;
 }

+DeviceStream& get_stream(Stream s) {
+  return device(s.device).get_stream(s);
+}
+
 CommandEncoder& get_command_encoder(Stream s) {
-  return device(s.device).get_command_encoder(s);
+  return get_stream(s).get_encoder();
 }

 } // namespace cu
--- a/mlx/backend/cuda/device.h
+++ b/mlx/backend/cuda/device.h
@@ -7,108 +7,41 @@
 #include "mlx/stream.h"

 #include <cublasLt.h>
-#include <cuda.h>
 #include <thrust/execution_policy.h>

 #include <unordered_map>

 namespace mlx::core::cu {

-class CommandEncoder {
+class Device;
+class CommandEncoder;
+
+class DeviceStream {
 public:
-  struct CaptureContext {
-    CaptureContext(CommandEncoder& enc);
-    ~CaptureContext();
-    cudaGraph_t graph;
-    CommandEncoder& enc;
-  };
-  struct ConcurrentContext {
-    ConcurrentContext(CommandEncoder& enc);
-    ~ConcurrentContext();
-    CommandEncoder& enc;
-  };
+  explicit DeviceStream(Device& device);

-  explicit CommandEncoder(Device& d);
-  ~CommandEncoder();
+  DeviceStream(const DeviceStream&) = delete;
+  DeviceStream& operator=(const DeviceStream&) = delete;

-  CommandEncoder(const CommandEncoder&) = delete;
-  CommandEncoder& operator=(const CommandEncoder&) = delete;
-
-  CaptureContext capture_context() {
-    return CaptureContext{*this};
-  }
-  ConcurrentContext concurrent_context() {
-    return ConcurrentContext{*this};
-  }
-
-  void set_input_array(const array& arr);
-  void set_output_array(const array& arr);
-
-  template <typename F, typename... Params>
-  void
-  add_kernel_node(F* func, dim3 grid_dim, dim3 block_dim, Params&&... params) {
-    constexpr size_t num = sizeof...(Params);
-    void* ptrs[num];
-    size_t i = 0;
-    ([&](auto&& p) { ptrs[i++] = static_cast<void*>(&p); }(
-         std::forward<Params>(params)),
-     ...);
-    add_kernel_node((void*)func, grid_dim, block_dim, ptrs);
-  }
-
-  void add_kernel_node(
-      CUfunction func,
-      dim3 grid_dim,
-      dim3 block_dim,
-      void** params);
-
-  void
-  add_kernel_node(void* func, dim3 grid_dim, dim3 block_dim, void** params);
-
-  void add_temporary(const array& arr) {
-    temporaries_.push_back(arr.data_shared_ptr());
-  }
-
-  void add_completed_handler(std::function<void()> task);
-  void maybe_commit();
-  void commit();
-
-  CudaStream& stream() {
-    return stream_;
-  }
-
-  // Wait until kernels and completion handlers are finished
+  // Wait until kernels in the stream complete.
  void synchronize();

+  // Return a cuda stream for launching kernels.
+  cudaStream_t schedule_cuda_stream();
+
+  // Return the last cuda stream used.
+  cudaStream_t last_cuda_stream();
+
+  CommandEncoder& get_encoder();
+
+  Device& device() {
+    return device_;
+  }
+
 private:
-  struct GraphNode {
-    cudaGraphNode_t node;
-    // K = kernel
-    // E = empty
-    // G = subgraph
-    char node_type;
-    std::string id;
-  };
-
-  void insert_graph_dependencies(GraphNode node);
-  void insert_graph_dependencies(std::vector<GraphNode> nodes);
-
+  Device& device_;
  CudaStream stream_;
-  cudaGraph_t graph_;
-  Worker worker_;
-  char node_count_{0};
-  char graph_node_count_{0};
-  char empty_node_count_{0};
-  bool in_concurrent_{false};
-  std::vector<cudaGraphNode_t> from_nodes_;
-  std::vector<cudaGraphNode_t> to_nodes_;
-  std::string graph_key_;
-  std::vector<GraphNode> concurrent_nodes_;
-  std::vector<std::shared_ptr<array::Data>> temporaries_;
-  std::unordered_map<std::string, cudaGraphExec_t> graph_cache_;
-  std::vector<std::uintptr_t> active_deps_;
-  std::vector<std::uintptr_t> active_outputs_;
-  std::unordered_map<std::uintptr_t, GraphNode> node_map_;
+  std::unique_ptr<CommandEncoder> encoder_;
 };

 class Device {
@@ -122,7 +55,7 @@ class Device {
  // Make this device the current cuda device, required by some cuda calls.
  void make_current();

-  CommandEncoder& get_command_encoder(Stream s);
+  DeviceStream& get_stream(Stream s);

  int cuda_device() const {
    return device_;
@@ -142,10 +75,67 @@ class Device {
  int compute_capability_major_;
  int compute_capability_minor_;
  cublasLtHandle_t lt_;
-  std::unordered_map<int, CommandEncoder> encoders_;
+  std::unordered_map<int, DeviceStream> streams_;
+};
+
+class CommandEncoder {
+ public:
+  explicit CommandEncoder(DeviceStream& stream);
+
+  CommandEncoder(const CommandEncoder&) = delete;
+  CommandEncoder& operator=(const CommandEncoder&) = delete;
+
+  void set_input_array(const array& arr) {}
+  void set_output_array(const array& arr) {}
+
+  void add_temporary(const array& arr) {
+    temporaries_.push_back(arr.data_shared_ptr());
+  }
+
+  void add_completed_handler(std::function<void()> task);
+  void end_encoding();
+  void commit();
+
+  // Schedule a cuda stream for |fun| to launch kernels, and check error
+  // afterwards.
+  template <typename F>
+  void launch_kernel(F&& fun) {
+    launch_kernel(stream_.schedule_cuda_stream(), std::forward<F>(fun));
+  }
+
+  template <typename F>
+  void launch_kernel(cudaStream_t stream, F&& fun) {
+    device_.make_current();
+    fun(stream);
+    check_cuda_error("kernel launch", cudaGetLastError());
+    has_gpu_work_ = true;
+  }
+
+  Device& device() {
+    return device_;
+  }
+
+  DeviceStream& stream() {
+    return stream_;
+  }
+
+  bool has_gpu_work() const {
+    return has_gpu_work_;
+  }
+
+  // Wait until kernels and completion handlers are finished
+  void synchronize();
+
+ private:
+  Device& device_;
+  DeviceStream& stream_;
+  Worker worker_;
+  bool has_gpu_work_{false};
+  std::vector<std::shared_ptr<array::Data>> temporaries_;
 };

 Device& device(mlx::core::Device device);
+DeviceStream& get_stream(Stream s);
 CommandEncoder& get_command_encoder(Stream s);

 // Return an execution policy that does not sync for result.
--- a/mlx/backend/cuda/device/cast_op.cuh
+++ b/mlx/backend/cuda/device/cast_op.cuh
@@ -3,8 +3,6 @@
 #pragma once

 #include <cuComplex.h>
-#include <cuda_bf16.h>
-#include <cuda_fp16.h>
 #include <thrust/iterator/transform_iterator.h>

 namespace mlx::core::cu {
@@ -19,26 +17,6 @@ struct CastOp {
  }
 };

-// Castings between complex and boolean.
-// TODO: Should make a custom complex type.
-template <>
-struct CastOp<cuComplex, bool> {
-  static constexpr bool is_castable = true;
-
-  __device__ bool operator()(cuComplex x) {
-    return x.x != 0 && x.y != 0;
-  }
-};
-
-template <>
-struct CastOp<bool, cuComplex> {
-  static constexpr bool is_castable = true;
-
-  __device__ cuComplex operator()(bool x) {
-    return x ? make_cuFloatComplex(1, 1) : make_cuFloatComplex(0, 0);
-  }
-};
-
 // Converting a complex number to real number discards the imaginary part.
 template <typename DstT>
 struct CastOp<
@@ -67,7 +45,6 @@ struct CastOp<
  }
 };

-// Do nothing when no casting is needed.
 template <typename SrcT, typename DstT>
 struct CastOp<
    SrcT,
@@ -80,53 +57,9 @@ struct CastOp<
  }
 };

-// In CUDA 11 the half types do not define conversions between some types,
-// provide fallbacks here.
-#if CUDART_VERSION < 12000
-template <typename SrcT, typename DstT>
-struct CastOp<
-    SrcT,
-    DstT,
-    cuda::std::enable_if_t<
-        !cuda::std::is_convertible_v<SrcT, DstT> &&
-        !cuda::std::is_same_v<SrcT, cuComplex> &&
-        (cuda::std::is_same_v<DstT, __half> ||
-         cuda::std::is_same_v<DstT, __nv_bfloat16>)>> {
-  static constexpr bool is_castable = true;
-
-  __device__ DstT operator()(SrcT x) {
-    return DstT(static_cast<float>(x));
-  }
-};
-
-template <typename SrcT, typename DstT>
-struct CastOp<
-    SrcT,
-    DstT,
-    cuda::std::enable_if_t<
-        !cuda::std::is_convertible_v<SrcT, DstT> &&
-        !cuda::std::is_same_v<DstT, cuComplex> &&
-        !cuda::std::is_same_v<DstT, __half> &&
-        !cuda::std::is_same_v<DstT, __nv_bfloat16> &&
-        (cuda::std::is_same_v<SrcT, __half> ||
-         cuda::std::is_same_v<SrcT, __nv_bfloat16>)>> {
-  static constexpr bool is_castable = true;
-
-  __device__ DstT operator()(SrcT x) {
-    return DstT(static_cast<float>(x));
-  }
-};
-#endif // CUDART_VERSION < 12000
-
-// Helper to deduce the SrcT.
-template <typename DstT, typename SrcT>
-inline __host__ __device__ auto cast_to(SrcT x) {
-  return CastOp<SrcT, DstT>{}(x);
-}
-
 // Return an iterator that cast the value to DstT using CastOp.
 template <typename DstT, typename Iterator>
-inline __host__ __device__ auto make_cast_iterator(Iterator it) {
+__host__ __device__ auto make_cast_iterator(Iterator it) {
  using SrcT = typename cuda::std::iterator_traits<Iterator>::value_type;
  if constexpr (std::is_same_v<SrcT, DstT>) {
    return it;
--- a/mlx/backend/cuda/device/utils.cuh
+++ b/mlx/backend/cuda/device/utils.cuh
@@ -28,27 +28,6 @@ namespace mlx::core::cu {
 using Shape = cuda::std::array<int32_t, MAX_NDIM>;
 using Strides = cuda::std::array<int64_t, MAX_NDIM>;

-// Vectorized load/store.
-template <typename T, int N>
-struct alignas(sizeof(T) * N) AlignedVector {
-  T val[N];
-};
-
-template <int N, typename T>
-inline __device__ AlignedVector<T, N> load_vector(
-    const T* ptr,
-    uint32_t offset) {
-  auto* from = reinterpret_cast<const AlignedVector<T, N>*>(ptr);
-  return from[offset];
-}
-
-template <int N, typename T>
-inline __device__ void
-store_vector(T* ptr, uint32_t offset, const AlignedVector<T, N>& vec) {
-  auto* to = reinterpret_cast<AlignedVector<T, N>*>(ptr);
-  to[offset] = vec;
-}
-
 ///////////////////////////////////////////////////////////////////////////////
 // Type limits utils
 ///////////////////////////////////////////////////////////////////////////////
@@ -99,20 +78,20 @@ struct Limits<
    return cuda::std::numeric_limits<T>::infinity();
  }
  static constexpr __host__ __device__ T min() {
-#if CUDART_VERSION < 12000 && __CUDA_ARCH__ < 800
-    return -cuda::std::numeric_limits<float>::infinity();
-#else
+#if defined(__CUDA_ARCH__) || CUDART_VERSION >= 12000
    return -cuda::std::numeric_limits<T>::infinity();
+#else
+    return -cuda::std::numeric_limits<float>::infinity();
 #endif
  }
  static constexpr __host__ __device__ T finite_max() {
    return cuda::std::numeric_limits<T>::max();
  }
  static constexpr __host__ __device__ T finite_min() {
-#if CUDART_VERSION < 12000 && __CUDA_ARCH__ < 800
-    return cuda::std::numeric_limits<float>::lowest();
-#else
+#if defined(__CUDA_ARCH__) || CUDART_VERSION >= 12000
    return cuda::std::numeric_limits<T>::lowest();
+#else
+    return cuda::std::numeric_limits<float>::lowest();
 #endif
  }
 };
--- a/mlx/backend/cuda/eval.cpp
+++ b/mlx/backend/cuda/eval.cpp
@@ -37,20 +37,22 @@ void eval(array& arr) {
  }

  auto& encoder = cu::get_command_encoder(arr.primitive().stream());
-  // Keep used buffers alive until kernel finishes running.
-  std::unordered_set<std::shared_ptr<array::Data>> buffers;
-  for (auto& in : arr.inputs()) {
-    buffers.insert(in.data_shared_ptr());
+  if (encoder.has_gpu_work()) {
+    // Keep used buffers alive until kernel finishes running.
+    std::unordered_set<std::shared_ptr<array::Data>> buffers;
+    for (auto& in : arr.inputs()) {
+      buffers.insert(in.data_shared_ptr());
+    }
+    for (auto& s : arr.siblings()) {
+      buffers.insert(s.data_shared_ptr());
+    }
+    // Remove the output if it was donated to by an input.
+    if (auto it = buffers.find(arr.data_shared_ptr()); it != buffers.end()) {
+      buffers.erase(it);
+    }
+    encoder.add_completed_handler([buffers = std::move(buffers)]() {});
  }
-  for (auto& s : arr.siblings()) {
-    buffers.insert(s.data_shared_ptr());
-  }
-  // Remove the output if it was donated to by an input.
-  if (auto it = buffers.find(arr.data_shared_ptr()); it != buffers.end()) {
-    buffers.erase(it);
-  }
-  encoder.add_completed_handler([buffers = std::move(buffers)]() {});
-  encoder.maybe_commit();
+  encoder.end_encoding();
 }

 void finalize(Stream s) {
--- a/mlx/backend/cuda/event.cu
+++ b/mlx/backend/cuda/event.cu
@@ -61,9 +61,7 @@ void CudaEvent::wait(Stream s) {
  if (s.device == mlx::core::Device::cpu) {
    scheduler::enqueue(s, [*this]() mutable { wait(); });
  } else {
-    auto& enc = cu::get_command_encoder(s);
-    enc.commit();
-    wait(enc.stream());
+    wait(cu::get_stream(s).last_cuda_stream());
  }
 }

@@ -76,9 +74,7 @@ void CudaEvent::record(Stream s) {
  if (s.device == mlx::core::Device::cpu) {
    throw std::runtime_error("CudaEvent can not wait on cpu stream.");
  } else {
-    auto& enc = cu::get_command_encoder(s);
-    enc.commit();
-    record(enc.stream());
+    record(cu::get_stream(s).last_cuda_stream());
  }
 }

@@ -140,9 +136,11 @@ void SharedEvent::wait(Stream s, uint64_t value) {
    scheduler::enqueue(s, [*this, value]() mutable { wait(value); });
  } else {
    auto& encoder = get_command_encoder(s);
-    encoder.commit();
-    wait(encoder.stream(), value);
+    encoder.launch_kernel(
+        encoder.stream().last_cuda_stream(),
+        [this, value](cudaStream_t stream) { wait(stream, value); });
    encoder.add_completed_handler([ac = ac_]() {});
+    encoder.end_encoding();
  }
 }

@@ -164,9 +162,11 @@ void SharedEvent::signal(Stream s, uint64_t value) {
    scheduler::enqueue(s, [*this, value]() mutable { signal(stream, value); });
  } else {
    auto& encoder = get_command_encoder(s);
-    encoder.commit();
-    signal(encoder.stream(), value);
+    encoder.launch_kernel(
+        encoder.stream().last_cuda_stream(),
+        [this, value](cudaStream_t stream) { signal(stream, value); });
    encoder.add_completed_handler([ac = ac_]() {});
+    encoder.end_encoding();
  }
 }

--- a/mlx/backend/cuda/indexing.cpp
+++ b/mlx/backend/cuda/indexing.cpp
@@ -3,16 +3,13 @@
 #include "mlx/backend/common/compiled.h"
 #include "mlx/backend/cuda/device.h"
 #include "mlx/backend/cuda/jit_module.h"
-#include "mlx/backend/cuda/kernel_utils.cuh"
 #include "mlx/backend/gpu/copy.h"
 #include "mlx/dtype_utils.h"
 #include "mlx/primitives.h"

 #include "cuda_jit_sources.h"

-#include <cuda.h>
 #include <fmt/format.h>
-#include <nvrtc.h>
 #include <nvtx3/nvtx3.hpp>

 #include <cassert>
@@ -25,7 +22,7 @@ namespace {
 constexpr const char* g_scatter_ops[] = {"Max", "Min", "Sum", "Prod", "Assign"};

 void append_indices_arg(
-    cu::KernelArgs& args,
+    cu::JitModule& mod,
    const std::vector<array>& inputs,
    int nidx,
    int idx_ndim) {
@@ -33,7 +30,7 @@ void append_indices_arg(
  for (int i = 0; i < nidx; ++i) {
    indices[i] = inputs[i + 1].data<void>();
  }
-  args.append(std::move(indices));
+  mod.append_arg(std::move(indices));
  std::vector<int32_t> indices_shape(nidx * idx_ndim);
  for (int i = 0; i < nidx; ++i) {
    std::copy_n(
@@ -41,7 +38,7 @@ void append_indices_arg(
        idx_ndim,
        indices_shape.data() + i * idx_ndim);
  }
-  args.append(std::move(indices_shape));
+  mod.append_arg(std::move(indices_shape));
  std::vector<int64_t> indices_strides(nidx * idx_ndim);
  for (int i = 0; i < nidx; ++i) {
    std::copy_n(
@@ -49,7 +46,7 @@ void append_indices_arg(
        idx_ndim,
        indices_strides.data() + i * idx_ndim);
  }
-  args.append(std::move(indices_strides));
+  mod.append_arg(std::move(indices_strides));
 }

 } // namespace
@@ -97,21 +94,20 @@ void Gather::eval_gpu(const std::vector<array>& inputs, array& out) {
    return std::make_pair(jit_source_gather, std::move(kernel_names));
  });

-  cu::KernelArgs args;
-  args.append(src);
-  args.append(out);
+  mod.append_arg(src);
+  mod.append_arg(out);
  if (large) {
-    args.append<int64_t>(out.size());
+    mod.append_arg<int64_t>(out.size());
  } else {
-    args.append<int32_t>(out.size());
+    mod.append_arg<int32_t>(out.size());
  }
-  args.append_ndim(src.shape());
-  args.append_ndim(src.strides());
-  args.append<int32_t>(src.ndim());
-  args.append_ndim(slice_sizes_);
-  args.append(slice_size);
-  args.append(axes_);
-  append_indices_arg(args, inputs, nidx, idx_ndim);
+  mod.append_ndim_arg(src.shape());
+  mod.append_ndim_arg(src.strides());
+  mod.append_arg<int32_t>(src.ndim());
+  mod.append_ndim_arg(slice_sizes_);
+  mod.append_arg(slice_size);
+  mod.append_arg(axes_);
+  append_indices_arg(mod, inputs, nidx, idx_ndim);

  std::string kernel_name = fmt::format(
      "mlx::core::cu::gather<{}, {}, {}, {}, {}>",
@@ -126,10 +122,9 @@ void Gather::eval_gpu(const std::vector<array>& inputs, array& out) {
    encoder.set_input_array(in);
  }
  encoder.set_output_array(out);
-
-  auto kernel = mod.get_kernel(kernel_name);
-  auto [num_blocks, block_dims] = get_launch_args(kernel, out, large);
-  encoder.add_kernel_node(kernel, num_blocks, block_dims, args.args());
+  encoder.launch_kernel([&](cudaStream_t stream) {
+    mod.launch_kernel(stream, kernel_name, out, large);
+  });
 }

 void Scatter::eval_gpu(const std::vector<array>& inputs, array& out) {
@@ -192,27 +187,26 @@ void Scatter::eval_gpu(const std::vector<array>& inputs, array& out) {
    return std::make_pair(jit_source_scatter, std::move(kernel_names));
  });

-  cu::KernelArgs args;
-  args.append(upd);
-  args.append(out);
+  mod.append_arg(upd);
+  mod.append_arg(out);
  if (large) {
-    args.append<int64_t>(upd.size());
+    mod.append_arg<int64_t>(upd.size());
  } else {
-    args.append<int32_t>(upd.size());
+    mod.append_arg<int32_t>(upd.size());
  }
-  args.append_ndim(upd.shape());
-  args.append_ndim(upd.strides());
-  args.append<int32_t>(upd.ndim());
+  mod.append_ndim_arg(upd.shape());
+  mod.append_ndim_arg(upd.strides());
+  mod.append_arg<int32_t>(upd.ndim());
  if (large) {
-    args.append<int64_t>(upd_post_idx_size);
+    mod.append_arg<int64_t>(upd_post_idx_size);
  } else {
-    args.append<int32_t>(upd_post_idx_size);
+    mod.append_arg<int32_t>(upd_post_idx_size);
  }
-  args.append_ndim(out.shape());
-  args.append_ndim(out.strides());
-  args.append<int32_t>(out.ndim());
-  args.append(axes_);
-  append_indices_arg(args, inputs, nidx, idx_ndim);
+  mod.append_ndim_arg(out.shape());
+  mod.append_ndim_arg(out.strides());
+  mod.append_arg<int32_t>(out.ndim());
+  mod.append_arg(axes_);
+  append_indices_arg(mod, inputs, nidx, idx_ndim);

  std::string kernel_name = fmt::format(
      "mlx::core::cu::scatter<{}, {}, mlx::core::cu::Scatter{}, {}, {}, {}>",
@@ -228,9 +222,9 @@ void Scatter::eval_gpu(const std::vector<array>& inputs, array& out) {
    encoder.set_input_array(in);
  }
  encoder.set_output_array(out);
-  auto kernel = mod.get_kernel(kernel_name);
-  auto [num_blocks, block_dims] = get_launch_args(kernel, upd, large);
-  encoder.add_kernel_node(kernel, num_blocks, block_dims, args.args());
+  encoder.launch_kernel([&](cudaStream_t stream) {
+    mod.launch_kernel(stream, kernel_name, upd, large);
+  });
 }

 void GatherAxis::eval_gpu(const std::vector<array>& inputs, array& out) {
@@ -281,26 +275,25 @@ void GatherAxis::eval_gpu(const std::vector<array>& inputs, array& out) {
  }
  size_t idx_size_axis = idx.shape(axis_);

-  cu::KernelArgs args;
-  args.append(src);
-  args.append(idx);
-  args.append(out);
+  mod.append_arg(src);
+  mod.append_arg(idx);
+  mod.append_arg(out);
  if (large) {
-    args.append<int64_t>(idx_size_pre);
-    args.append<int64_t>(idx_size_axis);
-    args.append<int64_t>(idx_size_post);
+    mod.append_arg<int64_t>(idx_size_pre);
+    mod.append_arg<int64_t>(idx_size_axis);
+    mod.append_arg<int64_t>(idx_size_post);
  } else {
-    args.append<int32_t>(idx_size_pre);
-    args.append<int32_t>(idx_size_axis);
-    args.append<int32_t>(idx_size_post);
+    mod.append_arg<int32_t>(idx_size_pre);
+    mod.append_arg<int32_t>(idx_size_axis);
+    mod.append_arg<int32_t>(idx_size_post);
  }
-  args.append(remove_index(idx.shape(), axis_));
-  args.append(remove_index(src.strides(), axis_));
-  args.append(remove_index(idx.strides(), axis_));
-  args.append<int32_t>(axis_);
-  args.append(src.shape(axis_));
-  args.append(src.strides(axis_));
-  args.append(idx.strides(axis_));
+  mod.append_arg(remove_index(idx.shape(), axis_));
+  mod.append_arg(remove_index(src.strides(), axis_));
+  mod.append_arg(remove_index(idx.strides(), axis_));
+  mod.append_arg<int32_t>(axis_);
+  mod.append_arg(src.shape(axis_));
+  mod.append_arg(src.strides(axis_));
+  mod.append_arg(idx.strides(axis_));

  std::string kernel_name = fmt::format(
      "mlx::core::cu::gather_axis<{}, {}, {}, {}, {}, {}>",
@@ -316,9 +309,9 @@ void GatherAxis::eval_gpu(const std::vector<array>& inputs, array& out) {
    encoder.set_input_array(in);
  }
  encoder.set_output_array(out);
-  auto kernel = mod.get_kernel(kernel_name);
-  auto [num_blocks, block_dims] = get_launch_args(kernel, idx, large);
-  encoder.add_kernel_node(kernel, num_blocks, block_dims, args.args());
+  encoder.launch_kernel([&](cudaStream_t stream) {
+    mod.launch_kernel(stream, kernel_name, idx, large);
+  });
 }

 void ScatterAxis::eval_gpu(const std::vector<array>& inputs, array& out) {
@@ -384,26 +377,25 @@ void ScatterAxis::eval_gpu(const std::vector<array>& inputs, array& out) {
  }
  size_t idx_size_axis = idx.shape(axis_);

-  cu::KernelArgs args;
-  args.append(upd);
-  args.append(idx);
-  args.append(out);
+  mod.append_arg(upd);
+  mod.append_arg(idx);
+  mod.append_arg(out);
  if (large) {
-    args.append<int64_t>(idx_size_pre);
-    args.append<int64_t>(idx_size_axis);
-    args.append<int64_t>(idx_size_post);
+    mod.append_arg<int64_t>(idx_size_pre);
+    mod.append_arg<int64_t>(idx_size_axis);
+    mod.append_arg<int64_t>(idx_size_post);
  } else {
-    args.append<int32_t>(idx_size_pre);
-    args.append<int32_t>(idx_size_axis);
-    args.append<int32_t>(idx_size_post);
+    mod.append_arg<int32_t>(idx_size_pre);
+    mod.append_arg<int32_t>(idx_size_axis);
+    mod.append_arg<int32_t>(idx_size_post);
  }
-  args.append(remove_index(idx.shape(), axis_));
-  args.append(remove_index(upd.strides(), axis_));
-  args.append(remove_index(idx.strides(), axis_));
-  args.append<int32_t>(axis_);
-  args.append(out.shape(axis_));
-  args.append(upd.strides(axis_));
-  args.append(idx.strides(axis_));
+  mod.append_arg(remove_index(idx.shape(), axis_));
+  mod.append_arg(remove_index(upd.strides(), axis_));
+  mod.append_arg(remove_index(idx.strides(), axis_));
+  mod.append_arg<int32_t>(axis_);
+  mod.append_arg(out.shape(axis_));
+  mod.append_arg(upd.strides(axis_));
+  mod.append_arg(idx.strides(axis_));

  std::string kernel_name = fmt::format(
      "mlx::core::cu::scatter_axis<{}, {}, mlx::core::cu::Scatter{}, {}, {}, {}, {}>",
@@ -420,9 +412,9 @@ void ScatterAxis::eval_gpu(const std::vector<array>& inputs, array& out) {
    encoder.set_input_array(in);
  }
  encoder.set_output_array(out);
-  auto kernel = mod.get_kernel(kernel_name);
-  auto [num_blocks, block_dims] = get_launch_args(kernel, idx, large);
-  encoder.add_kernel_node(kernel, num_blocks, block_dims, args.args());
+  encoder.launch_kernel([&](cudaStream_t stream) {
+    mod.launch_kernel(stream, kernel_name, idx, large);
+  });
 }

 } // namespace mlx::core
--- a/mlx/backend/cuda/jit_module.cpp
+++ b/mlx/backend/cuda/jit_module.cpp
@@ -26,6 +26,22 @@ void check_nvrtc_error(const char* name, nvrtcResult err) {
   }
 }

+#define CHECK_CU_ERROR(cmd) check_cu_error(#cmd, (cmd))
+
+// Throw a std::runtime_error describing |err| if the driver API call |name|
+// did not return CUDA_SUCCESS.
+void check_cu_error(const char* name, CUresult err) {
+  if (err != CUDA_SUCCESS) {
+    // cuGetErrorString() sets the pointer to NULL for unrecognized error
+    // codes, so the fallback has to be applied after the call, not before.
+    const char* err_str = nullptr;
+    if (cuGetErrorString(err, &err_str) != CUDA_SUCCESS || !err_str) {
+      err_str = "Unknown error";
+    }
+    throw std::runtime_error(fmt::format("{} failed: {}", name, err_str));
+  }
+}
+
 // Return the location of the CUDA toolkit.
 const std::string& cuda_home() {
   static std::string home = []() -> std::string {
@@ -270,13 +286,74 @@ JitModule::JitModule(
   // Load kernels.
   for (const auto& [name, mangled] : ptx_kernels) {
     CUfunction kernel;
-    CHECK_CUDA_ERROR(cuModuleGetFunction(&kernel, module_, mangled.c_str()));
+    CHECK_CU_ERROR(cuModuleGetFunction(&kernel, module_, mangled.c_str()));
     kernels_[name] = kernel;
   }
 }

 JitModule::~JitModule() {
-  CHECK_CUDA_ERROR(cuModuleUnload(module_));
+  CHECK_CU_ERROR(cuModuleUnload(module_));
+}
+
+// Launch |kernel_name| so that each thread works on |work_per_thread| elements
+// of |arr|. The block size is the one suggested by the occupancy API, capped
+// at the number of threads needed. When |large| is set the grid is derived
+// from get_2d_grid_dims_common(), otherwise it is one-dimensional.
+void JitModule::launch_kernel(
+    CUstream stream,
+    const std::string& kernel_name,
+    const array& arr,
+    bool large,
+    int work_per_thread) {
+  CUfunction kernel = get_kernel(kernel_name);
+  size_t nthreads = cuda::ceil_div(arr.size(), work_per_thread);
+  if (nthreads == 0) {
+    // Nothing to launch. |block_dim| would be clamped to 0 and used as a
+    // divisor below, so bail out and drop the queued arguments instead.
+    args_.clear();
+    storage_.clear();
+    return;
+  }
+  int _, block_dim;
+  CHECK_CU_ERROR(
+      cuOccupancyMaxPotentialBlockSize(&_, &block_dim, kernel, 0, 0, 0));
+  if (block_dim > nthreads) {
+    block_dim = nthreads;
+  }
+  Dims num_blocks{1, 1, 1};
+  if (large) {
+    num_blocks =
+        get_2d_grid_dims_common(arr.shape(), arr.strides(), work_per_thread);
+    std::get<0>(num_blocks) =
+        (std::get<0>(num_blocks) + block_dim - 1) / block_dim;
+  } else {
+    std::get<0>(num_blocks) = (nthreads + block_dim - 1) / block_dim;
+  }
+  launch_kernel(stream, kernel, num_blocks, Dims{block_dim, 1, 1});
+}
+
+// Launch |kernel| on |stream| with the given grid and block dimensions,
+// passing the arguments accumulated in |args_|. The accumulated arguments are
+// dropped once the launch has been issued successfully.
+void JitModule::launch_kernel(
+    CUstream stream,
+    CUfunction kernel,
+    Dims num_blocks,
+    Dims block_dims) {
+  CHECK_CU_ERROR(cuLaunchKernel(
+      kernel,
+      std::get<0>(num_blocks),
+      std::get<1>(num_blocks),
+      std::get<2>(num_blocks),
+      std::get<0>(block_dims),
+      std::get<1>(block_dims),
+      std::get<2>(block_dims),
+      0,
+      stream,
+      args_.data(),
+      nullptr));
+  args_.clear();
+  storage_.clear();
 }

 CUfunction JitModule::get_kernel(const std::string& kernel_name) {
@@ -288,6 +365,11 @@ CUfunction JitModule::get_kernel(const std::string& kernel_name) {
   return it->second;
 }

+// Append |v| to the list of argument pointers handed to cuLaunchKernel.
+void JitModule::append_ptr_arg(const void* v) {
+  args_.push_back(const_cast<void*>(v));
+}
+
 JitModule& get_jit_module(
     const mlx::core::Device& device,
     const std::string& name,
--- a/mlx/backend/cuda/jit_module.h
+++ b/mlx/backend/cuda/jit_module.h
@@ -4,7 +4,6 @@

 #include "mlx/array.h"
 #include "mlx/backend/common/utils.h"
-#include "mlx/backend/cuda/device.h"
 #include "mlx/backend/cuda/device/config.h"

 #include <deque>
@@ -24,48 +23,72 @@ using KernelBuilderResult = std::pair<
    /* kernel names */ std::vector<std::string>>;
 using KernelBuilder = std::function<KernelBuilderResult()>;

-struct KernelArgs {
-  void** args() {
-    return args_.data();
-  }
+class JitModule {
+ public:
+  JitModule(
+      Device& device,
+      const std::string& module_name,
+      const KernelBuilder& builder);
+  ~JitModule();

-  void append(const array& a) {
-    append(reinterpret_cast<CUdeviceptr>(a.data<void>()));
+  JitModule(const JitModule&) = delete;
+  JitModule& operator=(const JitModule&) = delete;
+
+  void append_arg(const array& a) {
+    append_arg(reinterpret_cast<CUdeviceptr>(a.data<void>()));
  }

  template <typename T>
-  void append(T val) {
+  void append_arg(T val) {
    storage_.emplace_back(val);
-    append_ptr(&storage_.back());
+    append_ptr_arg(&storage_.back());
  }

  template <typename T>
-  void append(std::vector<T> vec) {
+  void append_arg(std::vector<T> vec) {
    if (vec.empty()) {
      // The nullptr can not be used as arg, pass something not null.
-      append(std::monostate{});
+      append_arg(std::monostate{});
    } else {
-      append_ptr(vec.data());
+      append_ptr_arg(vec.data());
      storage_.emplace_back(std::move(vec));
    }
  }

  // Make sure the arg is copied to an array with size of NDIM.
  template <size_t NDIM = MAX_NDIM, typename T>
-  void append_ndim(std::vector<T> vec) {
+  void append_ndim_arg(const std::vector<T>& vec) {
    if (vec.size() > NDIM) {
      throw std::runtime_error(
          fmt::format("ndim can not be larger than {}.", NDIM));
    }
-    vec.resize(NDIM);
-    append(std::move(vec));
+    std::vector<T> copied(NDIM);
+    std::copy(vec.begin(), vec.end(), copied.data());
+    append_arg(std::move(copied));
  }

-  void append_ptr(const void* v) {
-    args_.push_back(const_cast<void*>(v));
-  }
+  // Launch the kernel named |kernel_name| so that each thread works on
+  // |work_per_thread| elements of |arr|.
+  void launch_kernel(
+      CUstream stream,
+      const std::string& kernel_name,
+      const array& arr,
+      bool large,
+      int work_per_thread = 1);
+
+  void launch_kernel(
+      CUstream stream,
+      CUfunction kernel,
+      Dims num_blocks,
+      Dims block_dims);
+
+  CUfunction get_kernel(const std::string& kernel_name);

 private:
+  void append_ptr_arg(const void* v);
+
+  CUmodule module_{nullptr};
+  std::unordered_map<std::string, CUfunction> kernels_;
  std::vector<void*> args_;

  // The cuLaunchKernel API requires passing pointers to arguments so store
@@ -82,23 +105,6 @@ struct KernelArgs {
  std::deque<Arg> storage_;
 };

-class JitModule {
- public:
-  JitModule(
-      Device& device,
-      const std::string& module_name,
-      const KernelBuilder& builder);
-  ~JitModule();
-
-  JitModule(const JitModule&) = delete;
-  JitModule& operator=(const JitModule&) = delete;
-  CUfunction get_kernel(const std::string& kernel_name);
-
- private:
-  CUmodule module_{nullptr};
-  std::unordered_map<std::string, CUfunction> kernels_;
-};
-
 JitModule& get_jit_module(
    const mlx::core::Device& device,
    const std::string& name,
--- a/mlx/backend/cuda/kernel_utils.cuh
+++ b/mlx/backend/cuda/kernel_utils.cuh
@@ -12,7 +12,6 @@
 #include "mlx/backend/cuda/device/utils.cuh"

 #include <cuComplex.h>
-#include <cuda.h>
 #include <cuda_bf16.h>
 #include <cuda_fp16.h>
 #include <fmt/format.h>
@@ -121,13 +120,7 @@ std::pair<dim3, dim3> get_grid_and_block(int dim0, int dim1, int dim2);
 template <typename T>
 inline uint max_occupancy_block_dim(T kernel) {
  int _, block_dim;
-  if constexpr (std::is_same_v<T, CUfunction>) {
-    CHECK_CUDA_ERROR(
-        cuOccupancyMaxPotentialBlockSize(&_, &block_dim, kernel, 0, 0, 0));
-  } else {
-    CHECK_CUDA_ERROR(
-        cudaOccupancyMaxPotentialBlockSize(&_, &block_dim, kernel));
-  }
+  CHECK_CUDA_ERROR(cudaOccupancyMaxPotentialBlockSize(&_, &block_dim, kernel));
  return block_dim;
 }

--- a/mlx/backend/cuda/layer_norm.cu
+++ b/mlx/backend/cuda/layer_norm.cu
@@ -258,23 +258,23 @@ void LayerNorm::eval_gpu(
  encoder.set_input_array(w);
  encoder.set_input_array(b);
  encoder.set_output_array(out);
-  dispatch_float_types(out.dtype(), "layernorm", [&](auto type_tag) {
-    constexpr uint32_t N_READS = 4;
-    dispatch_block_dim(cuda::ceil_div(axis_size, N_READS), [&](auto block_dim) {
-      using DataType = cuda_type_t<MLX_GET_TYPE(type_tag)>;
-      auto kernel = cu::layer_norm<DataType, block_dim(), N_READS>;
-      encoder.add_kernel_node(
-          kernel,
-          n_rows,
-          block_dim(),
-          x.data<DataType>(),
-          w.data<DataType>(),
-          b.data<DataType>(),
-          out.data<DataType>(),
-          eps_,
-          axis_size,
-          w_stride,
-          b_stride);
+  encoder.launch_kernel([&](cudaStream_t stream) {
+    dispatch_float_types(out.dtype(), "layernorm", [&](auto type_tag) {
+      constexpr uint32_t N_READS = 4;
+      dispatch_block_dim(
+          cuda::ceil_div(axis_size, N_READS), [&](auto block_dim) {
+            using DataType = cuda_type_t<MLX_GET_TYPE(type_tag)>;
+            auto kernel = cu::layer_norm<DataType, block_dim(), N_READS>;
+            kernel<<<n_rows, block_dim(), 0, stream>>>(
+                x.data<DataType>(),
+                w.data<DataType>(),
+                b.data<DataType>(),
+                out.data<DataType>(),
+                eps_,
+                axis_size,
+                w_stride,
+                b_stride);
+          });
    });
  });
 }
@@ -289,25 +289,21 @@ void LayerNormVJP::eval_gpu(
  // Ensure row contiguity. We could relax this step by checking that the array
  // is contiguous (no broadcasts or holes) and that the input strides are the
  // same as the cotangent strides but for now this is simpler.
-  auto check_input = [&s](const array& x, bool& copied) {
+  auto check_input = [&s](const array& x) -> std::pair<array, bool> {
    if (x.flags().row_contiguous) {
-      copied = false;
-      return x;
+      return {x, false};
    }
-    copied = true;
    array x_copy(x.shape(), x.dtype(), nullptr, {});
    copy_gpu(x, x_copy, CopyType::General, s);
-    return x_copy;
+    return {x_copy, true};
  };
  bool donate_x = inputs[0].is_donatable();
  bool donate_g = inputs[3].is_donatable();
-  bool copied;
-  auto x = check_input(inputs[0], copied);
+  auto [x, copied] = check_input(inputs[0]);
  donate_x |= copied;
  const array& w = inputs[1];
  const array& b = inputs[2];
-  bool g_copied;
-  auto g = check_input(inputs[3], g_copied);
+  auto [g, g_copied] = check_input(inputs[3]);
  donate_g |= g_copied;
  array& gx = outputs[0];
  array& gw = outputs[1];
@@ -338,10 +334,8 @@ void LayerNormVJP::eval_gpu(
  // gradient accumulators.
  array gw_temp =
      (has_w) ? array({n_rows, x.shape().back()}, gw.dtype(), nullptr, {}) : w;
-  bool g_in_gw = false;
  if (has_w) {
    if (!g_in_gx && donate_g) {
-      g_in_gw = true;
      gw_temp.copy_shared_buffer(g);
    } else {
      gw_temp.set_data(allocator::malloc(gw_temp.nbytes()));
@@ -349,47 +343,41 @@ void LayerNormVJP::eval_gpu(
    }
  }

-  // The gradient for b in case we had a b.
-  bool has_gb = (gb.ndim() == 1 && gb.size() == axis_size);
-  if (has_gb) {
+  // Finish with the gradient for b in case we had a b.
+  if (gb.ndim() == 1 && gb.size() == axis_size) {
    ReductionPlan plan(
        ReductionOpType::ContiguousStridedReduce, {n_rows}, {axis_size});
    col_reduce(encoder, g, gb, Reduce::ReduceType::Sum, {0}, plan);
  }

-  // Insert dependency if `g` was donated
-  if ((g_in_gx || g_in_gw) && has_gb) {
-    encoder.set_input_array(gb);
-  }
  encoder.set_input_array(x);
  encoder.set_input_array(w);
  encoder.set_input_array(g);
  encoder.set_output_array(gx);
  encoder.set_output_array(gw_temp);
-  dispatch_float_types(gx.dtype(), "layernorm_vjp", [&](auto type_tag) {
-    dispatch_bool(has_w, [&](auto has_w_constant) {
-      constexpr int N_READS = 4;
-      dispatch_block_dim(
-          cuda::ceil_div(axis_size, N_READS), [&](auto block_dim) {
-            using DataType = cuda_type_t<MLX_GET_TYPE(type_tag)>;
-            auto kernel = cu::layer_norm_vjp<
-                DataType,
-                has_w_constant.value,
-                block_dim(),
-                N_READS>;
-            encoder.add_kernel_node(
-                kernel,
-                n_rows,
-                block_dim(),
-                x.data<DataType>(),
-                w.data<DataType>(),
-                g.data<DataType>(),
-                gx.data<DataType>(),
-                gw_temp.data<DataType>(),
-                eps_,
-                axis_size,
-                w_stride);
-          });
+  encoder.launch_kernel([&, x = x, g = g](cudaStream_t stream) {
+    dispatch_float_types(gx.dtype(), "layernorm_vjp", [&](auto type_tag) {
+      dispatch_bool(has_w, [&](auto has_w_constant) {
+        constexpr int N_READS = 4;
+        dispatch_block_dim(
+            cuda::ceil_div(axis_size, N_READS), [&](auto block_dim) {
+              using DataType = cuda_type_t<MLX_GET_TYPE(type_tag)>;
+              auto kernel = cu::layer_norm_vjp<
+                  DataType,
+                  has_w_constant.value,
+                  block_dim(),
+                  N_READS>;
+              kernel<<<n_rows, block_dim(), 0, stream>>>(
+                  x.data<DataType>(),
+                  w.data<DataType>(),
+                  g.data<DataType>(),
+                  gx.data<DataType>(),
+                  gw_temp.data<DataType>(),
+                  eps_,
+                  axis_size,
+                  w_stride);
+            });
+      });
    });
  });

--- a/mlx/backend/cuda/logsumexp.cu
+++ b/mlx/backend/cuda/logsumexp.cu
@@ -143,18 +143,16 @@ void LogSumExp::eval_gpu(const std::vector<array>& inputs, array& out) {

  encoder.set_input_array(in);
  encoder.set_output_array(out);
-  dispatch_float_types(out.dtype(), "logsumexp", [&](auto type_tag) {
-    constexpr int N_READS = 4;
-    dispatch_block_dim(cuda::ceil_div(axis_size, N_READS), [&](auto block_dim) {
-      using DataType = cuda_type_t<MLX_GET_TYPE(type_tag)>;
-      auto kernel = cu::logsumexp<DataType, float, block_dim(), N_READS>;
-      encoder.add_kernel_node(
-          kernel,
-          n_rows,
-          block_dim(),
-          in.data<DataType>(),
-          out.data<DataType>(),
-          axis_size);
+  encoder.launch_kernel([&](cudaStream_t stream) {
+    dispatch_float_types(out.dtype(), "logsumexp", [&](auto type_tag) {
+      constexpr int N_READS = 4;
+      dispatch_block_dim(
+          cuda::ceil_div(axis_size, N_READS), [&](auto block_dim) {
+            using DataType = cuda_type_t<MLX_GET_TYPE(type_tag)>;
+            auto kernel = cu::logsumexp<DataType, float, block_dim(), N_READS>;
+            kernel<<<n_rows, block_dim(), 0, stream>>>(
+                in.data<DataType>(), out.data<DataType>(), axis_size);
+          });
    });
  });
 }
--- a/mlx/backend/cuda/matmul.cpp
+++ b/mlx/backend/cuda/matmul.cpp
@@ -42,8 +42,7 @@ class MatMul {
      int64_t ldb,
      int32_t batch_count,
      int64_t a_batch_stride,
-      int64_t b_batch_stride)
-      : handle_(device.lt_handle()) {
+      int64_t b_batch_stride) {
    heuristic_.state = CUBLAS_STATUS_NOT_INITIALIZED;

    auto scale_type = dtype_to_cuda_type(dtype);
@@ -148,7 +147,7 @@ class MatMul {
    if (heuristic_.state != CUBLAS_STATUS_SUCCESS) {
      int ret = 0;
      CHECK_CUBLAS_ERROR(cublasLtMatmulAlgoGetHeuristic(
-          handle_,
+          encoder.device().lt_handle(),
          matmul_desc_,
          a_desc_,
          b_desc_,
@@ -173,24 +172,25 @@ class MatMul {
      workspace_ptr = workspace.data<void>();
    }

-    auto capture = encoder.capture_context();
-    CHECK_CUBLAS_ERROR(cublasLtMatmul(
-        handle_,
-        matmul_desc_,
-        &alpha,
-        a,
-        a_desc_,
-        b,
-        b_desc_,
-        &beta,
-        c ? c : out,
-        c ? c_desc_ : out_desc_,
-        out,
-        out_desc_,
-        &heuristic_.algo,
-        workspace_ptr,
-        heuristic_.workspaceSize,
-        encoder.stream()));
+    encoder.launch_kernel([&](cudaStream_t stream) {
+      CHECK_CUBLAS_ERROR(cublasLtMatmul(
+          encoder.device().lt_handle(),
+          matmul_desc_,
+          &alpha,
+          a,
+          a_desc_,
+          b,
+          b_desc_,
+          &beta,
+          c ? c : out,
+          c ? c_desc_ : out_desc_,
+          out,
+          out_desc_,
+          &heuristic_.algo,
+          workspace_ptr,
+          heuristic_.workspaceSize,
+          stream));
+    });
  }

 private:
@@ -259,7 +259,6 @@ class MatMul {
    return desc;
  }

-  cublasLtHandle_t handle_{nullptr};
  cublasLtMatmulDesc_t matmul_desc_{nullptr};
  cublasLtMatmulPreference_t pref_{nullptr};
  cublasLtMatrixLayout_t a_desc_{nullptr};
@@ -274,7 +273,7 @@ class MatMul {
 namespace {

 std::tuple<bool, int64_t, array>
-check_transpose(cu::CommandEncoder& enc, const Stream& s, const array& arr) {
+check_transpose(std::vector<array>& copies, const Stream& s, const array& arr) {
  auto stx = arr.strides()[arr.ndim() - 2];
  auto sty = arr.strides()[arr.ndim() - 1];
  if (sty == 1 && stx == arr.shape(-1)) {
@@ -284,7 +283,7 @@ check_transpose(cu::CommandEncoder& enc, const Stream& s, const array& arr) {
  } else {
    array arr_copy(arr.shape(), arr.dtype(), nullptr, {});
    copy_gpu(arr, arr_copy, CopyType::General, s);
-    enc.add_temporary(arr_copy);
+    copies.push_back(arr_copy);
    return std::make_tuple(false, arr.shape(-1), arr_copy);
  }
 }
@@ -318,8 +317,13 @@ void Matmul::eval_gpu(const std::vector<array>& inputs, array& out) {

  // Keep a vector with copies to be cleared in the completed buffer to release
  // the arrays
-  auto [a_transposed, lda, a] = check_transpose(encoder, s, a_pre);
-  auto [b_transposed, ldb, b] = check_transpose(encoder, s, b_pre);
+  std::vector<array> copies;
+  auto [a_transposed, lda, a] = check_transpose(copies, s, a_pre);
+  auto [b_transposed, ldb, b] = check_transpose(copies, s, b_pre);
+
+  for (auto& temp : copies) {
+    encoder.add_temporary(temp);
+  }

  /////////////////////////////////////////////////////////////////////////////
  // Check and collapse batch dimensions
@@ -344,7 +348,7 @@ void Matmul::eval_gpu(const std::vector<array>& inputs, array& out) {
  // Invoke cublasLt

  cu::MatMul matmul(
-      cu::device(s.device),
+      encoder.device(),
      a.dtype(),
      a_transposed,
      M,
@@ -369,7 +373,6 @@ void Matmul::eval_gpu(const std::vector<array>& inputs, array& out) {

  ContiguousIterator a_it(batch_shape, a_batch_strides, batch_shape.size() - 1);
  ContiguousIterator b_it(batch_shape, b_batch_strides, batch_shape.size() - 1);
-  auto concurrent = encoder.concurrent_context();
  for (size_t i = 0; i < nbatch; ++i) {
    matmul.run(
        encoder,
@@ -402,9 +405,14 @@ void AddMM::eval_gpu(const std::vector<array>& inputs, array& out) {

  // Keep a vector with copies to be cleared in the completed buffer to release
  // the arrays
-  auto [a_transposed, lda, a] = check_transpose(encoder, s, a_pre);
-  auto [b_transposed, ldb, b] = check_transpose(encoder, s, b_pre);
-  auto [c_transposed, ldc, c] = check_transpose(encoder, s, c_pre);
+  std::vector<array> copies;
+  auto [a_transposed, lda, a] = check_transpose(copies, s, a_pre);
+  auto [b_transposed, ldb, b] = check_transpose(copies, s, b_pre);
+  auto [c_transposed, ldc, c] = check_transpose(copies, s, c_pre);
+
+  for (auto& temp : copies) {
+    encoder.add_temporary(temp);
+  }

  /////////////////////////////////////////////////////////////////////////////
  // Check and collapse batch dimensions
@@ -432,7 +440,7 @@ void AddMM::eval_gpu(const std::vector<array>& inputs, array& out) {
  // Invoke cublasLt

  cu::MatMul matmul(
-      cu::device(s.device),
+      encoder.device(),
      a.dtype(),
      a_transposed,
      M,
@@ -470,7 +478,6 @@ void AddMM::eval_gpu(const std::vector<array>& inputs, array& out) {
  ContiguousIterator a_it(batch_shape, a_batch_strides, batch_shape.size() - 1);
  ContiguousIterator b_it(batch_shape, b_batch_strides, batch_shape.size() - 1);
  ContiguousIterator c_it(batch_shape, c_batch_strides, batch_shape.size() - 1);
-  auto concurrent = encoder.concurrent_context();
  for (size_t i = 0; i < nbatch; ++i) {
    matmul.run(
        encoder,
--- a/mlx/backend/cuda/primitives.cu
+++ b/mlx/backend/cuda/primitives.cu
@@ -24,21 +24,23 @@ void Arange::eval_gpu(const std::vector<array>& inputs, array& out) {
  if (out.size() == 0) {
    return;
  }
-  auto& encoder = cu::get_command_encoder(stream());
+  auto& s = stream();
+  auto& encoder = cu::get_command_encoder(s);
  encoder.set_output_array(out);
-  auto capture = encoder.capture_context();
-  dispatch_int_float_types(out.dtype(), "Arange", [&](auto type_tag) {
-    using CTYPE = MLX_GET_TYPE(type_tag);
-    using OutType = cuda_type_t<CTYPE>;
-    CTYPE step =
-        static_cast<CTYPE>(start_ + step_) - static_cast<CTYPE>(start_);
-    thrust::transform(
-        cu::thrust_policy(encoder.stream()),
-        thrust::counting_iterator<uint32_t>(0),
-        thrust::counting_iterator<uint32_t>(out.data_size()),
-        thrust::device_pointer_cast(out.data<OutType>()),
-        cu::Arange<OutType>{
-            static_cast<OutType>(start_), static_cast<OutType>(step)});
+  encoder.launch_kernel([&, this](cudaStream_t stream) {
+    dispatch_int_float_types(out.dtype(), "Arange", [&](auto type_tag) {
+      using CTYPE = MLX_GET_TYPE(type_tag);
+      using OutType = cuda_type_t<CTYPE>;
+      CTYPE step =
+          static_cast<CTYPE>(start_ + step_) - static_cast<CTYPE>(start_);
+      thrust::transform(
+          cu::thrust_policy(stream),
+          thrust::counting_iterator<uint32_t>(0),
+          thrust::counting_iterator<uint32_t>(out.data_size()),
+          thrust::device_pointer_cast(out.data<OutType>()),
+          cu::Arange<OutType>{
+              static_cast<OutType>(start_), static_cast<OutType>(step)});
+    });
  });
 }

--- a/mlx/backend/cuda/random.cu
+++ b/mlx/backend/cuda/random.cu
@@ -156,39 +156,34 @@ void RandomBits::eval_gpu(const std::vector<array>& inputs, array& out) {
  auto& encoder = cu::get_command_encoder(s);
  encoder.set_input_array(keys);
  encoder.set_output_array(out);
-  dim3 grid_dims{num_keys, half_size + odd};
-  int64_t total = grid_dims.x * grid_dims.y;
-  int32_t threads_y = 1;
-  while ((total / threads_y) >= (1U << 31)) {
-    threads_y *= 2;
-  }
-  int32_t threads_x = cuda::ceil_div(total, threads_y);
-  auto [grid, block] = get_grid_and_block(threads_x, threads_y, 1);
-  auto& stream = encoder.stream();
-  if (keys.flags().row_contiguous) {
-    encoder.add_kernel_node(
-        cu::rbitsc,
-        grid,
-        block,
-        keys.data<uint32_t>(),
-        out.data<uint8_t>(),
-        grid_dims,
-        odd,
-        bytes_per_key);
-  } else {
-    encoder.add_kernel_node(
-        cu::rbits,
-        grid,
-        block,
-        keys.data<uint32_t>(),
-        out.data<uint8_t>(),
-        grid_dims,
-        odd,
-        bytes_per_key,
-        keys.ndim(),
-        const_param(keys.shape()),
-        const_param(keys.strides()));
-  }
+  encoder.launch_kernel([&](cudaStream_t stream) {
+    dim3 grid_dims{num_keys, half_size + odd}; // members are 32-bit unsigned
+    int64_t total = static_cast<int64_t>(grid_dims.x) * grid_dims.y;
+    int32_t threads_y = 1;
+    while ((total / threads_y) >= (1U << 31)) {
+      threads_y *= 2;
+    }
+    int32_t threads_x = cuda::ceil_div(total, threads_y);
+    auto [grid, block] = get_grid_and_block(threads_x, threads_y, 1);
+    if (keys.flags().row_contiguous) {
+      cu::rbitsc<<<grid, block, 0, stream>>>(
+          keys.data<uint32_t>(),
+          out.data<uint8_t>(),
+          grid_dims,
+          odd,
+          bytes_per_key);
+    } else {
+      cu::rbits<<<grid, block, 0, stream>>>(
+          keys.data<uint32_t>(),
+          out.data<uint8_t>(),
+          grid_dims,
+          odd,
+          bytes_per_key,
+          keys.ndim(),
+          const_param(keys.shape()),
+          const_param(keys.strides()));
+    }
+  });
 }

 } // namespace mlx::core
--- a/mlx/backend/cuda/reduce/all_reduce.cu
+++ b/mlx/backend/cuda/reduce/all_reduce.cu
@@ -37,15 +37,15 @@ __global__ void all_reduce(T* in, U* out, size_t block_step, size_t size) {
  for (; i + block.size() * N <= check; i += block.size() * N) {
    cub::LoadDirectBlockedVectorized<T, N>(block.thread_rank(), in + i, vals);
    for (int j = 0; j < N; j++) {
-      accs[0] = op(accs[0], cast_to<U>(vals[j]));
+      accs[0] = op(accs[0], __cast<U, T>(vals[j]));
    }
  }

  if (i < check) {
    cub::LoadDirectBlocked(
-        block.thread_rank(), in + i, vals, check - i, cast_to<T>(init));
+        block.thread_rank(), in + i, vals, check - i, __cast<T, U>(init));
    for (int i = 0; i < N; i++) {
-      accs[0] = op(accs[0], cast_to<U>(vals[i]));
+      accs[0] = op(accs[0], __cast<U, T>(vals[i]));
    }
  }

@@ -110,20 +110,19 @@ void all_reduce(
    intermediate.set_data(allocator::malloc(intermediate.nbytes()));
    encoder.add_temporary(intermediate);
    encoder.set_output_array(intermediate);
-    dispatch_all_types(dt, [&](auto type_tag) {
-      dispatch_reduce_ops(reduce_type, [&](auto reduce_type_tag) {
-        using OP = MLX_GET_TYPE(reduce_type_tag);
-        using T = cuda_type_t<MLX_GET_TYPE(type_tag)>;
-        using U = typename cu::ReduceResult<OP, T>::type;
-        auto kernel = cu::all_reduce<T, U, OP, N_READS>;
-        encoder.add_kernel_node(
-            kernel,
-            blocks,
-            threads,
-            static_cast<T*>(indata),
-            intermediate.data<U>(),
-            block_step,
-            insize);
+    encoder.launch_kernel([&](cudaStream_t stream) {
+      dispatch_all_types(dt, [&](auto type_tag) {
+        dispatch_reduce_ops(reduce_type, [&](auto reduce_type_tag) {
+          using OP = MLX_GET_TYPE(reduce_type_tag);
+          using T = cuda_type_t<MLX_GET_TYPE(type_tag)>;
+          using U = typename cu::ReduceResult<OP, T>::type;
+          auto kernel = cu::all_reduce<T, U, OP, N_READS>;
+          kernel<<<blocks, threads, 0, stream>>>(
+              static_cast<T*>(indata),
+              intermediate.data<U>(),
+              block_step,
+              insize);
+        });
      });
    });

@@ -136,20 +135,16 @@ void all_reduce(
  }

  encoder.set_output_array(out);
-  dispatch_all_types(dt, [&](auto type_tag) {
-    dispatch_reduce_ops(reduce_type, [&](auto reduce_type_tag) {
-      using OP = MLX_GET_TYPE(reduce_type_tag);
-      using T = cuda_type_t<MLX_GET_TYPE(type_tag)>;
-      using U = typename cu::ReduceResult<OP, T>::type;
-      auto kernel = cu::all_reduce<T, U, OP, N_READS>;
-      encoder.add_kernel_node(
-          kernel,
-          blocks,
-          threads,
-          static_cast<T*>(indata),
-          out.data<U>(),
-          block_step,
-          insize);
+  encoder.launch_kernel([&](cudaStream_t stream) {
+    dispatch_all_types(dt, [&](auto type_tag) {
+      dispatch_reduce_ops(reduce_type, [&](auto reduce_type_tag) {
+        using OP = MLX_GET_TYPE(reduce_type_tag);
+        using T = cuda_type_t<MLX_GET_TYPE(type_tag)>;
+        using U = typename cu::ReduceResult<OP, T>::type;
+        auto kernel = cu::all_reduce<T, U, OP, N_READS>;
+        kernel<<<blocks, threads, 0, stream>>>(
+            static_cast<T*>(indata), out.data<U>(), block_step, insize);
+      });
    });
  });
 }
--- a/mlx/backend/cuda/reduce/col_reduce.cu
+++ b/mlx/backend/cuda/reduce/col_reduce.cu
@@ -3,6 +3,7 @@
 #include <numeric>

 #include "mlx/backend/cuda/device.h"
+#include "mlx/backend/cuda/device/cast_op.cuh"
 #include "mlx/backend/cuda/reduce/reduce.cuh"

 #include <cooperative_groups.h>
@@ -127,7 +128,7 @@ col_reduce_looped(T* in, U* out, const __grid_constant__ ColReduceArgs args) {
        T vals[N_READS];
        cub::LoadDirectBlockedVectorized(thread_x, in + loop.location(), vals);
        for (int i = 0; i < N_READS; i++) {
-          totals[i] = op(totals[i], cast_to<U>(vals[i]));
+          totals[i] = op(totals[i], __cast<U, T>(vals[i]));
        }
        loop.next(BM, args.reduce_shape.data(), args.reduce_strides.data());
      }
@@ -136,7 +137,7 @@ col_reduce_looped(T* in, U* out, const __grid_constant__ ColReduceArgs args) {
        T vals[N_READS];
        cub::LoadDirectBlocked(thread_x, in + loop.location(), vals);
        for (int i = 0; i < N_READS; i++) {
-          totals[i] = op(totals[i], cast_to<U>(vals[i]));
+          totals[i] = op(totals[i], __cast<U, T>(vals[i]));
        }
        loop.next(BM, args.reduce_shape.data(), args.reduce_strides.data());
      }
@@ -149,9 +150,9 @@ col_reduce_looped(T* in, U* out, const __grid_constant__ ColReduceArgs args) {
          in + loop.location(),
          vals,
          args.reduction_stride - tile_x * BN,
-          cast_to<T>(ReduceInit<Op, T>::value()));
+          __cast<T, U>(ReduceInit<Op, T>::value()));
      for (int i = 0; i < N_READS; i++) {
-        totals[i] = op(totals[i], cast_to<U>(vals[i]));
+        totals[i] = op(totals[i], __cast<U, T>(vals[i]));
      }
      loop.next(BM, args.reduce_shape.data(), args.reduce_strides.data());
    }
@@ -213,24 +214,26 @@ void col_reduce_looped(

  encoder.set_input_array(in);
  encoder.set_output_array(out);
-  dispatch_all_types(in.dtype(), [&](auto type_tag) {
-    dispatch_reduce_ops(reduce_type, [&](auto reduce_type_tag) {
-      dispatch_reduce_ndim(args.reduce_ndim, [&](auto reduce_ndim) {
-        using OP = MLX_GET_TYPE(reduce_type_tag);
-        using T = cuda_type_t<MLX_GET_TYPE(type_tag)>;
-        using U = typename cu::ReduceResult<OP, T>::type;
-        // Cub doesn't like const pointers for vectorized loads. (sigh)
-        T* indata = const_cast<T*>(in.data<T>());
+  encoder.launch_kernel([&](cudaStream_t stream) {
+    dispatch_all_types(in.dtype(), [&](auto type_tag) {
+      dispatch_reduce_ops(reduce_type, [&](auto reduce_type_tag) {
+        dispatch_reduce_ndim(args.reduce_ndim, [&](auto reduce_ndim) {
+          using OP = MLX_GET_TYPE(reduce_type_tag);
+          using T = cuda_type_t<MLX_GET_TYPE(type_tag)>;
+          using U = typename cu::ReduceResult<OP, T>::type;

-        constexpr int N_READS = 4;
-        constexpr int BM = 32;
-        constexpr int BN = 32;
-        dim3 grid = output_grid_for_col_reduce(out, args, BN);
-        int blocks = BM * BN / N_READS;
-        auto kernel =
-            cu::col_reduce_looped<T, U, OP, reduce_ndim(), BM, BN, N_READS>;
-        encoder.add_kernel_node(
-            kernel, grid, blocks, indata, out.data<U>(), args);
+          // Cub doesn't like const pointers for vectorized loads. (sigh)
+          T* indata = const_cast<T*>(in.data<T>());
+
+          constexpr int N_READS = 4;
+          constexpr int BM = 32;
+          constexpr int BN = 32;
+          dim3 grid = output_grid_for_col_reduce(out, args, BN);
+          int blocks = BM * BN / N_READS;
+          auto kernel =
+              cu::col_reduce_looped<T, U, OP, reduce_ndim(), BM, BN, N_READS>;
+          kernel<<<grid, blocks, 0, stream>>>(indata, out.data<U>(), args);
+        });
      });
    });
  });
--- a/mlx/backend/cuda/reduce/init_reduce.cu
+++ b/mlx/backend/cuda/reduce/init_reduce.cu
@@ -32,16 +32,18 @@ void init_reduce(
  }

  encoder.set_output_array(out);
-  dispatch_all_types(in.dtype(), [&](auto type_tag) {
-    dispatch_reduce_ops(reduce_type, [&](auto reduce_type_tag) {
-      using OP = MLX_GET_TYPE(reduce_type_tag);
-      using T = cuda_type_t<MLX_GET_TYPE(type_tag)>;
-      using U = typename cu::ReduceResult<OP, T>::type;
-      auto kernel = cu::init_reduce<T, U, OP>;
-      dim3 grid = get_2d_grid_dims(out.shape(), out.strides());
-      dim3 block(grid.x < 1024 ? grid.x : 1024, 1, 1);
-      grid.x = (grid.x + 1023) / 1024;
-      encoder.add_kernel_node(kernel, grid, block, out.data<U>(), out.size());
+  encoder.launch_kernel([&](cudaStream_t stream) {
+    dispatch_all_types(in.dtype(), [&](auto type_tag) {
+      dispatch_reduce_ops(reduce_type, [&](auto reduce_type_tag) {
+        using OP = MLX_GET_TYPE(reduce_type_tag);
+        using T = cuda_type_t<MLX_GET_TYPE(type_tag)>;
+        using U = typename cu::ReduceResult<OP, T>::type;
+        auto kernel = cu::init_reduce<T, U, OP>;
+        dim3 grid = get_2d_grid_dims(out.shape(), out.strides());
+        dim3 block(grid.x < 1024 ? grid.x : 1024, 1, 1);
+        grid.x = (grid.x + 1023) / 1024;
+        kernel<<<grid, block, 0, stream>>>(out.data<U>(), out.size());
+      });
    });
  });
 }
--- a/mlx/backend/cuda/reduce/reduce_ops.cuh
+++ b/mlx/backend/cuda/reduce/reduce_ops.cuh
@@ -2,8 +2,6 @@

 #pragma once

-#include "mlx/backend/cuda/device/atomic_ops.cuh"
-#include "mlx/backend/cuda/device/cast_op.cuh"
 #include "mlx/backend/cuda/device/utils.cuh"
 #include "mlx/backend/cuda/reduce/reduce_utils.cuh"

@@ -42,15 +40,15 @@ struct Sum {
  }

  __device__ void atomic_update(__nv_bfloat16* x, __nv_bfloat16 y) {
-    atomic_add(x, y);
+    atomicAdd(x, y);
  }

  __device__ void atomic_update(int* x, int y) {
-    atomic_add(x, y);
+    atomicAdd(x, y);
  }

  __device__ void atomic_update(float* x, float y) {
-    atomic_add(x, y);
+    atomicAdd(x, y);
  }
 };

@@ -154,7 +152,7 @@ struct ReduceInit<Sum, T> {
    if constexpr (cuda::std::is_same_v<T, cuComplex>) {
      return T{0, 0};
    } else {
-      return cast_to<typename ReduceResult<Sum, T>::type>(0);
+      return typename ReduceResult<Sum, T>::type{0};
    }
  }
 };
@@ -165,7 +163,7 @@ struct ReduceInit<Prod, T> {
    if constexpr (cuda::std::is_same_v<T, cuComplex>) {
      return T{1, 0};
    } else {
-      return cast_to<typename ReduceResult<Prod, T>::type>(1);
+      return typename ReduceResult<Prod, T>::type{1};
    }
  }
 };
--- a/mlx/backend/cuda/reduce/reduce_utils.cuh
+++ b/mlx/backend/cuda/reduce/reduce_utils.cuh
@@ -55,6 +55,22 @@ __device__ void atomic_reduce(T* x, T y) {
  }
 }

+// TODO: Should make a custom complex type
+template <typename U, typename T>
+inline __device__ U __cast(T x) {
+  return static_cast<U>(x);
+}
+
+template <>
+inline __device__ bool __cast<bool, cuComplex>(cuComplex x) {
+  return x.x != 0 || x.y != 0; // non-zero if either component is non-zero
+}
+
+template <>
+inline __device__ cuComplex __cast<cuComplex, bool>(bool x) {
+  return x ? make_cuFloatComplex(1, 1) : make_cuFloatComplex(0, 0);
+}
+
 template <typename T, int N, typename Block, typename Warp, typename Op>
 inline __device__ void
 block_reduce(Block block, Warp warp, T (&vals)[N], T* smem, Op op, T init) {
--- a/mlx/backend/cuda/reduce/row_reduce.cu
+++ b/mlx/backend/cuda/reduce/row_reduce.cu
@@ -3,6 +3,7 @@
 #include <numeric>

 #include "mlx/backend/cuda/device.h"
+#include "mlx/backend/cuda/device/cast_op.cuh"
 #include "mlx/backend/cuda/reduce/reduce.cuh"

 #include <cooperative_groups.h>
@@ -112,7 +113,7 @@ __global__ void row_reduce_simple(T* in, U* out, size_t n_rows, int size) {
            in + k * size + r * (block.size() * N),
            vals[k]);
        for (int j = 0; j < N; j++) {
-          accs[k] = op(accs[k], cast_to<U>(vals[k][j]));
+          accs[k] = op(accs[k], __cast<U, T>(vals[k][j]));
        }
      }
    }
@@ -124,7 +125,7 @@ __global__ void row_reduce_simple(T* in, U* out, size_t n_rows, int size) {
            in + k * size + r * (block.size() * N),
            vals[k]);
        for (int j = 0; j < N; j++) {
-          accs[k] = op(accs[k], cast_to<U>(vals[k][j]));
+          accs[k] = op(accs[k], __cast<U, T>(vals[k][j]));
        }
      }
    }
@@ -137,9 +138,9 @@ __global__ void row_reduce_simple(T* in, U* out, size_t n_rows, int size) {
          in + k * size + final_offset,
          vals[k],
          size,
-          cast_to<T>(init));
+          __cast<T, U>(init));
      for (int j = 0; j < N; j++) {
-        accs[k] = op(accs[k], cast_to<U>(vals[k][j]));
+        accs[k] = op(accs[k], __cast<U, T>(vals[k][j]));
      }
    }
  }
@@ -198,7 +199,7 @@ __global__ void row_reduce_looped(
          in + loop.location() + r * BLOCK_DIM * N_READS,
          vals);
      for (int i = 0; i < N_READS; i++) {
-        total[0] = op(total[0], cast_to<U>(vals[i]));
+        total[0] = op(total[0], __cast<U, T>(vals[i]));
      }
    }
    if (final_offset < args.row_size) {
@@ -208,9 +209,9 @@ __global__ void row_reduce_looped(
          in + loop.location() + final_offset,
          vals,
          args.row_size - final_offset,
-          cast_to<T>(init));
+          __cast<T, U>(init));
      for (int i = 0; i < N_READS; i++) {
-        total[0] = op(total[0], cast_to<U>(vals[i]));
+        total[0] = op(total[0], __cast<U, T>(vals[i]));
      }
    }
    // TODO: Maybe block.sync() here?
@@ -244,32 +245,34 @@ void row_reduce_simple(
  //       2 passes. Something like 32 * out.size() and then do a warp reduce.
  encoder.set_input_array(in);
  encoder.set_output_array(out);
-  dispatch_all_types(in.dtype(), [&](auto type_tag) {
-    dispatch_reduce_ops(reduce_type, [&](auto reduce_type_tag) {
-      using OP = MLX_GET_TYPE(reduce_type_tag);
-      using T = cuda_type_t<MLX_GET_TYPE(type_tag)>;
-      using U = typename cu::ReduceResult<OP, T>::type;
+  encoder.launch_kernel([&](cudaStream_t stream) {
+    dispatch_all_types(in.dtype(), [&](auto type_tag) {
+      dispatch_reduce_ops(reduce_type, [&](auto reduce_type_tag) {
+        using OP = MLX_GET_TYPE(reduce_type_tag);
+        using T = cuda_type_t<MLX_GET_TYPE(type_tag)>;
+        using U = typename cu::ReduceResult<OP, T>::type;

-      // Cub doesn't like const pointers for vectorized loads. (sigh)
-      T* indata = const_cast<T*>(in.data<T>());
+        // Cub doesn't like const pointers for vectorized loads. (sigh)
+        T* indata = const_cast<T*>(in.data<T>());

-      // Calculate the grid and block dims
-      size_t reductions = (plan.shape.back() + N_READS - 1) / N_READS;
-      dim3 grid = get_2d_grid_dims(out.shape(), out.strides());
-      int threads = std::min(1024UL, reductions);
-      threads = ((threads + WARP_SIZE - 1) / WARP_SIZE) * WARP_SIZE;
-      dim3 block(threads, 1, 1);
+        // Calculate the grid and block dims
+        size_t reductions = (plan.shape.back() + N_READS - 1) / N_READS;
+        dim3 grid = get_2d_grid_dims(out.shape(), out.strides());
+        int threads = std::min(1024UL, reductions);
+        threads = ((threads + WARP_SIZE - 1) / WARP_SIZE) * WARP_SIZE;
+        dim3 block(threads, 1, 1);

-      // Pick the kernel
-      auto kernel = cu::row_reduce_simple<T, U, OP, N_READS>;
-      if (grid.x >= 1024) {
-        grid.x = (grid.x + 1) / 2;
-        kernel = cu::row_reduce_simple<T, U, OP, N_READS, 2>;
-      }
+        // Pick the kernel
+        auto kernel = cu::row_reduce_simple<T, U, OP, N_READS>;
+        if (grid.x >= 1024) {
+          grid.x = (grid.x + 1) / 2;
+          kernel = cu::row_reduce_simple<T, U, OP, N_READS, 2>;
+        }

-      int size = plan.shape.back();
-      encoder.add_kernel_node(
-          kernel, grid, block, indata, out.data<U>(), out.size(), size);
+        // Launch
+        kernel<<<grid, block, 0, stream>>>(
+            indata, out.data<U>(), out.size(), plan.shape.back());
+      });
    });
  });
 }
@@ -290,39 +293,43 @@ void row_reduce_looped(

  encoder.set_input_array(in);
  encoder.set_output_array(out);
-  dispatch_all_types(in.dtype(), [&](auto type_tag) {
-    dispatch_reduce_ops(reduce_type, [&](auto reduce_type_tag) {
-      using OP = MLX_GET_TYPE(reduce_type_tag);
-      using T = cuda_type_t<MLX_GET_TYPE(type_tag)>;
-      using U = typename cu::ReduceResult<OP, T>::type;
-      // Cub doesn't like const pointers for vectorized loads. (sigh)
-      T* indata = const_cast<T*>(in.data<T>());
+  encoder.launch_kernel([&](cudaStream_t stream) {
+    dispatch_all_types(in.dtype(), [&](auto type_tag) {
+      dispatch_reduce_ops(reduce_type, [&](auto reduce_type_tag) {
+        using OP = MLX_GET_TYPE(reduce_type_tag);
+        using T = cuda_type_t<MLX_GET_TYPE(type_tag)>;
+        using U = typename cu::ReduceResult<OP, T>::type;

-      // Calculate the grid and block dims
-      args.sort_access_pattern(in, axes);
-      dim3 grid = get_2d_grid_dims(out.shape(), out.strides());
-      size_t reductions = (args.row_size + N_READS - 1) / N_READS;
-      int threads = std::min(1024UL, reductions);
-      threads = ((threads + WARP_SIZE - 1) / WARP_SIZE) * WARP_SIZE;
-      dim3 block(threads, 1, 1);
+        // Cub doesn't like const pointers for vectorized loads. (sigh)
+        T* indata = const_cast<T*>(in.data<T>());

-      // Pick the kernel
-      auto kernel = cu::row_reduce_looped<T, U, OP, 1, 32, N_READS>;
-      dispatch_reduce_ndim(args.reduce_ndim, [&](auto reduce_ndim) {
-        dispatch_block_dim(threads, [&](auto threads_constant) {
-          kernel = cu::row_reduce_looped<
-              T,
-              U,
-              OP,
-              reduce_ndim.value,
-              threads_constant.value,
-              N_READS>;
-          block.x = threads_constant.value;
+        // Calculate the grid and block dims
+        args.sort_access_pattern(in, axes);
+        dim3 grid = get_2d_grid_dims(out.shape(), out.strides());
+        size_t reductions = (args.row_size + N_READS - 1) / N_READS;
+        int threads = std::min(1024UL, reductions);
+        threads = ((threads + WARP_SIZE - 1) / WARP_SIZE) * WARP_SIZE;
+        dim3 block(threads, 1, 1);
+
+        // Pick the kernel
+        auto kernel = cu::row_reduce_looped<T, U, OP, 1, 32, N_READS>;
+        dispatch_reduce_ndim(args.reduce_ndim, [&](auto reduce_ndim) {
+          dispatch_block_dim(threads, [&](auto threads_constant) {
+            kernel = cu::row_reduce_looped<
+                T,
+                U,
+                OP,
+                reduce_ndim.value,
+                threads_constant.value,
+                N_READS>;
+            block.x = threads_constant.value;
+          });
        });
-      });

-      encoder.add_kernel_node(
-          kernel, grid, block, indata, out.data<U>(), out.size(), args);
+        // Launch
+        kernel<<<grid, block, 0, stream>>>(
+            indata, out.data<U>(), out.size(), args);
+      });
    });
  });
 }
--- a/mlx/backend/cuda/rms_norm.cu
+++ b/mlx/backend/cuda/rms_norm.cu
@@ -74,7 +74,7 @@ __global__ void rms_norm(
  for (int r = 0; r < cuda::ceil_div(axis_size, BLOCK_DIM * N_READS); ++r) {
    auto index = r * BLOCK_DIM + block.thread_rank();
    T xn[N_READS];
-    cub::LoadDirectBlocked(index, x, xn, axis_size, cast_to<T>(0));
+    cub::LoadDirectBlocked(index, x, xn, axis_size, 0);
    for (int i = 0; i < N_READS; ++i) {
      float t = static_cast<float>(xn[i]);
      normalizer += t * t;
@@ -130,7 +130,7 @@ __global__ void rms_norm_vjp(
    T wn[N_READS] = {};
    T gn[N_READS] = {};
    auto index = r * BLOCK_DIM + block.thread_rank();
-    cub::LoadDirectBlocked(index, x, xn, axis_size, cast_to<T>(0));
+    cub::LoadDirectBlocked(index, x, xn, axis_size, 0);
    cub::LoadDirectBlocked(index, g, gn, axis_size);
    cub::LoadDirectBlocked(index, strided_iterator(w, w_stride), wn, axis_size);
    for (int i = 0; i < N_READS; i++) {
@@ -224,21 +224,21 @@ void RMSNorm::eval_gpu(
  encoder.set_input_array(x);
  encoder.set_input_array(w);
  encoder.set_output_array(out);
-  dispatch_float_types(out.dtype(), "rms_norm", [&](auto type_tag) {
-    constexpr uint32_t N_READS = 4;
-    dispatch_block_dim(cuda::ceil_div(axis_size, N_READS), [&](auto block_dim) {
-      using DataType = cuda_type_t<MLX_GET_TYPE(type_tag)>;
-      auto kernel = cu::rms_norm<DataType, block_dim(), N_READS>;
-      encoder.add_kernel_node(
-          kernel,
-          n_rows,
-          block_dim(),
-          x.data<DataType>(),
-          w.data<DataType>(),
-          out.data<DataType>(),
-          eps_,
-          axis_size,
-          w_stride);
+  encoder.launch_kernel([&](cudaStream_t stream) {
+    dispatch_float_types(out.dtype(), "rms_norm", [&](auto type_tag) {
+      constexpr uint32_t N_READS = 4;
+      dispatch_block_dim(
+          cuda::ceil_div(axis_size, N_READS), [&](auto block_dim) {
+            using DataType = cuda_type_t<MLX_GET_TYPE(type_tag)>;
+            auto kernel = cu::rms_norm<DataType, block_dim(), N_READS>;
+            kernel<<<n_rows, block_dim(), 0, stream>>>(
+                x.data<DataType>(),
+                w.data<DataType>(),
+                out.data<DataType>(),
+                eps_,
+                axis_size,
+                w_stride);
+          });
    });
  });
 }
@@ -253,24 +253,20 @@ void RMSNormVJP::eval_gpu(
  // Ensure row contiguity. We could relax this step by checking that the array
  // is contiguous (no broadcasts or holes) and that the input strides are the
  // same as the cotangent strides but for now this is simpler.
-  auto check_input = [&s](const array& x, bool& copied) {
+  auto check_input = [&s](const array& x) -> std::pair<array, bool> {
    if (x.flags().row_contiguous) {
-      copied = false;
-      return x;
+      return {x, false};
    }
-    copied = true;
    array x_copy(x.shape(), x.dtype(), nullptr, {});
    copy_gpu(x, x_copy, CopyType::General, s);
-    return x_copy;
+    return {x_copy, true};
  };
  bool donate_x = inputs[0].is_donatable();
  bool donate_g = inputs[2].is_donatable();
-  bool copied;
-  auto x = check_input(inputs[0], copied);
+  auto [x, copied] = check_input(inputs[0]);
  donate_x |= copied;
  const array& w = inputs[1];
-  bool g_copied;
-  auto g = check_input(inputs[2], g_copied);
+  auto [g, g_copied] = check_input(inputs[2]);
  donate_g |= g_copied;
  array& gx = outputs[0];
  array& gw = outputs[1];
@@ -314,31 +310,30 @@ void RMSNormVJP::eval_gpu(
  encoder.set_input_array(g);
  encoder.set_output_array(gx);
  encoder.set_output_array(gw_temp);
-  dispatch_float_types(gx.dtype(), "rms_norm_vjp", [&](auto type_tag) {
-    dispatch_bool(has_w, [&](auto has_w_constant) {
-      constexpr int N_READS = 4;
-      dispatch_block_dim(
-          cuda::ceil_div(axis_size, N_READS), [&](auto block_dim) {
-            using DataType = cuda_type_t<MLX_GET_TYPE(type_tag)>;
-            constexpr int N_READS = 4;
-            auto kernel = cu::rms_norm_vjp<
-                DataType,
-                has_w_constant.value,
-                block_dim(),
-                N_READS>;
-            encoder.add_kernel_node(
-                kernel,
-                n_rows,
-                block_dim(),
-                x.data<DataType>(),
-                w.data<DataType>(),
-                g.data<DataType>(),
-                gx.data<DataType>(),
-                gw_temp.data<DataType>(),
-                eps_,
-                axis_size,
-                w_stride);
-          });
+  encoder.launch_kernel([&, x = x, g = g](cudaStream_t stream) {
+    dispatch_float_types(gx.dtype(), "rms_norm_vjp", [&](auto type_tag) {
+      dispatch_bool(has_w, [&](auto has_w_constant) {
+        constexpr int N_READS = 4;
+        dispatch_block_dim(
+            cuda::ceil_div(axis_size, N_READS), [&](auto block_dim) {
+              using DataType = cuda_type_t<MLX_GET_TYPE(type_tag)>;
+              constexpr int N_READS = 4;
+              auto kernel = cu::rms_norm_vjp<
+                  DataType,
+                  has_w_constant.value,
+                  block_dim(),
+                  N_READS>;
+              kernel<<<n_rows, block_dim(), 0, stream>>>(
+                  x.data<DataType>(),
+                  w.data<DataType>(),
+                  g.data<DataType>(),
+                  gx.data<DataType>(),
+                  gw_temp.data<DataType>(),
+                  eps_,
+                  axis_size,
+                  w_stride);
+            });
+      });
    });
  });

--- a/mlx/backend/cuda/rope.cu
+++ b/mlx/backend/cuda/rope.cu
@@ -308,89 +308,76 @@ void RoPE::eval_gpu(
  auto& encoder = cu::get_command_encoder(s);
  encoder.set_input_array(donated ? out : in);
  encoder.set_input_array(offset);
-  if (with_freqs) {
-    encoder.set_input_array(inputs[2]);
-  }
  encoder.set_output_array(out);
-  dispatch_float_types(out.dtype(), "rope", [&](auto type_tag) {
-    dispatch_bool(traditional_, [&](auto traditional) {
-      dispatch_bool(forward_, [&](auto forward) {
-        using DataType = cuda_type_t<MLX_GET_TYPE(type_tag)>;
-        if (single && !with_freqs) {
-          auto kernel =
-              cu::rope_single<DataType, traditional.value, forward.value>;
-          uint2 dims = make_uint2(dims_ / 2, in.size() / mat_size);
-          auto [grid, block] = get_grid_and_block(dims.x, dims.y, 1);
-          encoder.add_kernel_node(
-              kernel,
-              grid,
-              block,
-              (donated ? out : in).data<DataType>(),
-              out.data<DataType>(),
-              offset.data<int32_t>(),
-              scale_,
-              std::log2(base_),
-              mat_size,
-              dims);
-        } else if (single) {
-          auto kernel =
-              cu::rope_single_freqs<DataType, traditional.value, forward.value>;
-          uint2 dims = make_uint2(dims_ / 2, in.size() / mat_size);
-          auto [grid, block] = get_grid_and_block(dims.x, dims.y, 1);
-          encoder.add_kernel_node(
-              kernel,
-              grid,
-              block,
-              (donated ? out : in).data<DataType>(),
-              out.data<DataType>(),
-              offset.data<int32_t>(),
-              inputs[2].data<float>(),
-              scale_,
-              mat_size,
-              dims,
-              inputs[2].strides(0));
-        } else if (with_freqs) {
-          auto kernel =
-              cu::rope_freqs<DataType, traditional.value, forward.value>;
-          uint3 dims =
-              make_uint3(dims_ / 2, in.shape(-2), in.size() / mat_size);
-          dims.z = (dims.z + 3) / 4;
-          auto [grid, block] = get_grid_and_block(dims.x, dims.y, dims.z);
-          encoder.add_kernel_node(
-              kernel,
-              grid,
-              block,
-              (donated ? out : in).data<DataType>(),
-              out.data<DataType>(),
-              offset.data<int32_t>(),
-              inputs[2].data<float>(),
-              scale_,
-              std::log2(base_),
-              strides,
-              out_strides,
-              in.size() / mat_size,
-              dims,
-              inputs[2].strides(0));
-        } else {
-          auto kernel = cu::rope<DataType, traditional.value, forward.value>;
-          uint3 dims =
-              make_uint3(dims_ / 2, in.shape(-2), in.size() / mat_size);
-          dims.z = (dims.z + 3) / 4;
-          auto [grid, block] = get_grid_and_block(dims.x, dims.y, dims.z);
-          encoder.add_kernel_node(
-              kernel,
-              grid,
-              block,
-              (donated ? out : in).data<DataType>(),
-              out.data<DataType>(),
-              offset.data<int32_t>(),
-              scale_,
-              std::log2(base_),
-              strides,
-              out_strides,
-              in.size() / mat_size,
-              dims);
-        }
+  encoder.launch_kernel([&](cudaStream_t stream) {
+    dispatch_float_types(out.dtype(), "rope", [&](auto type_tag) {
+      dispatch_bool(traditional_, [&](auto traditional) {
+        dispatch_bool(forward_, [&](auto forward) {
+          using DataType = cuda_type_t<MLX_GET_TYPE(type_tag)>;
+          if (single && !with_freqs) {
+            auto kernel =
+                cu::rope_single<DataType, traditional.value, forward.value>;
+            uint2 dims = make_uint2(dims_ / 2, in.size() / mat_size);
+            auto [grid, block] = get_grid_and_block(dims.x, dims.y, 1);
+            kernel<<<grid, block, 0, stream>>>(
+                (donated ? out : in).data<DataType>(),
+                out.data<DataType>(),
+                offset.data<int32_t>(),
+                scale_,
+                std::log2(base_),
+                mat_size,
+                dims);
+          } else if (single) {
+            auto kernel = cu::
+                rope_single_freqs<DataType, traditional.value, forward.value>;
+            uint2 dims = make_uint2(dims_ / 2, in.size() / mat_size);
+            auto [grid, block] = get_grid_and_block(dims.x, dims.y, 1);
+            kernel<<<grid, block, 0, stream>>>(
+                (donated ? out : in).data<DataType>(),
+                out.data<DataType>(),
+                offset.data<int32_t>(),
+                inputs[2].data<float>(),
+                scale_,
+                mat_size,
+                dims,
+                inputs[2].strides(0));
+          } else if (with_freqs) {
+            auto kernel =
+                cu::rope_freqs<DataType, traditional.value, forward.value>;
+            uint3 dims =
+                make_uint3(dims_ / 2, in.shape(-2), in.size() / mat_size);
+            dims.z = (dims.z + 3) / 4;
+            auto [grid, block] = get_grid_and_block(dims.x, dims.y, dims.z);
+            kernel<<<grid, block, 0, stream>>>(
+                (donated ? out : in).data<DataType>(),
+                out.data<DataType>(),
+                offset.data<int32_t>(),
+                inputs[2].data<float>(),
+                scale_,
+                std::log2(base_),
+                strides,
+                out_strides,
+                in.size() / mat_size,
+                dims,
+                inputs[2].strides(0));
+          } else {
+            auto kernel = cu::rope<DataType, traditional.value, forward.value>;
+            uint3 dims =
+                make_uint3(dims_ / 2, in.shape(-2), in.size() / mat_size);
+            dims.z = (dims.z + 3) / 4;
+            auto [grid, block] = get_grid_and_block(dims.x, dims.y, dims.z);
+            kernel<<<grid, block, 0, stream>>>(
+                (donated ? out : in).data<DataType>(),
+                out.data<DataType>(),
+                offset.data<int32_t>(),
+                scale_,
+                std::log2(base_),
+                strides,
+                out_strides,
+                in.size() / mat_size,
+                dims);
+          }
+        });
      });
    });
  });
--- a/mlx/backend/cuda/softmax.cu
+++ b/mlx/backend/cuda/softmax.cu
@@ -43,7 +43,7 @@ __global__ void softmax(const T* in, T* out, int axis_size) {
  // Thread reduce.
  AccT prevmax;
  AccT maxval = Limits<AccT>::finite_min();
-  AccT normalizer = cast_to<AccT>(0);
+  AccT normalizer = 0;
  for (int r = 0; r < cuda::ceil_div(axis_size, BLOCK_DIM * N_READS); r++) {
    AccT vals[N_READS];
    cub::LoadDirectBlocked(
@@ -141,21 +141,19 @@ void Softmax::eval_gpu(const std::vector<array>& inputs, array& out) {
  auto& encoder = cu::get_command_encoder(s);
  encoder.set_input_array(in);
  encoder.set_output_array(out);
-  dispatch_float_types(out.dtype(), "softmax", [&](auto type_tag) {
-    constexpr int N_READS = 4;
-    dispatch_block_dim(cuda::ceil_div(axis_size, N_READS), [&](auto block_dim) {
-      using DataType = cuda_type_t<MLX_GET_TYPE(type_tag)>;
-      auto kernel = cu::softmax<DataType, DataType, block_dim(), N_READS>;
-      if (precise) {
-        kernel = cu::softmax<DataType, float, block_dim(), N_READS>;
-      }
-      encoder.add_kernel_node(
-          kernel,
-          n_rows,
-          block_dim(),
-          in.data<DataType>(),
-          out.data<DataType>(),
-          axis_size);
+  encoder.launch_kernel([&](cudaStream_t stream) {
+    dispatch_float_types(out.dtype(), "softmax", [&](auto type_tag) {
+      constexpr int N_READS = 4;
+      dispatch_block_dim(
+          cuda::ceil_div(axis_size, N_READS), [&](auto block_dim) {
+            using DataType = cuda_type_t<MLX_GET_TYPE(type_tag)>;
+            auto kernel = cu::softmax<DataType, DataType, block_dim(), N_READS>;
+            if (precise) {
+              kernel = cu::softmax<DataType, float, block_dim(), N_READS>;
+            }
+            kernel<<<n_rows, block_dim(), 0, stream>>>(
+                in.data<DataType>(), out.data<DataType>(), axis_size);
+          });
    });
  });
 }
--- a/mlx/backend/cuda/sort.cu
+++ b/mlx/backend/cuda/sort.cu
@@ -50,6 +50,32 @@ array swapaxes_in_eval(const array& in, int axis1, int axis2) {
  return out;
 }

+template <typename... Args>  // args: trailing CUB StableSortPairs arguments
+void segmented_sort_pairs(cu::CommandEncoder& encoder, Args&&... args) {
+  // Size query: with a null workspace CUB only reports the bytes needed.
+  size_t size;  // set by the size query below
+  CHECK_CUDA_ERROR(
+      cub::DeviceSegmentedSort::StableSortPairs(nullptr, size, args...));
+  array temp(allocator::malloc(size), {static_cast<int>(size)}, uint8);
+  encoder.add_temporary(temp);  // register the scratch array with encoder
+  // Real run: same arguments (expanded as lvalues again) plus the workspace.
+  CHECK_CUDA_ERROR(cub::DeviceSegmentedSort::StableSortPairs(
+      temp.data<void>(), size, args...));
+}
+
+template <typename... Args>  // args: trailing CUB StableSortKeys arguments
+void segmented_sort(cu::CommandEncoder& encoder, Args&&... args) {
+  // Size query: with a null workspace CUB only reports the bytes needed.
+  size_t size;  // set by the size query below
+  CHECK_CUDA_ERROR(
+      cub::DeviceSegmentedSort::StableSortKeys(nullptr, size, args...));
+  array temp(allocator::malloc(size), {static_cast<int>(size)}, uint8);
+  encoder.add_temporary(temp);  // register the scratch array with encoder
+  // Real run: same arguments (expanded as lvalues again) plus the workspace.
+  CHECK_CUDA_ERROR(cub::DeviceSegmentedSort::StableSortKeys(
+      temp.data<void>(), size, args...));
+}
+
 struct OffsetTransform {
  int nsort;

@@ -87,94 +113,57 @@ void gpu_sort(const Stream& s, array in, array& out_, int axis, bool argsort) {

  encoder.set_input_array(in);
  encoder.set_output_array(out);
-  dispatch_all_types(in.dtype(), [&](auto type_tag) {
-    using CTYPE = MLX_GET_TYPE(type_tag);
-    auto& stream = encoder.stream();
-    if constexpr (!std::is_same_v<CTYPE, complex64_t>) {
-      using Type = cuda_type_t<CTYPE>;
-      auto offsets = thrust::make_transform_iterator(
-          thrust::make_counting_iterator(0), OffsetTransform{nsort});
-      if (argsort) {
-        // Indices in the sorted dimension.
-        array indices(allocator::malloc(out.nbytes()), in.shape(), out.dtype());
-        encoder.add_temporary(indices);
+  encoder.launch_kernel([&](cudaStream_t stream) {
+    dispatch_all_types(in.dtype(), [&](auto type_tag) {
+      using CTYPE = MLX_GET_TYPE(type_tag);
+      if constexpr (!std::is_same_v<CTYPE, complex64_t>) {
+        using Type = cuda_type_t<CTYPE>;
+        auto offsets = thrust::make_transform_iterator(
+            thrust::make_counting_iterator(0), OffsetTransform{nsort});
+        if (argsort) {
+          // Indices in the sorted dimension.
+          array indices(
+              allocator::malloc(out.nbytes()), in.shape(), out.dtype());
+          encoder.add_temporary(indices);
+          thrust::transform(
+              cu::thrust_policy(stream),
+              thrust::counting_iterator<uint32_t>(0),
+              thrust::counting_iterator<uint32_t>(indices.data_size()),
+              thrust::device_pointer_cast(indices.data<uint32_t>()),
+              ModOp<uint32_t>{static_cast<uint32_t>(nsort)});

-        // In argsort though we don't need the result of sorted values, the
-        // API requires us to provide an array to store it.
-        array discard(allocator::malloc(in.nbytes()), in.shape(), in.dtype());
-        encoder.add_temporary(discard);
+          // Argsort does not need the sorted values, but the API still
+          // requires us to provide an array to store them.
+          array discard(allocator::malloc(in.nbytes()), in.shape(), in.dtype());
+          encoder.add_temporary(discard);

-        size_t size;
-        CHECK_CUDA_ERROR(cub::DeviceSegmentedSort::StableSortPairs(
-            nullptr,
-            size,
-            in.data<Type>(),
-            discard.data<Type>(),
-            indices.data<uint32_t>(),
-            out.data<uint32_t>(),
-            in.data_size(),
-            in.data_size() / nsort,
-            offsets,
-            offsets + 1,
-            stream));
-
-        array temp(allocator::malloc(size), {static_cast<int>(size)}, uint8);
-        encoder.add_temporary(temp);
-
-        // Start capturing after allocations
-        auto capture = encoder.capture_context();
-        thrust::transform(
-            cu::thrust_policy(stream),
-            thrust::counting_iterator<uint32_t>(0),
-            thrust::counting_iterator<uint32_t>(indices.data_size()),
-            thrust::device_pointer_cast(indices.data<uint32_t>()),
-            ModOp<uint32_t>{static_cast<uint32_t>(nsort)});
-
-        CHECK_CUDA_ERROR(cub::DeviceSegmentedSort::StableSortPairs(
-            temp.data<void>(),
-            size,
-            in.data<Type>(),
-            discard.data<Type>(),
-            indices.data<uint32_t>(),
-            out.data<uint32_t>(),
-            in.data_size(),
-            in.data_size() / nsort,
-            offsets,
-            offsets + 1,
-            stream));
+          segmented_sort_pairs(
+              encoder,
+              in.data<Type>(),
+              discard.data<Type>(),
+              indices.data<uint32_t>(),
+              out.data<uint32_t>(),
+              in.data_size(),
+              in.data_size() / nsort,
+              offsets,
+              offsets + 1,
+              stream);
+        } else {
+          segmented_sort(
+              encoder,
+              in.data<Type>(),
+              out.data<Type>(),
+              in.data_size(),
+              in.data_size() / nsort,
+              offsets,
+              offsets + 1,
+              stream);
+        }
      } else {
-        size_t size;
-        CHECK_CUDA_ERROR(cub::DeviceSegmentedSort::StableSortKeys(
-            nullptr,
-            size,
-            in.data<Type>(),
-            out.data<Type>(),
-            in.data_size(),
-            in.data_size() / nsort,
-            offsets,
-            offsets + 1,
-            stream));
-
-        array temp(allocator::malloc(size), {static_cast<int>(size)}, uint8);
-        encoder.add_temporary(temp);
-
-        // Start capturing after allocations
-        auto capture = encoder.capture_context();
-        CHECK_CUDA_ERROR(cub::DeviceSegmentedSort::StableSortKeys(
-            temp.data<void>(),
-            size,
-            in.data<Type>(),
-            out.data<Type>(),
-            in.data_size(),
-            in.data_size() / nsort,
-            offsets,
-            offsets + 1,
-            stream));
+        throw std::runtime_error(
+            "CUDA backend does not support sorting complex numbers");
      }
-    } else {
-      throw std::runtime_error(
-          "CUDA backend does not support sorting complex numbers");
-    }
+    });
  });

  if (!is_segmented_sort) {
--- a/mlx/backend/cuda/ternary.cu
+++ b/mlx/backend/cuda/ternary.cu
@@ -91,80 +91,73 @@ void ternary_op_gpu_inplace(
  encoder.set_input_array(b);
  encoder.set_input_array(c);
  encoder.set_output_array(out);
-  dispatch_all_types(out.dtype(), [&](auto type_tag) {
-    using DType = cuda_type_t<MLX_GET_TYPE(type_tag)>;
+  encoder.launch_kernel([&](cudaStream_t stream) {
+    dispatch_all_types(out.dtype(), [&](auto type_tag) {
+      using DType = cuda_type_t<MLX_GET_TYPE(type_tag)>;

-    auto topt = get_ternary_op_type(a, b, c);
-    if (topt == TernaryOpType::General) {
-      dispatch_bool(
-          a.data_size() > INT32_MAX || b.data_size() > INT32_MAX ||
-              c.data_size() > INT32_MAX || out.data_size() > INT32_MAX,
-          [&](auto large) {
-            using IdxT = std::conditional_t<large(), int64_t, int32_t>;
-            Shape shape;
-            std::vector<Strides> strides;
-            std::tie(shape, strides) = collapse_contiguous_dims(a, b, c, out);
-            auto& a_strides = strides[0];
-            auto& b_strides = strides[1];
-            auto& c_strides = strides[2];
-            int ndim = shape.size();
-            if (ndim <= 3) {
-              dispatch_1_2_3(ndim, [&](auto dims_constant) {
-                auto kernel =
-                    cu::ternary_g_nd<Op, DType, IdxT, dims_constant()>;
+      auto topt = get_ternary_op_type(a, b, c);
+      if (topt == TernaryOpType::General) {
+        dispatch_bool(
+            a.data_size() > INT32_MAX || b.data_size() > INT32_MAX ||
+                c.data_size() > INT32_MAX || out.data_size() > INT32_MAX,
+            [&](auto large) {
+              using IdxT = std::conditional_t<large(), int64_t, int32_t>;
+              Shape shape;
+              std::vector<Strides> strides;
+              std::tie(shape, strides) = collapse_contiguous_dims(a, b, c, out);
+              auto& a_strides = strides[0];
+              auto& b_strides = strides[1];
+              auto& c_strides = strides[2];
+              int ndim = shape.size();
+              if (ndim <= 3) {
+                dispatch_1_2_3(ndim, [&](auto dims_constant) {
+                  auto kernel =
+                      cu::ternary_g_nd<Op, DType, IdxT, dims_constant()>;
+                  auto [num_blocks, block_dims] =
+                      get_launch_args(kernel, out, large());
+                  kernel<<<num_blocks, block_dims, 0, stream>>>(
+                      a.data<bool>(),
+                      b.data<DType>(),
+                      c.data<DType>(),
+                      out.data<DType>(),
+                      out.size(),
+                      const_param<dims_constant()>(shape),
+                      const_param<dims_constant()>(a_strides),
+                      const_param<dims_constant()>(b_strides),
+                      const_param<dims_constant()>(c_strides));
+                });
+              } else {
+                auto kernel = cu::ternary_g<Op, DType, IdxT>;
                auto [num_blocks, block_dims] =
                    get_launch_args(kernel, out, large());
-                encoder.add_kernel_node(
-                    kernel,
-                    num_blocks,
-                    block_dims,
+                kernel<<<num_blocks, block_dims, 0, stream>>>(
                    a.data<bool>(),
                    b.data<DType>(),
                    c.data<DType>(),
                    out.data<DType>(),
-                    out.size(),
-                    const_param<dims_constant()>(shape),
-                    const_param<dims_constant()>(a_strides),
-                    const_param<dims_constant()>(b_strides),
-                    const_param<dims_constant()>(c_strides));
-              });
-            } else {
-              auto kernel = cu::ternary_g<Op, DType, IdxT>;
-              auto [num_blocks, block_dims] =
-                  get_launch_args(kernel, out, large());
-              encoder.add_kernel_node(
-                  kernel,
-                  num_blocks,
-                  block_dims,
-                  a.data<bool>(),
-                  b.data<DType>(),
-                  c.data<DType>(),
-                  out.data<DType>(),
-                  out.data_size(),
-                  const_param(shape),
-                  const_param(a_strides),
-                  const_param(b_strides),
-                  const_param(c_strides),
-                  ndim);
-            }
-          });
-    } else {
-      dispatch_bool(out.data_size() > INT32_MAX, [&](auto large) {
-        using IdxT = std::conditional_t<large(), int64_t, uint32_t>;
-        auto kernel = cu::ternary_v<Op, DType, IdxT>;
-        auto [num_blocks, block_dims] = get_launch_args(
-            kernel, out.data_size(), out.shape(), out.strides(), large());
-        encoder.add_kernel_node(
-            kernel,
-            num_blocks,
-            block_dims,
-            a.data<bool>(),
-            b.data<DType>(),
-            c.data<DType>(),
-            out.data<DType>(),
-            out.data_size());
-      });
-    }
+                    out.data_size(),
+                    const_param(shape),
+                    const_param(a_strides),
+                    const_param(b_strides),
+                    const_param(c_strides),
+                    ndim);
+              }
+            });
+      } else {
+        dispatch_bool(out.data_size() > INT32_MAX, [&](auto large) {
+          using IdxT = std::conditional_t<large(), int64_t, uint32_t>;
+          auto kernel = cu::ternary_v<Op, DType, IdxT>;
+          auto [num_blocks, block_dims] = get_launch_args(
+              kernel, out.data_size(), out.shape(), out.strides(), large());
+          kernel<<<num_blocks, block_dims, 0, stream>>>(
+              a.data<bool>(),
+              b.data<DType>(),
+              c.data<DType>(),
+              out.data<DType>(),
+              out.data_size());
+        });
+      }
+    });
  });
 }

--- a/mlx/backend/cuda/unary.cu
+++ b/mlx/backend/cuda/unary.cu
@@ -9,38 +9,14 @@
 #include "mlx/dtype_utils.h"
 #include "mlx/primitives.h"

-#include <cooperative_groups.h>
 #include <nvtx3/nvtx3.hpp>
+#include <thrust/device_ptr.h>
+#include <thrust/transform.h>

 namespace mlx::core {

 namespace cu {

-namespace cg = cooperative_groups;
-
-template <typename Op, typename In, typename Out, typename IdxT>
-__global__ void unary_v(const In* in, Out* out, IdxT size) {
-  IdxT index = cg::this_grid().thread_rank();
-  if (index < size) {
-    out[index] = Op{}(in[index]);
-  }
-}
-
-template <typename Op, typename In, typename Out, typename IdxT>
-__global__ void unary_g(
-    const In* in,
-    Out* out,
-    IdxT size,
-    const __grid_constant__ Shape shape,
-    const __grid_constant__ Strides strides,
-    int ndim) {
-  IdxT index = cg::this_grid().thread_rank();
-  if (index < size) {
-    auto idx = elem_to_loc_4d(index, shape.data(), strides.data(), ndim);
-    out[index] = Op{}(in[idx]);
-  }
-}
-
 template <typename Op, typename In, typename Out>
 constexpr bool supports_unary_op() {
  if (std::is_same_v<Op, Abs> || std::is_same_v<Op, Negative> ||
@@ -95,61 +71,38 @@ void unary_op_gpu_inplace(
  if (in.size() == 0) {
    return;
  }
-  bool contig = in.flags().contiguous;
-  bool large;
-  if (!contig) {
-    large = in.data_size() > INT32_MAX || out.size() > INT32_MAX;
-  } else {
-    large = in.data_size() > UINT32_MAX;
-  }

  auto& encoder = cu::get_command_encoder(s);
  encoder.set_input_array(in);
  encoder.set_output_array(out);
-  dispatch_all_types(in.dtype(), [&](auto in_type_tag) {
-    dispatch_all_types(out.dtype(), [&](auto out_type_tag) {
-      using CTYPE_IN = MLX_GET_TYPE(in_type_tag);
-      using CTYPE_OUT = MLX_GET_TYPE(out_type_tag);
-      if constexpr (cu::supports_unary_op<Op, CTYPE_IN, CTYPE_OUT>()) {
-        dispatch_bool(large, [&](auto large) {
-          using IdxT = std::conditional_t<large(), int64_t, int32_t>;
+  encoder.launch_kernel([&](cudaStream_t stream) {
+    dispatch_all_types(in.dtype(), [&](auto in_type_tag) {
+      dispatch_all_types(out.dtype(), [&](auto out_type_tag) {
+        using CTYPE_IN = MLX_GET_TYPE(in_type_tag);
+        using CTYPE_OUT = MLX_GET_TYPE(out_type_tag);
+        if constexpr (cu::supports_unary_op<Op, CTYPE_IN, CTYPE_OUT>()) {
          using InType = cuda_type_t<CTYPE_IN>;
          using OutType = cuda_type_t<CTYPE_OUT>;
-          using IdxT = std::conditional_t<large(), int64_t, int32_t>;
-          if (contig) {
-            auto kernel = cu::unary_v<Op, InType, OutType, IdxT>;
-            auto [num_blocks, block_dims] = get_launch_args(
-                kernel, out.data_size(), out.shape(), out.strides(), large);
-            encoder.add_kernel_node(
-                kernel,
-                num_blocks,
-                block_dims,
-                in.data<InType>(),
-                out.data<OutType>(),
-                out.data_size());
+          auto policy = cu::thrust_policy(stream);
+          auto in_ptr = thrust::device_pointer_cast(in.data<InType>());
+          auto out_ptr = thrust::device_pointer_cast(out.data<OutType>());
+          if (in.flags().contiguous) {
+            thrust::transform(
+                policy, in_ptr, in_ptr + in.data_size(), out_ptr, Op());
          } else {
            auto [shape, strides] = collapse_contiguous_dims(in);
-            auto kernel = cu::unary_g<Op, InType, OutType, IdxT>;
-            auto [num_blocks, block_dims] = get_launch_args(kernel, out, large);
-            encoder.add_kernel_node(
-                kernel,
-                num_blocks,
-                block_dims,
-                in.data<InType>(),
-                out.data<OutType>(),
-                out.data_size(),
-                const_param(shape),
-                const_param(strides),
-                shape.size());
+            auto [in_begin, in_end] = cu::make_general_iterators<int64_t>(
+                in_ptr, in.size(), shape, strides);
+            thrust::transform(policy, in_begin, in_end, out_ptr, Op());
          }
-        });
-      } else {
-        throw std::runtime_error(fmt::format(
-            "Can not do unary op {} on input of {} with output of {}.",
-            op,
-            dtype_to_string(in.dtype()),
-            dtype_to_string(out.dtype())));
-      }
+        } else {
+          throw std::runtime_error(fmt::format(
+              "Can not do unary op {} on input of {} with output of {}.",
+              op,
+              dtype_to_string(in.dtype()),
+              dtype_to_string(out.dtype())));
+        }
+      });
    });
  });
 }
--- a/mlx/backend/cuda/utils.cpp
+++ b/mlx/backend/cuda/utils.cpp
@@ -24,14 +24,6 @@ void check_cuda_error(const char* name, cudaError_t err) {
  }
 }

-void check_cuda_error(const char* name, CUresult err) {
-  if (err != CUDA_SUCCESS) {
-    const char* err_str = "Unknown error";
-    cuGetErrorString(err, &err_str);
-    throw std::runtime_error(fmt::format("{} failed: {}", name, err_str));
-  }
-}
-
 const char* dtype_to_cuda_type(const Dtype& dtype) {
  switch (dtype) {
    case bool_:
--- a/mlx/backend/cuda/utils.h
+++ b/mlx/backend/cuda/utils.h
@@ -4,7 +4,6 @@

 #pragma once

-#include <cuda.h>
 #include <cuda_runtime.h>

 namespace mlx::core {
@@ -34,7 +33,6 @@ class CudaStream {

 // Throw exception if the cuda API does not succeed.
 void check_cuda_error(const char* name, cudaError_t err);
-void check_cuda_error(const char* name, CUresult err);

 // The macro version that prints the command that failed.
 #define CHECK_CUDA_ERROR(cmd) check_cuda_error(#cmd, (cmd))
--- a/mlx/backend/metal/indexing.cpp
+++ b/mlx/backend/metal/indexing.cpp
@@ -1,5 +1,6 @@
 // Copyright © 2023-2024 Apple Inc.
 #include <fmt/format.h>
+#include <iostream>

 #include "mlx/backend/common/compiled.h"
 #include "mlx/backend/common/utils.h"
--- a/mlx/backend/metal/kernels/layer_norm.metal
+++ b/mlx/backend/metal/kernels/layer_norm.metal
@@ -31,7 +31,6 @@ inline void threadgroup_sum(
  for (int i = 0; i < N; i++) {
    x[i] = simd_sum(x[i]);
  }
-  threadgroup_barrier(mem_flags::mem_threadgroup);
  if (simd_lane_id == 0) {
    for (int i = 0; i < N; i++) {
      xs[N * simd_group_id + i] = x[i];
--- a/mlx/linalg.cpp
+++ b/mlx/linalg.cpp
@@ -688,7 +688,7 @@ array solve(const array& a, const array& b, StreamOrDevice s /* = {} */) {
    perm = expand_dims(perm, -1, s);
    take_axis -= 1;
  }
-  auto pb = take_along_axis(b, perm, take_axis, s);
+  auto pb = take_along_axis(b, perm, take_axis);
  auto y = solve_triangular(luf[1], pb, /* upper = */ false, s);
  return solve_triangular(luf[2], y, /* upper = */ true, s);
 }
--- a/mlx/ops.cpp
+++ b/mlx/ops.cpp
@@ -4676,11 +4676,6 @@ array segmented_mm(
    throw std::invalid_argument(msg.str());
  }

-  if (!issubdtype(segments.dtype(), integer)) {
-    throw std::invalid_argument(
-        "[segmented_mm] Got segments with invalid dtype. Segments must be integral.");
-  }
-
  a = astype(a, out_type, s);
  b = astype(b, out_type, s);
  segments = astype(segments, uint32, s);
--- a/mlx/primitives.cpp
+++ b/mlx/primitives.cpp
@@ -3233,9 +3233,8 @@ std::vector<array> QuantizedMatmul::vjp(
          "[QuantizedMatmul::vjp] no gradient wrt the quantized weights.");
    } else {
      if (!dsb) {
-        int ndim = primals[1].ndim();
-        auto fc = flatten(cotangents[0], 0, -ndim, stream());
-        auto fx = flatten(primals[0], 0, -ndim, stream());
+        auto fc = flatten(cotangents[0], 0, -2, stream());
+        auto fx = flatten(primals[0], 0, -2, stream());
        auto dw = transpose_
            ? matmul(swapaxes(fc, -1, -2, stream()), fx, stream())
            : matmul(swapaxes(fx, -1, -2, stream()), fc, stream());
@@ -3389,16 +3388,12 @@ std::vector<array> GatherQMM::vjp(
        vjps.push_back(
            sum(multiply(
                    *dsb,
-                    unflatten(
-                        dequantize(
-                            w,
-                            ones_like(scales, stream()),
-                            zeros_like(biases, stream()),
-                            group_size_,
-                            bits_,
-                            stream()),
-                        -1,
-                        {-1, group_size_},
+                    dequantize(
+                        w,
+                        ones_like(scales, stream()),
+                        zeros_like(biases, stream()),
+                        group_size_,
+                        bits_,
                        stream()),
                    stream()),
                -1,
--- a/mlx/version.h
+++ b/mlx/version.h
@@ -4,7 +4,7 @@

 #define MLX_VERSION_MAJOR 0
 #define MLX_VERSION_MINOR 26
-#define MLX_VERSION_PATCH 3
+#define MLX_VERSION_PATCH 2
 #define MLX_VERSION_NUMERIC \
  (100000 * MLX_VERSION_MAJOR + 1000 * MLX_VERSION_MINOR + MLX_VERSION_PATCH)

--- a/python/mlx/extension.py
+++ b/python/mlx/extension.py
@@ -53,7 +53,11 @@ class CMakeBuild(build_ext):
        # Set CMAKE_BUILD_PARALLEL_LEVEL to control the parallel build level
        # across all generators.
        if "CMAKE_BUILD_PARALLEL_LEVEL" not in os.environ:
-            build_args += [f"-j{os.cpu_count()}"]
+            # self.parallel is a Python 3 only way to set parallel jobs by hand
+            # using -j in the build_ext call, not supported by pip or PyPA-build.
+            if hasattr(self, "parallel") and self.parallel:
+                # CMake 3.12+ only.
+                build_args += [f"-j{self.parallel}"]

        build_temp = Path(self.build_temp) / ext.name
        if not build_temp.exists():
--- a/python/src/fast.cpp
+++ b/python/src/fast.cpp
@@ -175,12 +175,11 @@ void init_fast(nb::module_& parent_module) {
        * `Grouped Query Attention <https://arxiv.org/abs/2305.13245>`_
        * `Multi-Query Attention <https://arxiv.org/abs/1911.02150>`_

-        .. note::
+        Note: The softmax operation is performed in ``float32`` regardless of
+        the input precision.

-          * The softmax operation is performed in ``float32`` regardless of
-            the input precision.
-          * For Grouped Query Attention and Multi-Query Attention, the ``k``
-            and ``v`` inputs should not be pre-tiled to match ``q``.
+        Note: For Grouped Query Attention and Multi-Query Attention, the ``k``
+        and ``v`` inputs should not be pre-tiled to match ``q``.

        In the following the dimensions are given by:

@@ -196,30 +195,13 @@ void init_fast(nb::module_& parent_module) {
            k (array): Keys with shape ``[B, N_kv, T_kv, D]``.
            v (array): Values with shape ``[B, N_kv, T_kv, D]``.
            scale (float): Scale for queries (typically ``1.0 / sqrt(q.shape(-1)``)
-            mask (Union[None, str, array], optional): The mask to apply to the
-               query-key scores. The mask can be an array or a string indicating
-               the mask type. The only supported string type is ``"causal"``. If
-               the mask is an array it can be a boolean or additive mask. The mask
-               can have at most 4 dimensions and must be broadcast-compatible with
-               the shape ``[B, N, T_q, T_kv]``. If an additive mask is given its
-               type must promote to the promoted type of ``q``, ``k``, and ``v``.
+            mask (Union[None, str, array], optional): A causal, boolean or additive
+               mask to apply to the query-key scores. The mask can have at most 4
+               dimensions and must be broadcast-compatible with the shape
+               ``[B, N, T_q, T_kv]``. If an additive mask is given its type must
+               promote to the promoted type of ``q``, ``k``, and ``v``.
        Returns:
            array: The output array.
-
-        Example:
-
-          .. code-block:: python
-
-            B = 2
-            N_q = N_kv = 32
-            T_q = T_kv = 1000
-            D = 128
-
-            q = mx.random.normal(shape=(B, N_q, T_q, D))
-            k = mx.random.normal(shape=(B, N_kv, T_kv, D))
-            v = mx.random.normal(shape=(B, N_kv, T_kv, D))
-            scale = D ** -0.5
-            out = mx.fast.scaled_dot_product_attention(q, k, v, scale=scale, mask="causal")
      )pbdoc");

  m.def(
--- a/python/tests/cuda_skip.py
+++ b/python/tests/cuda_skip.py
@@ -79,7 +79,6 @@ cuda_skip = {
    "TestQuantized.test_gather_matmul_grad",
    "TestQuantized.test_gather_qmm",
    "TestQuantized.test_gather_qmm_sorted",
-    "TestQuantized.test_gather_qmm_grad",
    "TestQuantized.test_non_multiples",
    "TestQuantized.test_qmm",
    "TestQuantized.test_qmm_jvp",
--- a/python/tests/test_blas.py
+++ b/python/tests/test_blas.py
@@ -1247,7 +1247,7 @@ class TestBlas(mlx_tests.MLXTestCase):

        a = mx.ones((10, 1000))
        s = mx.random.randint(0, 16, shape=(1000,))
-        s = mx.zeros(16, dtype=s.dtype).at[s].add(1)
+        s = mx.zeros(16).at[s].add(1)
        s = mx.sort(s)
        s = mx.cumsum(s)
        s = mx.concatenate([mx.array([0]), s])
--- a/python/tests/test_load.py
+++ b/python/tests/test_load.py
@@ -391,11 +391,9 @@ class TestLoad(mlx_tests.MLXTestCase):
        scale = mx.array(2.0)
        y = mx.load(save_file)
        mx.eval(y)
-        mx.synchronize()
        load_only = mx.get_peak_memory()
        y = mx.load(save_file) * scale
        mx.eval(y)
-        mx.synchronize()
        load_with_binary = mx.get_peak_memory()

        self.assertEqual(load_only, load_with_binary)
--- a/python/tests/test_quantized.py
+++ b/python/tests/test_quantized.py
@@ -549,49 +549,6 @@ class TestQuantized(mlx_tests.MLXTestCase):
            self.assertTrue(mx.allclose(y1, y3, atol=1e-5))
            self.assertTrue(mx.allclose(y1, y4, atol=1e-5))

-    def test_gather_qmm_grad(self):
-        def gather_qmm_ref(x, w, s, b, lhs, rhs, trans, sort):
-            if lhs is not None:
-                x = x[lhs]
-            if rhs is not None:
-                w = w[rhs]
-                s = s[rhs]
-                b = b[rhs]
-            return mx.quantized_matmul(x, w, s, b, transpose=trans)
-
-        def gather_qmm(x, w, s, b, lhs, rhs, trans, sort):
-            return mx.gather_qmm(
-                x,
-                w,
-                s,
-                b,
-                transpose=trans,
-                lhs_indices=lhs,
-                rhs_indices=rhs,
-                sorted_indices=sort,
-            )
-
-        x = mx.random.normal((16, 1, 256))
-        w, s, b = mx.quantize(mx.random.normal((4, 256, 256)))
-        indices = mx.sort(mx.random.randint(0, 4, shape=(16,)))
-        cotan = mx.random.normal((16, 1, 256))
-
-        (o1,), (dx1, ds1, db1) = mx.vjp(
-            lambda x, s, b: gather_qmm_ref(x, w, s, b, None, indices, True, True),
-            [x, s, b],
-            [cotan],
-        )
-        (o2,), (dx2, ds2, db2) = mx.vjp(
-            lambda x, s, b: gather_qmm(x, w, s, b, None, indices, True, True),
-            [x, s, b],
-            [cotan],
-        )
-
-        self.assertTrue(mx.allclose(o1, o2, atol=1e-4))
-        self.assertTrue(mx.allclose(dx1, dx2, atol=1e-4))
-        self.assertTrue(mx.allclose(ds1, ds2, atol=1e-3))
-        self.assertTrue(mx.allclose(db1, db2, atol=1e-3))
-
    def test_vjp_scales_biases(self):
        mx.random.seed(0)
        x = mx.random.normal(shape=(2, 2, 512))
--- a/setup.py
+++ b/setup.py
@@ -97,7 +97,11 @@ class CMakeBuild(build_ext):
        # Set CMAKE_BUILD_PARALLEL_LEVEL to control the parallel build level
        # across all generators.
        if "CMAKE_BUILD_PARALLEL_LEVEL" not in os.environ:
-            build_args += [f"-j{os.cpu_count()}"]
+            # self.parallel is a Python 3 only way to set parallel jobs by hand
+            # using -j in the build_ext call, not supported by pip or PyPA-build.
+            if hasattr(self, "parallel") and self.parallel:
+                # CMake 3.12+ only.
+                build_args += [f"-j{self.parallel}"]

        build_temp = Path(self.build_temp) / ext.name
        if not build_temp.exists():
Author	SHA1	Message	Date
Angelos Katharopoulos	3d4174cd37	Add gradient for the scales and biases in gather qmm	2025-07-05 00:58:17 -07:00
Angelos Katharopoulos	bda1534a44	Improve the gradient of gather_qmm as well	2025-07-04 20:23:58 -07:00
Angelos Katharopoulos	b28577289e	Disable the test for CUDA	2025-07-04 19:17:45 -07:00
Angelos Katharopoulos	2d0f452aae	Fix the test and cpu edge case	2025-07-04 18:36:20 -07:00
Angelos Katharopoulos	bd0622c4d9	Address floating point exception on linux blas	2025-07-04 13:16:54 -07:00
Angelos Katharopoulos	22f9b8a6fc	More JIT fixes	2025-07-03 17:33:25 -07:00
Angelos Katharopoulos	aaf44d915f	Add no cpu and no gpu implementations where needed	2025-07-03 17:03:35 -07:00
Angelos Katharopoulos	ed1a81210d	Fix the JIT	2025-07-03 15:57:43 -07:00
Angelos Katharopoulos	f2994d5b29	Fix scatter_axis on single axis arrays	2025-07-03 14:50:23 -07:00
Angelos Katharopoulos	06a2e74eb2	Fix metal jit	2025-07-03 14:09:05 -07:00
Angelos Katharopoulos	d96a33c776	Add rudimentary test for gather_mm with sorted indices	2025-07-03 14:02:33 -07:00
Angelos Katharopoulos	4babc035a3	Add a test for segmented_mm	2025-07-03 13:49:46 -07:00
Angelos Katharopoulos	a8d7b74984	Simplify the jacobian as well	2025-07-02 18:23:48 -07:00
Angelos Katharopoulos	a29fa053c6	Use segmented_mm to calculate the MoE gradient	2025-07-02 16:24:21 -07:00
Angelos Katharopoulos	8f771efb82	Add the metal version of segmented mm	2025-07-02 15:45:09 -07:00
Angelos Katharopoulos	3104c3eb14	Change the segments to be more general	2025-07-02 15:45:09 -07:00
Angelos Katharopoulos	6020ad6363	Start the segmented_mm op and CPU primitive	2025-07-02 15:45:09 -07:00