mirror of
https://github.com/ml-explore/mlx.git
synced 2025-12-16 01:49:05 +08:00
Compare commits
7 Commits
v0.28.0
...
6441c21a94
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
6441c21a94 | ||
|
|
dfb5022eab | ||
|
|
ac207ce7aa | ||
|
|
fce53b61d6 | ||
|
|
8ae4a76308 | ||
|
|
7fde1b6a1e | ||
|
|
aa7b47481a |
@@ -1,4 +1,5 @@
|
||||
sphinx
|
||||
breathe
|
||||
sphinx-book-theme
|
||||
sphinx-copybutton
|
||||
mlx
|
||||
|
||||
@@ -18,6 +18,7 @@ release = version
|
||||
# -- General configuration ---------------------------------------------------
|
||||
|
||||
extensions = [
|
||||
"sphinx_copybutton",
|
||||
"sphinx.ext.autodoc",
|
||||
"sphinx.ext.autosummary",
|
||||
"sphinx.ext.intersphinx",
|
||||
|
||||
@@ -491,19 +491,27 @@ void Reduce::eval_cpu(const std::vector<array>& inputs, array& out) {
|
||||
switch (in.dtype()) {
|
||||
case bool_:
|
||||
case uint8:
|
||||
reduce_dispatch_sum_prod<uint8_t>(in, out, reduce_type_, axes_);
|
||||
break;
|
||||
case uint16:
|
||||
reduce_dispatch_sum_prod<uint16_t>(in, out, reduce_type_, axes_);
|
||||
break;
|
||||
case uint32:
|
||||
reduce_dispatch_sum_prod<uint32_t>(in, out, reduce_type_, axes_);
|
||||
break;
|
||||
case uint64:
|
||||
reduce_dispatch_sum_prod<uint64_t>(in, out, reduce_type_, axes_);
|
||||
break;
|
||||
case int8:
|
||||
reduce_dispatch_sum_prod<int8_t>(in, out, reduce_type_, axes_);
|
||||
break;
|
||||
case int16:
|
||||
case uint16:
|
||||
reduce_dispatch_sum_prod<int16_t>(in, out, reduce_type_, axes_);
|
||||
break;
|
||||
case int32:
|
||||
case uint32:
|
||||
reduce_dispatch_sum_prod<int32_t>(in, out, reduce_type_, axes_);
|
||||
break;
|
||||
case int64:
|
||||
case uint64:
|
||||
reduce_dispatch_sum_prod<int64_t>(in, out, reduce_type_, axes_);
|
||||
break;
|
||||
case float16:
|
||||
|
||||
@@ -8,7 +8,6 @@ target_sources(
|
||||
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/allocator.cpp
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/arange.cu
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/arg_reduce.cu
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/binary.cu
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/binary_two.cu
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/compiled.cpp
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/copy.cu
|
||||
@@ -45,18 +44,20 @@ target_sources(
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/softmax.cu
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/sort.cu
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/ternary.cu
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/unary.cu
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/utils.cpp
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/quantized/affine_quantize.cu
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/quantized/quantized.cpp
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/worker.cpp)
|
||||
|
||||
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/binary)
|
||||
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/unary)
|
||||
|
||||
if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.9.0)
|
||||
target_sources(
|
||||
mlx PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/gemms/cublas_batched_gemm_12_9.cu)
|
||||
mlx PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/gemms/cublas_gemm_batched_12_9.cu)
|
||||
else()
|
||||
target_sources(
|
||||
mlx PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/gemms/cublas_batched_gemm_12_0.cpp)
|
||||
mlx PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/gemms/cublas_gemm_batched_12_0.cpp)
|
||||
endif()
|
||||
|
||||
target_compile_definitions(mlx PRIVATE MLX_USE_CUDA)
|
||||
|
||||
21
mlx/backend/cuda/binary/CMakeLists.txt
Normal file
21
mlx/backend/cuda/binary/CMakeLists.txt
Normal file
@@ -0,0 +1,21 @@
|
||||
target_sources(
|
||||
mlx
|
||||
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/add.cu
|
||||
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/arctan2.cu
|
||||
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/bitwise_binary.cu
|
||||
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/divide.cu
|
||||
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/equal.cu
|
||||
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/greater.cu
|
||||
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/greater_equal.cu
|
||||
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/less.cu
|
||||
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/less_equal.cu
|
||||
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/logical_and.cu
|
||||
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/logical_or.cu
|
||||
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/log_add_exp.cu
|
||||
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/minimum.cu
|
||||
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/maximum.cu
|
||||
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/multiply.cu
|
||||
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/power.cu
|
||||
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/remainder.cu
|
||||
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/not_equal.cu
|
||||
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/subtract.cu)
|
||||
7
mlx/backend/cuda/binary/add.cu
Normal file
7
mlx/backend/cuda/binary/add.cu
Normal file
@@ -0,0 +1,7 @@
|
||||
// Copyright © 2025 Apple Inc.
|
||||
|
||||
#include "mlx/backend/cuda/binary/binary.cuh"
|
||||
|
||||
namespace mlx::core {
|
||||
BINARY_GPU(Add)
|
||||
} // namespace mlx::core
|
||||
7
mlx/backend/cuda/binary/arctan2.cu
Normal file
7
mlx/backend/cuda/binary/arctan2.cu
Normal file
@@ -0,0 +1,7 @@
|
||||
// Copyright © 2025 Apple Inc.
|
||||
|
||||
#include "mlx/backend/cuda/binary/binary.cuh"
|
||||
|
||||
namespace mlx::core {
|
||||
BINARY_GPU(ArcTan2)
|
||||
} // namespace mlx::core
|
||||
@@ -99,39 +99,89 @@ __global__ void binary_vv(const In* a, const In* b, Out* out, IdxT size) {
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Op, typename In, typename Out, typename IdxT, int NDIM>
|
||||
template <
|
||||
typename Op,
|
||||
typename In,
|
||||
typename Out,
|
||||
typename IdxT,
|
||||
int NDIM,
|
||||
int N_READS>
|
||||
__global__ void binary_g_nd(
|
||||
const In* a,
|
||||
const In* b,
|
||||
Out* out,
|
||||
IdxT size,
|
||||
IdxT size_rest,
|
||||
const __grid_constant__ cuda::std::array<int32_t, NDIM> shape,
|
||||
const __grid_constant__ cuda::std::array<int64_t, NDIM> a_strides,
|
||||
const __grid_constant__ cuda::std::array<int64_t, NDIM> b_strides) {
|
||||
IdxT index = cg::this_grid().thread_rank();
|
||||
if (index < size) {
|
||||
auto [a_idx, b_idx] = elem_to_loc_nd<NDIM>(
|
||||
index, shape.data(), a_strides.data(), b_strides.data());
|
||||
out[index] = Op{}(a[a_idx], b[b_idx]);
|
||||
auto block = cg::this_thread_block();
|
||||
auto grid = cg::this_grid();
|
||||
IdxT index_rest =
|
||||
grid.block_index().y * block.dim_threads().y + block.thread_index().y;
|
||||
if (index_rest >= size_rest) {
|
||||
return;
|
||||
}
|
||||
|
||||
auto shape_x = shape[NDIM - 1];
|
||||
auto a_stride_x = a_strides[NDIM - 1];
|
||||
auto b_stride_x = b_strides[NDIM - 1];
|
||||
IdxT index_x =
|
||||
grid.block_index().x * block.dim_threads().x + block.thread_index().x;
|
||||
auto [a_idx, b_idx] = elem_to_loc_nd<NDIM>(
|
||||
index_rest * shape_x, shape.data(), a_strides.data(), b_strides.data());
|
||||
auto a_vec =
|
||||
load_vector<N_READS>(a + a_idx, index_x, shape_x, a_stride_x, In(0));
|
||||
auto b_vec =
|
||||
load_vector<N_READS>(b + b_idx, index_x, shape_x, b_stride_x, In(0));
|
||||
|
||||
AlignedVector<Out, N_READS> out_vec;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < N_READS; ++i) {
|
||||
out_vec[i] = Op{}(a_vec[i], b_vec[i]);
|
||||
}
|
||||
store_vector(out + shape_x * index_rest, index_x, out_vec, shape_x);
|
||||
}
|
||||
|
||||
template <typename Op, typename In, typename Out, typename IdxT>
|
||||
template <typename Op, typename In, typename Out, typename IdxT, int N_READS>
|
||||
__global__ void binary_g(
|
||||
const In* a,
|
||||
const In* b,
|
||||
Out* out,
|
||||
IdxT size,
|
||||
IdxT size_rest,
|
||||
const __grid_constant__ Shape shape,
|
||||
const __grid_constant__ Strides a_strides,
|
||||
const __grid_constant__ Strides b_strides,
|
||||
int ndim) {
|
||||
IdxT index = cg::this_grid().thread_rank();
|
||||
if (index < size) {
|
||||
auto [a_idx, b_idx] = elem_to_loc(
|
||||
index, shape.data(), a_strides.data(), b_strides.data(), ndim);
|
||||
out[index] = Op{}(a[a_idx], b[b_idx]);
|
||||
auto block = cg::this_thread_block();
|
||||
auto grid = cg::this_grid();
|
||||
IdxT index_rest =
|
||||
grid.block_index().y * block.dim_threads().y + block.thread_index().y;
|
||||
if (index_rest >= size_rest) {
|
||||
return;
|
||||
}
|
||||
|
||||
auto shape_x = shape[ndim - 1];
|
||||
auto a_stride_x = a_strides[ndim - 1];
|
||||
auto b_stride_x = b_strides[ndim - 1];
|
||||
IdxT index_x =
|
||||
grid.block_index().x * block.dim_threads().x + block.thread_index().x;
|
||||
auto [a_idx, b_idx] = elem_to_loc(
|
||||
index_rest * shape_x,
|
||||
shape.data(),
|
||||
a_strides.data(),
|
||||
b_strides.data(),
|
||||
ndim);
|
||||
auto a_vec =
|
||||
load_vector<N_READS>(a + a_idx, index_x, shape_x, a_stride_x, In(0));
|
||||
auto b_vec =
|
||||
load_vector<N_READS>(b + b_idx, index_x, shape_x, b_stride_x, In(0));
|
||||
|
||||
AlignedVector<Out, N_READS> out_vec;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < N_READS; ++i) {
|
||||
out_vec[i] = Op{}(a_vec[i], b_vec[i]);
|
||||
}
|
||||
store_vector(out + shape_x * index_rest, index_x, out_vec, shape_x);
|
||||
}
|
||||
|
||||
template <typename Op, typename In, typename Out>
|
||||
@@ -209,39 +259,61 @@ void binary_op_gpu_inplace(
|
||||
auto& a_strides = strides[0];
|
||||
auto& b_strides = strides[1];
|
||||
int ndim = shape.size();
|
||||
int work_per_thread = 1;
|
||||
auto dim0 = ndim > 0 ? shape.back() : 1;
|
||||
auto rest = out.size() / dim0;
|
||||
if (dim0 >= 4) {
|
||||
work_per_thread = 4;
|
||||
}
|
||||
dim0 = (dim0 + work_per_thread - 1) / work_per_thread;
|
||||
auto block_dims = get_block_dims(dim0, rest, 1);
|
||||
uint32_t num_blocks_x = cuda::ceil_div(dim0, block_dims.x);
|
||||
uint32_t num_blocks_y = cuda::ceil_div(rest, block_dims.y);
|
||||
if (ndim <= 3) {
|
||||
dispatch_1_2_3(ndim, [&](auto dims_constant) {
|
||||
auto [num_blocks, block_dims] =
|
||||
get_launch_args(out, large());
|
||||
auto kernel = cu::binary_g_nd<
|
||||
Op,
|
||||
InType,
|
||||
OutType,
|
||||
IdxT,
|
||||
dims_constant(),
|
||||
1>;
|
||||
if (work_per_thread == 4) {
|
||||
kernel = cu::binary_g_nd<
|
||||
Op,
|
||||
InType,
|
||||
OutType,
|
||||
IdxT,
|
||||
dims_constant(),
|
||||
4>;
|
||||
}
|
||||
encoder.add_kernel_node(
|
||||
cu::binary_g_nd<
|
||||
Op,
|
||||
InType,
|
||||
OutType,
|
||||
IdxT,
|
||||
dims_constant()>,
|
||||
num_blocks,
|
||||
kernel,
|
||||
{num_blocks_x, num_blocks_y},
|
||||
block_dims,
|
||||
0,
|
||||
a.data<InType>(),
|
||||
b.data<InType>(),
|
||||
out.data<OutType>(),
|
||||
out.size(),
|
||||
rest,
|
||||
const_param<dims_constant()>(shape),
|
||||
const_param<dims_constant()>(a_strides),
|
||||
const_param<dims_constant()>(b_strides));
|
||||
});
|
||||
} else {
|
||||
auto [num_blocks, block_dims] = get_launch_args(out, large());
|
||||
auto kernel = cu::binary_g<Op, InType, OutType, IdxT, 1>;
|
||||
if (work_per_thread == 4) {
|
||||
kernel = cu::binary_g<Op, InType, OutType, IdxT, 4>;
|
||||
}
|
||||
encoder.add_kernel_node(
|
||||
cu::binary_g<Op, InType, OutType, IdxT>,
|
||||
num_blocks,
|
||||
kernel,
|
||||
{num_blocks_x, num_blocks_y},
|
||||
block_dims,
|
||||
0,
|
||||
a.data<InType>(),
|
||||
b.data<InType>(),
|
||||
out.data<OutType>(),
|
||||
out.size(),
|
||||
rest,
|
||||
const_param(shape),
|
||||
const_param(a_strides),
|
||||
const_param(b_strides),
|
||||
@@ -304,54 +376,4 @@ void binary_op_gpu(
|
||||
binary_op_gpu<cu::func>(inputs, out, name(), s); \
|
||||
}
|
||||
|
||||
BINARY_GPU(Add)
|
||||
BINARY_GPU(ArcTan2)
|
||||
BINARY_GPU(Divide)
|
||||
BINARY_GPU(Remainder)
|
||||
BINARY_GPU(Greater)
|
||||
BINARY_GPU(GreaterEqual)
|
||||
BINARY_GPU(Less)
|
||||
BINARY_GPU(LessEqual)
|
||||
BINARY_GPU(LogicalAnd)
|
||||
BINARY_GPU(LogicalOr)
|
||||
BINARY_GPU(LogAddExp)
|
||||
BINARY_GPU(Maximum)
|
||||
BINARY_GPU(Minimum)
|
||||
BINARY_GPU(Multiply)
|
||||
BINARY_GPU(NotEqual)
|
||||
BINARY_GPU(Power)
|
||||
BINARY_GPU(Subtract)
|
||||
|
||||
void Equal::eval_gpu(const std::vector<array>& inputs, array& out) {
|
||||
nvtx3::scoped_range r("Equal::eval_gpu");
|
||||
auto& s = out.primitive().stream();
|
||||
if (equal_nan_) {
|
||||
binary_op_gpu<cu::NaNEqual>(inputs, out, name(), s);
|
||||
} else {
|
||||
binary_op_gpu<cu::Equal>(inputs, out, name(), s);
|
||||
}
|
||||
}
|
||||
|
||||
void BitwiseBinary::eval_gpu(const std::vector<array>& inputs, array& out) {
|
||||
nvtx3::scoped_range r("BitwiseBinary::eval_gpu");
|
||||
auto& s = out.primitive().stream();
|
||||
switch (op_) {
|
||||
case BitwiseBinary::And:
|
||||
binary_op_gpu<cu::BitwiseAnd>(inputs, out, name(), s);
|
||||
break;
|
||||
case BitwiseBinary::Or:
|
||||
binary_op_gpu<cu::BitwiseOr>(inputs, out, name(), s);
|
||||
break;
|
||||
case BitwiseBinary::Xor:
|
||||
binary_op_gpu<cu::BitwiseXor>(inputs, out, name(), s);
|
||||
break;
|
||||
case BitwiseBinary::LeftShift:
|
||||
binary_op_gpu<cu::LeftShift>(inputs, out, name(), s);
|
||||
break;
|
||||
case BitwiseBinary::RightShift:
|
||||
binary_op_gpu<cu::RightShift>(inputs, out, name(), s);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace mlx::core
|
||||
27
mlx/backend/cuda/binary/bitwise_binary.cu
Normal file
27
mlx/backend/cuda/binary/bitwise_binary.cu
Normal file
@@ -0,0 +1,27 @@
|
||||
// Copyright © 2025 Apple Inc.
|
||||
|
||||
#include "mlx/backend/cuda/binary/binary.cuh"
|
||||
|
||||
namespace mlx::core {
|
||||
void BitwiseBinary::eval_gpu(const std::vector<array>& inputs, array& out) {
|
||||
nvtx3::scoped_range r("BitwiseBinary::eval_gpu");
|
||||
auto& s = out.primitive().stream();
|
||||
switch (op_) {
|
||||
case BitwiseBinary::And:
|
||||
binary_op_gpu<cu::BitwiseAnd>(inputs, out, name(), s);
|
||||
break;
|
||||
case BitwiseBinary::Or:
|
||||
binary_op_gpu<cu::BitwiseOr>(inputs, out, name(), s);
|
||||
break;
|
||||
case BitwiseBinary::Xor:
|
||||
binary_op_gpu<cu::BitwiseXor>(inputs, out, name(), s);
|
||||
break;
|
||||
case BitwiseBinary::LeftShift:
|
||||
binary_op_gpu<cu::LeftShift>(inputs, out, name(), s);
|
||||
break;
|
||||
case BitwiseBinary::RightShift:
|
||||
binary_op_gpu<cu::RightShift>(inputs, out, name(), s);
|
||||
break;
|
||||
}
|
||||
}
|
||||
} // namespace mlx::core
|
||||
7
mlx/backend/cuda/binary/divide.cu
Normal file
7
mlx/backend/cuda/binary/divide.cu
Normal file
@@ -0,0 +1,7 @@
|
||||
// Copyright © 2025 Apple Inc.
|
||||
|
||||
#include "mlx/backend/cuda/binary/binary.cuh"
|
||||
|
||||
namespace mlx::core {
|
||||
BINARY_GPU(Divide)
|
||||
} // namespace mlx::core
|
||||
15
mlx/backend/cuda/binary/equal.cu
Normal file
15
mlx/backend/cuda/binary/equal.cu
Normal file
@@ -0,0 +1,15 @@
|
||||
// Copyright © 2025 Apple Inc.
|
||||
|
||||
#include "mlx/backend/cuda/binary/binary.cuh"
|
||||
|
||||
namespace mlx::core {
|
||||
void Equal::eval_gpu(const std::vector<array>& inputs, array& out) {
|
||||
nvtx3::scoped_range r("Equal::eval_gpu");
|
||||
auto& s = out.primitive().stream();
|
||||
if (equal_nan_) {
|
||||
binary_op_gpu<cu::NaNEqual>(inputs, out, name(), s);
|
||||
} else {
|
||||
binary_op_gpu<cu::Equal>(inputs, out, name(), s);
|
||||
}
|
||||
}
|
||||
} // namespace mlx::core
|
||||
7
mlx/backend/cuda/binary/greater.cu
Normal file
7
mlx/backend/cuda/binary/greater.cu
Normal file
@@ -0,0 +1,7 @@
|
||||
// Copyright © 2025 Apple Inc.
|
||||
|
||||
#include "mlx/backend/cuda/binary/binary.cuh"
|
||||
|
||||
namespace mlx::core {
|
||||
BINARY_GPU(Greater)
|
||||
} // namespace mlx::core
|
||||
7
mlx/backend/cuda/binary/greater_equal.cu
Normal file
7
mlx/backend/cuda/binary/greater_equal.cu
Normal file
@@ -0,0 +1,7 @@
|
||||
// Copyright © 2025 Apple Inc.
|
||||
|
||||
#include "mlx/backend/cuda/binary/binary.cuh"
|
||||
|
||||
namespace mlx::core {
|
||||
BINARY_GPU(GreaterEqual)
|
||||
} // namespace mlx::core
|
||||
7
mlx/backend/cuda/binary/less.cu
Normal file
7
mlx/backend/cuda/binary/less.cu
Normal file
@@ -0,0 +1,7 @@
|
||||
// Copyright © 2025 Apple Inc.
|
||||
|
||||
#include "mlx/backend/cuda/binary/binary.cuh"
|
||||
|
||||
namespace mlx::core {
|
||||
BINARY_GPU(Less)
|
||||
} // namespace mlx::core
|
||||
7
mlx/backend/cuda/binary/less_equal.cu
Normal file
7
mlx/backend/cuda/binary/less_equal.cu
Normal file
@@ -0,0 +1,7 @@
|
||||
// Copyright © 2025 Apple Inc.
|
||||
|
||||
#include "mlx/backend/cuda/binary/binary.cuh"
|
||||
|
||||
namespace mlx::core {
|
||||
BINARY_GPU(LessEqual)
|
||||
} // namespace mlx::core
|
||||
7
mlx/backend/cuda/binary/log_add_exp.cu
Normal file
7
mlx/backend/cuda/binary/log_add_exp.cu
Normal file
@@ -0,0 +1,7 @@
|
||||
// Copyright © 2025 Apple Inc.
|
||||
|
||||
#include "mlx/backend/cuda/binary/binary.cuh"
|
||||
|
||||
namespace mlx::core {
|
||||
BINARY_GPU(LogAddExp)
|
||||
} // namespace mlx::core
|
||||
7
mlx/backend/cuda/binary/logical_and.cu
Normal file
7
mlx/backend/cuda/binary/logical_and.cu
Normal file
@@ -0,0 +1,7 @@
|
||||
// Copyright © 2025 Apple Inc.
|
||||
|
||||
#include "mlx/backend/cuda/binary/binary.cuh"
|
||||
|
||||
namespace mlx::core {
|
||||
BINARY_GPU(LogicalAnd)
|
||||
} // namespace mlx::core
|
||||
7
mlx/backend/cuda/binary/logical_or.cu
Normal file
7
mlx/backend/cuda/binary/logical_or.cu
Normal file
@@ -0,0 +1,7 @@
|
||||
// Copyright © 2025 Apple Inc.
|
||||
|
||||
#include "mlx/backend/cuda/binary/binary.cuh"
|
||||
|
||||
namespace mlx::core {
|
||||
BINARY_GPU(LogicalOr)
|
||||
} // namespace mlx::core
|
||||
7
mlx/backend/cuda/binary/maximum.cu
Normal file
7
mlx/backend/cuda/binary/maximum.cu
Normal file
@@ -0,0 +1,7 @@
|
||||
// Copyright © 2025 Apple Inc.
|
||||
|
||||
#include "mlx/backend/cuda/binary/binary.cuh"
|
||||
|
||||
namespace mlx::core {
|
||||
BINARY_GPU(Maximum)
|
||||
} // namespace mlx::core
|
||||
7
mlx/backend/cuda/binary/minimum.cu
Normal file
7
mlx/backend/cuda/binary/minimum.cu
Normal file
@@ -0,0 +1,7 @@
|
||||
// Copyright © 2025 Apple Inc.
|
||||
|
||||
#include "mlx/backend/cuda/binary/binary.cuh"
|
||||
|
||||
namespace mlx::core {
|
||||
BINARY_GPU(Minimum)
|
||||
} // namespace mlx::core
|
||||
7
mlx/backend/cuda/binary/multiply.cu
Normal file
7
mlx/backend/cuda/binary/multiply.cu
Normal file
@@ -0,0 +1,7 @@
|
||||
// Copyright © 2025 Apple Inc.
|
||||
|
||||
#include "mlx/backend/cuda/binary/binary.cuh"
|
||||
|
||||
namespace mlx::core {
|
||||
BINARY_GPU(Multiply)
|
||||
} // namespace mlx::core
|
||||
7
mlx/backend/cuda/binary/not_equal.cu
Normal file
7
mlx/backend/cuda/binary/not_equal.cu
Normal file
@@ -0,0 +1,7 @@
|
||||
// Copyright © 2025 Apple Inc.
|
||||
|
||||
#include "mlx/backend/cuda/binary/binary.cuh"
|
||||
|
||||
namespace mlx::core {
|
||||
BINARY_GPU(NotEqual)
|
||||
} // namespace mlx::core
|
||||
7
mlx/backend/cuda/binary/power.cu
Normal file
7
mlx/backend/cuda/binary/power.cu
Normal file
@@ -0,0 +1,7 @@
|
||||
// Copyright © 2025 Apple Inc.
|
||||
|
||||
#include "mlx/backend/cuda/binary/binary.cuh"
|
||||
|
||||
namespace mlx::core {
|
||||
BINARY_GPU(Power)
|
||||
} // namespace mlx::core
|
||||
7
mlx/backend/cuda/binary/remainder.cu
Normal file
7
mlx/backend/cuda/binary/remainder.cu
Normal file
@@ -0,0 +1,7 @@
|
||||
// Copyright © 2025 Apple Inc.
|
||||
|
||||
#include "mlx/backend/cuda/binary/binary.cuh"
|
||||
|
||||
namespace mlx::core {
|
||||
BINARY_GPU(Remainder)
|
||||
} // namespace mlx::core
|
||||
7
mlx/backend/cuda/binary/subtract.cu
Normal file
7
mlx/backend/cuda/binary/subtract.cu
Normal file
@@ -0,0 +1,7 @@
|
||||
// Copyright © 2025 Apple Inc.
|
||||
|
||||
#include "mlx/backend/cuda/binary/binary.cuh"
|
||||
|
||||
namespace mlx::core {
|
||||
BINARY_GPU(Subtract)
|
||||
} // namespace mlx::core
|
||||
@@ -127,45 +127,99 @@ binary_two_vv(const In* a, const In* b, Out* out_a, Out* out_b, IdxT size) {
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Op, typename In, typename Out, typename IdxT, int NDIM>
|
||||
template <
|
||||
typename Op,
|
||||
typename In,
|
||||
typename Out,
|
||||
typename IdxT,
|
||||
int NDIM,
|
||||
int N_READS>
|
||||
__global__ void binary_two_g_nd(
|
||||
const In* a,
|
||||
const In* b,
|
||||
Out* out_a,
|
||||
Out* out_b,
|
||||
IdxT size,
|
||||
IdxT size_rest,
|
||||
const __grid_constant__ cuda::std::array<int32_t, NDIM> shape,
|
||||
const __grid_constant__ cuda::std::array<int64_t, NDIM> a_strides,
|
||||
const __grid_constant__ cuda::std::array<int64_t, NDIM> b_strides) {
|
||||
IdxT index = cg::this_grid().thread_rank();
|
||||
if (index < size) {
|
||||
auto [a_idx, b_idx] = elem_to_loc_nd<NDIM>(
|
||||
index, shape.data(), a_strides.data(), b_strides.data());
|
||||
auto out = Op{}(a[a_idx], b[b_idx]);
|
||||
out_a[index] = out[0];
|
||||
out_b[index] = out[1];
|
||||
auto block = cg::this_thread_block();
|
||||
auto grid = cg::this_grid();
|
||||
IdxT index_rest =
|
||||
grid.block_index().y * block.dim_threads().y + block.thread_index().y;
|
||||
if (index_rest >= size_rest) {
|
||||
return;
|
||||
}
|
||||
|
||||
auto shape_x = shape[NDIM - 1];
|
||||
auto a_stride_x = a_strides[NDIM - 1];
|
||||
auto b_stride_x = b_strides[NDIM - 1];
|
||||
IdxT index_x =
|
||||
grid.block_index().x * block.dim_threads().x + block.thread_index().x;
|
||||
auto [a_idx, b_idx] = elem_to_loc_nd<NDIM>(
|
||||
index_rest * shape_x, shape.data(), a_strides.data(), b_strides.data());
|
||||
auto a_vec =
|
||||
load_vector<N_READS>(a + a_idx, index_x, shape_x, a_stride_x, In(0));
|
||||
auto b_vec =
|
||||
load_vector<N_READS>(b + b_idx, index_x, shape_x, b_stride_x, In(0));
|
||||
|
||||
AlignedVector<Out, N_READS> out_vec_a;
|
||||
AlignedVector<Out, N_READS> out_vec_b;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < N_READS; ++i) {
|
||||
auto out = Op{}(a_vec[i], b_vec[i]);
|
||||
out_vec_a[i] = out[0];
|
||||
out_vec_b[i] = out[1];
|
||||
}
|
||||
store_vector(out_a + shape_x * index_rest, index_x, out_vec_a, shape_x);
|
||||
store_vector(out_b + shape_x * index_rest, index_x, out_vec_b, shape_x);
|
||||
}
|
||||
|
||||
template <typename Op, typename In, typename Out, typename IdxT>
|
||||
template <typename Op, typename In, typename Out, typename IdxT, int N_READS>
|
||||
__global__ void binary_two_g(
|
||||
const In* a,
|
||||
const In* b,
|
||||
Out* out_a,
|
||||
Out* out_b,
|
||||
IdxT size,
|
||||
IdxT size_rest,
|
||||
const __grid_constant__ Shape shape,
|
||||
const __grid_constant__ Strides a_strides,
|
||||
const __grid_constant__ Strides b_strides,
|
||||
int ndim) {
|
||||
IdxT index = cg::this_grid().thread_rank();
|
||||
if (index < size) {
|
||||
auto [a_idx, b_idx] = elem_to_loc(
|
||||
index, shape.data(), a_strides.data(), b_strides.data(), ndim);
|
||||
auto out = Op{}(a[a_idx], b[b_idx]);
|
||||
out_a[index] = out[0];
|
||||
out_b[index] = out[1];
|
||||
auto block = cg::this_thread_block();
|
||||
auto grid = cg::this_grid();
|
||||
IdxT index_rest =
|
||||
grid.block_index().y * block.dim_threads().y + block.thread_index().y;
|
||||
if (index_rest >= size_rest) {
|
||||
return;
|
||||
}
|
||||
|
||||
auto shape_x = shape[ndim - 1];
|
||||
auto a_stride_x = a_strides[ndim - 1];
|
||||
auto b_stride_x = b_strides[ndim - 1];
|
||||
IdxT index_x =
|
||||
grid.block_index().x * block.dim_threads().x + block.thread_index().x;
|
||||
auto [a_idx, b_idx] = elem_to_loc(
|
||||
index_rest * shape_x,
|
||||
shape.data(),
|
||||
a_strides.data(),
|
||||
b_strides.data(),
|
||||
ndim);
|
||||
auto a_vec =
|
||||
load_vector<N_READS>(a + a_idx, index_x, shape_x, a_stride_x, In(0));
|
||||
auto b_vec =
|
||||
load_vector<N_READS>(b + b_idx, index_x, shape_x, b_stride_x, In(0));
|
||||
|
||||
AlignedVector<Out, N_READS> out_vec_a;
|
||||
AlignedVector<Out, N_READS> out_vec_b;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < N_READS; ++i) {
|
||||
auto out = Op{}(a_vec[i], b_vec[i]);
|
||||
out_vec_a[i] = out[0];
|
||||
out_vec_b[i] = out[1];
|
||||
}
|
||||
store_vector(out_a + shape_x * index_rest, index_x, out_vec_a, shape_x);
|
||||
store_vector(out_b + shape_x * index_rest, index_x, out_vec_b, shape_x);
|
||||
}
|
||||
|
||||
template <typename Op, typename In, typename Out>
|
||||
@@ -225,42 +279,64 @@ void binary_two_op_gpu_inplace(
|
||||
auto& a_strides = strides[0];
|
||||
auto& b_strides = strides[1];
|
||||
int ndim = shape.size();
|
||||
int work_per_thread = 1;
|
||||
auto dim0 = ndim > 0 ? shape.back() : 1;
|
||||
auto rest = out_a.size() / dim0;
|
||||
if (dim0 >= 4) {
|
||||
work_per_thread = 4;
|
||||
}
|
||||
dim0 = (dim0 + work_per_thread - 1) / work_per_thread;
|
||||
auto block_dims = get_block_dims(dim0, rest, 1);
|
||||
uint32_t num_blocks_x = cuda::ceil_div(dim0, block_dims.x);
|
||||
uint32_t num_blocks_y = cuda::ceil_div(rest, block_dims.y);
|
||||
|
||||
if (ndim <= 3) {
|
||||
dispatch_1_2_3(ndim, [&](auto dims_constant) {
|
||||
auto [num_blocks, block_dims] =
|
||||
get_launch_args(out_a, large());
|
||||
auto kernel = cu::binary_two_g_nd<
|
||||
Op,
|
||||
InType,
|
||||
OutType,
|
||||
IdxT,
|
||||
dims_constant(),
|
||||
1>;
|
||||
if (work_per_thread == 4) {
|
||||
kernel = cu::binary_two_g_nd<
|
||||
Op,
|
||||
InType,
|
||||
OutType,
|
||||
IdxT,
|
||||
dims_constant(),
|
||||
4>;
|
||||
}
|
||||
encoder.add_kernel_node(
|
||||
cu::binary_two_g_nd<
|
||||
Op,
|
||||
InType,
|
||||
OutType,
|
||||
IdxT,
|
||||
dims_constant()>,
|
||||
num_blocks,
|
||||
kernel,
|
||||
{num_blocks_x, num_blocks_y},
|
||||
block_dims,
|
||||
0,
|
||||
a.data<InType>(),
|
||||
b.data<InType>(),
|
||||
out_a.data<OutType>(),
|
||||
out_b.data<OutType>(),
|
||||
out_a.size(),
|
||||
rest,
|
||||
const_param<dims_constant()>(shape),
|
||||
const_param<dims_constant()>(a_strides),
|
||||
const_param<dims_constant()>(b_strides));
|
||||
});
|
||||
} else {
|
||||
auto [num_blocks, block_dims] =
|
||||
get_launch_args(out_a, large());
|
||||
auto kernel = cu::binary_two_g<Op, InType, OutType, IdxT, 1>;
|
||||
if (work_per_thread == 4) {
|
||||
kernel = cu::binary_two_g<Op, InType, OutType, IdxT, 4>;
|
||||
}
|
||||
encoder.add_kernel_node(
|
||||
cu::binary_two_g<Op, InType, OutType, IdxT>,
|
||||
num_blocks,
|
||||
kernel,
|
||||
{num_blocks_x, num_blocks_y},
|
||||
block_dims,
|
||||
0,
|
||||
a.data<InType>(),
|
||||
b.data<InType>(),
|
||||
out_a.data<OutType>(),
|
||||
out_b.data<OutType>(),
|
||||
out_a.size(),
|
||||
rest,
|
||||
const_param(shape),
|
||||
const_param(a_strides),
|
||||
const_param(b_strides),
|
||||
|
||||
@@ -10,37 +10,80 @@ namespace cu {
|
||||
|
||||
namespace cg = cooperative_groups;
|
||||
|
||||
template <typename In, typename Out, typename IdxT, int NDIM>
|
||||
template <typename In, typename Out, typename IdxT, int NDIM, int N_READS>
|
||||
__global__ void copy_gg_nd(
|
||||
const In* in,
|
||||
Out* out,
|
||||
IdxT size,
|
||||
IdxT size_rest,
|
||||
const __grid_constant__ cuda::std::array<int32_t, NDIM> shape,
|
||||
const __grid_constant__ cuda::std::array<int64_t, NDIM> strides_in,
|
||||
const __grid_constant__ cuda::std::array<int64_t, NDIM> strides_out) {
|
||||
IdxT index = cg::this_grid().thread_rank();
|
||||
if (index < size) {
|
||||
auto [idx_in, idx_out] = elem_to_loc_nd<NDIM>(
|
||||
index, shape.data(), strides_in.data(), strides_out.data());
|
||||
out[idx_out] = CastOp<In, Out>{}(in[idx_in]);
|
||||
auto block = cg::this_thread_block();
|
||||
auto grid = cg::this_grid();
|
||||
IdxT index_rest =
|
||||
grid.block_index().y * block.dim_threads().y + block.thread_index().y;
|
||||
if (index_rest >= size_rest) {
|
||||
return;
|
||||
}
|
||||
|
||||
auto shape_x = shape[NDIM - 1];
|
||||
auto in_stride_x = strides_in[NDIM - 1];
|
||||
auto out_stride_x = strides_out[NDIM - 1];
|
||||
IdxT index_x =
|
||||
grid.block_index().x * block.dim_threads().x + block.thread_index().x;
|
||||
auto [idx_in, idx_out] = elem_to_loc_nd<NDIM>(
|
||||
index_rest * shape_x,
|
||||
shape.data(),
|
||||
strides_in.data(),
|
||||
strides_out.data());
|
||||
|
||||
auto in_vec =
|
||||
load_vector<N_READS>(in + idx_in, index_x, shape_x, in_stride_x, In(0));
|
||||
AlignedVector<Out, N_READS> out_vec;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < N_READS; ++i) {
|
||||
out_vec[i] = CastOp<In, Out>{}(in_vec[i]);
|
||||
}
|
||||
store_vector(out + idx_out, index_x, out_vec, shape_x, out_stride_x);
|
||||
}
|
||||
|
||||
template <typename In, typename Out, typename IdxT>
|
||||
template <typename In, typename Out, typename IdxT, int N_READS>
|
||||
__global__ void copy_gg(
|
||||
const In* in,
|
||||
Out* out,
|
||||
IdxT size,
|
||||
IdxT size_rest,
|
||||
const __grid_constant__ Shape shape,
|
||||
const __grid_constant__ Strides strides_in,
|
||||
const __grid_constant__ Strides strides_out,
|
||||
int ndim) {
|
||||
IdxT index = cg::this_grid().thread_rank();
|
||||
if (index < size) {
|
||||
auto [idx_in, idx_out] = elem_to_loc(
|
||||
index, shape.data(), strides_in.data(), strides_out.data(), ndim);
|
||||
out[idx_out] = CastOp<In, Out>{}(in[idx_in]);
|
||||
auto block = cg::this_thread_block();
|
||||
auto grid = cg::this_grid();
|
||||
IdxT index_rest =
|
||||
grid.block_index().y * block.dim_threads().y + block.thread_index().y;
|
||||
if (index_rest >= size_rest) {
|
||||
return;
|
||||
}
|
||||
|
||||
auto shape_x = shape[ndim - 1];
|
||||
auto in_stride_x = strides_in[ndim - 1];
|
||||
auto out_stride_x = strides_out[ndim - 1];
|
||||
IdxT index_x =
|
||||
grid.block_index().x * block.dim_threads().x + block.thread_index().x;
|
||||
auto [idx_in, idx_out] = elem_to_loc(
|
||||
index_rest * shape_x,
|
||||
shape.data(),
|
||||
strides_in.data(),
|
||||
strides_out.data(),
|
||||
ndim);
|
||||
|
||||
auto in_vec =
|
||||
load_vector<N_READS>(in + idx_in, index_x, shape_x, in_stride_x, In(0));
|
||||
AlignedVector<Out, N_READS> out_vec;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < N_READS; ++i) {
|
||||
out_vec[i] = CastOp<In, Out>{}(in_vec[i]);
|
||||
}
|
||||
store_vector(out + idx_out, index_x, out_vec, shape_x, out_stride_x);
|
||||
}
|
||||
|
||||
} // namespace cu
|
||||
@@ -69,33 +112,52 @@ void copy_general(
|
||||
size_t data_size = 1;
|
||||
for (auto& s : shape)
|
||||
data_size *= s;
|
||||
|
||||
int work_per_thread = 1;
|
||||
auto dim0 = ndim > 0 ? shape.back() : 1;
|
||||
auto rest = data_size / dim0;
|
||||
if (dim0 >= 4) {
|
||||
work_per_thread = 4;
|
||||
}
|
||||
|
||||
dim0 = (dim0 + work_per_thread - 1) / work_per_thread;
|
||||
auto block_dims = get_block_dims(dim0, rest, 1);
|
||||
uint32_t num_blocks_x = cuda::ceil_div(dim0, block_dims.x);
|
||||
uint32_t num_blocks_y = cuda::ceil_div(rest, block_dims.y);
|
||||
|
||||
if (ndim <= 3) {
|
||||
dispatch_1_2_3(ndim, [&](auto ndim_constant) {
|
||||
auto [num_blocks, block_dims] =
|
||||
get_launch_args(data_size, shape, out.strides(), large());
|
||||
auto kernel =
|
||||
cu::copy_gg_nd<InType, OutType, IdxT, ndim_constant(), 1>;
|
||||
if (work_per_thread == 4) {
|
||||
kernel =
|
||||
cu::copy_gg_nd<InType, OutType, IdxT, ndim_constant(), 4>;
|
||||
}
|
||||
encoder.add_kernel_node(
|
||||
cu::copy_gg_nd<InType, OutType, IdxT, ndim_constant()>,
|
||||
num_blocks,
|
||||
kernel,
|
||||
{num_blocks_x, num_blocks_y},
|
||||
block_dims,
|
||||
0,
|
||||
in_ptr,
|
||||
out_ptr,
|
||||
data_size,
|
||||
rest,
|
||||
const_param<ndim_constant()>(shape),
|
||||
const_param<ndim_constant()>(strides_in),
|
||||
const_param<ndim_constant()>(strides_out));
|
||||
});
|
||||
} else { // ndim >= 4
|
||||
auto [num_blocks, block_dims] =
|
||||
get_launch_args(data_size, shape, out.strides(), large());
|
||||
auto kernel = cu::copy_gg<InType, OutType, IdxT, 1>;
|
||||
if (work_per_thread == 4) {
|
||||
kernel = cu::copy_gg<InType, OutType, IdxT, 4>;
|
||||
}
|
||||
encoder.add_kernel_node(
|
||||
cu::copy_gg<InType, OutType, IdxT>,
|
||||
num_blocks,
|
||||
kernel,
|
||||
{num_blocks_x, num_blocks_y},
|
||||
block_dims,
|
||||
0,
|
||||
in_ptr,
|
||||
out_ptr,
|
||||
data_size,
|
||||
rest,
|
||||
const_param(shape),
|
||||
const_param(strides_in),
|
||||
const_param(strides_out),
|
||||
|
||||
@@ -10,33 +10,67 @@ namespace cu {
|
||||
|
||||
namespace cg = cooperative_groups;
|
||||
|
||||
template <typename In, typename Out, typename IdxT, int NDIM>
|
||||
template <typename In, typename Out, typename IdxT, int NDIM, int N_READS>
|
||||
__global__ void copy_g_nd(
|
||||
const In* in,
|
||||
Out* out,
|
||||
IdxT size,
|
||||
IdxT size_rest,
|
||||
const __grid_constant__ cuda::std::array<int32_t, NDIM> shape,
|
||||
const __grid_constant__ cuda::std::array<int64_t, NDIM> strides_in) {
|
||||
IdxT index = cg::this_grid().thread_rank();
|
||||
if (index < size) {
|
||||
IdxT idx_in = elem_to_loc_nd<NDIM>(index, shape.data(), strides_in.data());
|
||||
out[index] = CastOp<In, Out>{}(in[idx_in]);
|
||||
const __grid_constant__ cuda::std::array<int64_t, NDIM> strides) {
|
||||
auto block = cg::this_thread_block();
|
||||
auto grid = cg::this_grid();
|
||||
IdxT index_rest =
|
||||
grid.block_index().y * block.dim_threads().y + block.thread_index().y;
|
||||
if (index_rest >= size_rest) {
|
||||
return;
|
||||
}
|
||||
|
||||
auto shape_x = shape[NDIM - 1];
|
||||
auto stride_x = strides[NDIM - 1];
|
||||
IdxT index_x =
|
||||
grid.block_index().x * block.dim_threads().x + block.thread_index().x;
|
||||
auto idx =
|
||||
elem_to_loc_nd<NDIM>(index_rest * shape_x, shape.data(), strides.data());
|
||||
auto in_vec =
|
||||
load_vector<N_READS>(in + idx, index_x, shape_x, stride_x, In(0));
|
||||
AlignedVector<Out, N_READS> out_vec;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < N_READS; ++i) {
|
||||
out_vec[i] = CastOp<In, Out>{}(in_vec[i]);
|
||||
}
|
||||
store_vector(out + shape_x * index_rest, index_x, out_vec, shape_x);
|
||||
}
|
||||
|
||||
template <typename In, typename Out, typename IdxT>
|
||||
template <typename In, typename Out, typename IdxT, int N_READS>
|
||||
__global__ void copy_g(
|
||||
const In* in,
|
||||
Out* out,
|
||||
IdxT size,
|
||||
IdxT size_rest,
|
||||
const __grid_constant__ Shape shape,
|
||||
const __grid_constant__ Strides strides_in,
|
||||
const __grid_constant__ Strides strides,
|
||||
int ndim) {
|
||||
IdxT index = cg::this_grid().thread_rank();
|
||||
if (index < size) {
|
||||
IdxT idx_in = elem_to_loc(index, shape.data(), strides_in.data(), ndim);
|
||||
out[index] = CastOp<In, Out>{}(in[idx_in]);
|
||||
auto block = cg::this_thread_block();
|
||||
auto grid = cg::this_grid();
|
||||
IdxT index_rest =
|
||||
grid.block_index().y * block.dim_threads().y + block.thread_index().y;
|
||||
if (index_rest >= size_rest) {
|
||||
return;
|
||||
}
|
||||
|
||||
auto shape_x = shape[ndim - 1];
|
||||
auto stride_x = strides[ndim - 1];
|
||||
IdxT index_x =
|
||||
grid.block_index().x * block.dim_threads().x + block.thread_index().x;
|
||||
auto idx =
|
||||
elem_to_loc(index_rest * shape_x, shape.data(), strides.data(), ndim);
|
||||
auto in_vec =
|
||||
load_vector<N_READS>(in + idx, index_x, shape_x, stride_x, In(0));
|
||||
AlignedVector<Out, N_READS> out_vec;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < N_READS; ++i) {
|
||||
out_vec[i] = CastOp<In, Out>{}(in_vec[i]);
|
||||
}
|
||||
store_vector(out + shape_x * index_rest, index_x, out_vec, shape_x);
|
||||
}
|
||||
|
||||
} // namespace cu
|
||||
@@ -61,30 +95,49 @@ void copy_general_input(
|
||||
const InType* in_ptr = in.data<InType>() + offset_in;
|
||||
OutType* out_ptr = out.data<OutType>() + offset_out;
|
||||
int ndim = shape.size();
|
||||
int work_per_thread = 1;
|
||||
auto dim0 = ndim > 0 ? shape.back() : 1;
|
||||
auto rest = out.size() / dim0;
|
||||
if (dim0 >= 4) {
|
||||
work_per_thread = 4;
|
||||
}
|
||||
dim0 = (dim0 + work_per_thread - 1) / work_per_thread;
|
||||
auto block_dims = get_block_dims(dim0, rest, 1);
|
||||
uint32_t num_blocks_x = cuda::ceil_div(dim0, block_dims.x);
|
||||
uint32_t num_blocks_y = cuda::ceil_div(rest, block_dims.y);
|
||||
|
||||
if (ndim <= 3) {
|
||||
dispatch_1_2_3(ndim, [&](auto dims_constant) {
|
||||
auto [num_blocks, block_dims] = get_launch_args(out, large());
|
||||
auto kernel =
|
||||
cu::copy_g_nd<InType, OutType, IdxT, dims_constant(), 1>;
|
||||
if (work_per_thread == 4) {
|
||||
kernel =
|
||||
cu::copy_g_nd<InType, OutType, IdxT, dims_constant(), 4>;
|
||||
}
|
||||
encoder.add_kernel_node(
|
||||
cu::copy_g_nd<InType, OutType, IdxT, dims_constant()>,
|
||||
num_blocks,
|
||||
kernel,
|
||||
{num_blocks_x, num_blocks_y},
|
||||
block_dims,
|
||||
0,
|
||||
in_ptr,
|
||||
out_ptr,
|
||||
out.size(),
|
||||
rest,
|
||||
const_param<dims_constant()>(shape),
|
||||
const_param<dims_constant()>(strides_in));
|
||||
});
|
||||
} else { // ndim >= 4
|
||||
auto [num_blocks, block_dims] = get_launch_args(out, large());
|
||||
auto kernel = cu::copy_g<InType, OutType, IdxT, 1>;
|
||||
if (work_per_thread == 4) {
|
||||
kernel = cu::copy_g<InType, OutType, IdxT, 4>;
|
||||
}
|
||||
encoder.add_kernel_node(
|
||||
cu::copy_g<InType, OutType, IdxT>,
|
||||
num_blocks,
|
||||
kernel,
|
||||
{num_blocks_x, num_blocks_y},
|
||||
block_dims,
|
||||
0,
|
||||
in_ptr,
|
||||
out_ptr,
|
||||
out.size(),
|
||||
rest,
|
||||
const_param(shape),
|
||||
const_param(strides_in),
|
||||
ndim);
|
||||
|
||||
@@ -146,6 +146,23 @@ inline __device__ void store_vector(
|
||||
}
|
||||
}
|
||||
|
||||
template <int N, typename T, typename SizeT>
|
||||
inline __device__ void store_vector(
|
||||
T* ptr,
|
||||
uint32_t offset,
|
||||
const AlignedVector<T, N>& vec,
|
||||
SizeT size,
|
||||
int64_t stride) {
|
||||
if (is_aligned<N>(ptr) && (offset + 1) * N <= size && stride == 1) {
|
||||
auto* to = reinterpret_cast<AlignedVector<T, N>*>(ptr);
|
||||
to[offset] = vec;
|
||||
} else {
|
||||
for (int i = 0; (offset * N + i) < size && i < N; ++i) {
|
||||
ptr[stride * (offset * N + i)] = vec[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// Type limits utils
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
@@ -1,208 +0,0 @@
|
||||
// Copyright © 2025 Apple Inc.
|
||||
|
||||
#include "mlx/backend/cuda/device.h"
|
||||
#include "mlx/backend/cuda/gemms/cublas_gemm.h"
|
||||
#include "mlx/backend/cuda/kernel_utils.cuh"
|
||||
|
||||
#include <cooperative_groups.h>
|
||||
|
||||
namespace mlx::core::cu {
|
||||
|
||||
namespace cg = cooperative_groups;
|
||||
|
||||
__global__ void set_mm_device_pointers(
|
||||
int8_t** pointers,
|
||||
int8_t* a_start,
|
||||
int8_t* b_start,
|
||||
int8_t* out_start,
|
||||
int item_size,
|
||||
const __grid_constant__ Shape batch_shape,
|
||||
const __grid_constant__ Strides a_batch_strides,
|
||||
const __grid_constant__ Strides b_batch_strides,
|
||||
int64_t batch_stride,
|
||||
int batch_ndim,
|
||||
int batch_count) {
|
||||
auto index = cg::this_grid().thread_rank();
|
||||
if (index >= batch_count) {
|
||||
return;
|
||||
}
|
||||
auto [a_offset, b_offset] = elem_to_loc(
|
||||
index,
|
||||
batch_shape.data(),
|
||||
a_batch_strides.data(),
|
||||
b_batch_strides.data(),
|
||||
batch_ndim);
|
||||
pointers[index] = a_start + item_size * a_offset;
|
||||
pointers[index + batch_count] = b_start + item_size * b_offset;
|
||||
pointers[index + 2 * batch_count] =
|
||||
out_start + item_size * index * batch_stride;
|
||||
}
|
||||
|
||||
__global__ void set_addmm_device_pointers(
|
||||
int8_t** pointers,
|
||||
int8_t* a_start,
|
||||
int8_t* b_start,
|
||||
int8_t* c_start,
|
||||
int8_t* out_start,
|
||||
int item_size,
|
||||
const __grid_constant__ Shape batch_shape,
|
||||
const __grid_constant__ Strides a_batch_strides,
|
||||
const __grid_constant__ Strides b_batch_strides,
|
||||
const __grid_constant__ Strides c_batch_strides,
|
||||
int64_t batch_stride,
|
||||
int batch_ndim,
|
||||
int batch_count) {
|
||||
auto index = cg::this_grid().thread_rank();
|
||||
if (index >= batch_count) {
|
||||
return;
|
||||
}
|
||||
auto [a_offset, b_offset, c_offset] = elem_to_loc(
|
||||
index,
|
||||
batch_shape.data(),
|
||||
a_batch_strides.data(),
|
||||
b_batch_strides.data(),
|
||||
c_batch_strides.data(),
|
||||
batch_ndim);
|
||||
pointers[index] = a_start + item_size * a_offset;
|
||||
pointers[index + batch_count] = b_start + item_size * b_offset;
|
||||
pointers[index + 2 * batch_count] = c_start + item_size * c_offset;
|
||||
pointers[index + 3 * batch_count] =
|
||||
out_start + item_size * index * batch_stride;
|
||||
}
|
||||
|
||||
void set_pointer_mode(cublasLtMatrixLayout_t desc, int batch_count) {
|
||||
auto batch_mode = CUBLASLT_BATCH_MODE_POINTER_ARRAY;
|
||||
CHECK_CUBLAS_ERROR(cublasLtMatrixLayoutSetAttribute(
|
||||
desc,
|
||||
CUBLASLT_MATRIX_LAYOUT_BATCH_MODE,
|
||||
&batch_mode,
|
||||
sizeof(batch_mode)));
|
||||
CHECK_CUBLAS_ERROR(cublasLtMatrixLayoutSetAttribute(
|
||||
desc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batch_count, sizeof(int32_t)));
|
||||
}
|
||||
|
||||
void Matmul::run_batched(
|
||||
cu::CommandEncoder& encoder,
|
||||
array& out,
|
||||
const array& a,
|
||||
const array& b,
|
||||
const mlx::core::Shape& batch_shape,
|
||||
const mlx::core::Strides& a_batch_strides,
|
||||
const mlx::core::Strides& b_batch_strides) {
|
||||
auto batch_count = out.size() / (M_ * N_);
|
||||
set_pointer_mode(a_desc_, batch_count);
|
||||
set_pointer_mode(b_desc_, batch_count);
|
||||
set_pointer_mode(out_desc_, batch_count);
|
||||
|
||||
// Launch kernel to set device offsets
|
||||
auto pointers = array(
|
||||
allocator::malloc(batch_count * sizeof(uint64_t) * 3),
|
||||
{static_cast<int>(batch_count * 3)},
|
||||
uint64);
|
||||
|
||||
encoder.add_temporary(pointers);
|
||||
int block_size = 512;
|
||||
encoder.set_output_array(pointers);
|
||||
|
||||
encoder.add_kernel_node(
|
||||
cu::set_mm_device_pointers,
|
||||
cuda::ceil_div(pointers.size(), block_size),
|
||||
block_size,
|
||||
0,
|
||||
pointers.data<int8_t*>(),
|
||||
a.data<int8_t>(),
|
||||
b.data<int8_t>(),
|
||||
out.data<int8_t>(),
|
||||
static_cast<int>(out.dtype().size()),
|
||||
const_param(batch_shape),
|
||||
const_param(a_batch_strides),
|
||||
const_param(b_batch_strides),
|
||||
static_cast<int64_t>(M_) * N_,
|
||||
static_cast<int>(batch_shape.size()),
|
||||
batch_count);
|
||||
|
||||
// Run matmul
|
||||
encoder.set_input_array(pointers);
|
||||
encoder.set_input_array(a);
|
||||
encoder.set_input_array(b);
|
||||
encoder.set_output_array(out);
|
||||
|
||||
auto a_pointers = pointers.data<int8_t*>();
|
||||
auto b_pointers = a_pointers + batch_count;
|
||||
auto out_pointers = b_pointers + batch_count;
|
||||
run_impl(
|
||||
encoder,
|
||||
reinterpret_cast<void*>(out_pointers),
|
||||
reinterpret_cast<void*>(a_pointers),
|
||||
reinterpret_cast<void*>(b_pointers),
|
||||
nullptr);
|
||||
}
|
||||
|
||||
void Matmul::run_batched(
|
||||
cu::CommandEncoder& encoder,
|
||||
array& out,
|
||||
const array& a,
|
||||
const array& b,
|
||||
const array& c,
|
||||
const mlx::core::Shape& batch_shape,
|
||||
const mlx::core::Strides& a_batch_strides,
|
||||
const mlx::core::Strides& b_batch_strides,
|
||||
const mlx::core::Strides& c_batch_strides,
|
||||
float alpha,
|
||||
float beta) {
|
||||
auto batch_count = out.size() / (M_ * N_);
|
||||
set_pointer_mode(a_desc_, batch_count);
|
||||
set_pointer_mode(b_desc_, batch_count);
|
||||
set_pointer_mode(c_desc_, batch_count);
|
||||
set_pointer_mode(out_desc_, batch_count);
|
||||
|
||||
// Launch kernel to set device offsets
|
||||
auto pointers = array(
|
||||
allocator::malloc(batch_count * sizeof(uint64_t) * 4),
|
||||
{static_cast<int>(batch_count * 4)},
|
||||
uint64);
|
||||
|
||||
encoder.add_temporary(pointers);
|
||||
int block_size = 512;
|
||||
encoder.set_output_array(pointers);
|
||||
encoder.add_kernel_node(
|
||||
cu::set_addmm_device_pointers,
|
||||
cuda::ceil_div(pointers.size(), block_size),
|
||||
block_size,
|
||||
0,
|
||||
pointers.data<int8_t*>(),
|
||||
a.data<int8_t>(),
|
||||
b.data<int8_t>(),
|
||||
c.data<int8_t>(),
|
||||
out.data<int8_t>(),
|
||||
static_cast<int>(out.dtype().size()),
|
||||
const_param(batch_shape),
|
||||
const_param(a_batch_strides),
|
||||
const_param(b_batch_strides),
|
||||
const_param(c_batch_strides),
|
||||
static_cast<int64_t>(M_) * N_,
|
||||
static_cast<int>(batch_shape.size()),
|
||||
batch_count);
|
||||
|
||||
// Run matmul
|
||||
encoder.set_input_array(pointers);
|
||||
encoder.set_input_array(a);
|
||||
encoder.set_input_array(b);
|
||||
encoder.set_input_array(c);
|
||||
encoder.set_output_array(out);
|
||||
|
||||
auto a_pointers = pointers.data<int8_t*>();
|
||||
auto b_pointers = a_pointers + batch_count;
|
||||
auto c_pointers = b_pointers + batch_count;
|
||||
auto out_pointers = c_pointers + batch_count;
|
||||
run_impl(
|
||||
encoder,
|
||||
reinterpret_cast<void*>(out_pointers),
|
||||
reinterpret_cast<void*>(a_pointers),
|
||||
reinterpret_cast<void*>(b_pointers),
|
||||
reinterpret_cast<void*>(c_pointers),
|
||||
alpha,
|
||||
beta);
|
||||
}
|
||||
|
||||
} // namespace mlx::core::cu
|
||||
@@ -7,10 +7,12 @@
|
||||
|
||||
#include <fmt/format.h>
|
||||
|
||||
namespace mlx::core::cu {
|
||||
namespace mlx::core {
|
||||
|
||||
namespace {
|
||||
|
||||
struct CublasPreference {
|
||||
CublasPreference(Device& device) {
|
||||
CublasPreference(cu::Device& device) {
|
||||
// The recommended cublas workspace size is 4 MiB for pre-Hopper and 32 MiB
|
||||
// for Hopper+:
|
||||
// https://docs.nvidia.com/cuda/cublas/#cublassetworkspace
|
||||
@@ -33,7 +35,7 @@ struct CublasPreference {
|
||||
cublasLtMatmulPreference_t pref_{nullptr};
|
||||
};
|
||||
|
||||
cublasLtMatmulPreference_t cublas_preference(Device& device) {
|
||||
cublasLtMatmulPreference_t cublas_preference(cu::Device& device) {
|
||||
static CublasPreference pref(device);
|
||||
return pref.pref_;
|
||||
}
|
||||
@@ -52,7 +54,7 @@ cublasComputeType_t dtype_to_compute_type(Dtype dtype) {
|
||||
return CUBLAS_COMPUTE_64F;
|
||||
default:
|
||||
throw std::runtime_error(fmt::format(
|
||||
"Unsupported dtype in Matmul: {}.", dtype_to_string(dtype)));
|
||||
"Unsupported dtype in CublasGemm: {}.", dtype_to_string(dtype)));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -70,7 +72,7 @@ cudaDataType_t dtype_to_cublas_type(Dtype dtype) {
|
||||
return CUDA_C_32F;
|
||||
default:
|
||||
throw std::runtime_error(fmt::format(
|
||||
"Unsupported dtype in Matmul: {}.", dtype_to_string(dtype)));
|
||||
"Unsupported dtype in CublasGemm: {}.", dtype_to_string(dtype)));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -102,8 +104,10 @@ cublasLtMatrixLayout_t create_matrix_layout(
|
||||
return desc;
|
||||
}
|
||||
|
||||
Matmul::Matmul(
|
||||
Device& device,
|
||||
} // namespace
|
||||
|
||||
CublasGemm::CublasGemm(
|
||||
cu::Device& device,
|
||||
Dtype dtype,
|
||||
bool a_transposed,
|
||||
uint64_t a_rows,
|
||||
@@ -155,8 +159,8 @@ Matmul::Matmul(
|
||||
type, a_rows, b_cols, false, b_cols, batch_count, a_rows * b_cols);
|
||||
}
|
||||
|
||||
Matmul::Matmul(
|
||||
Device& device,
|
||||
CublasGemm::CublasGemm(
|
||||
cu::Device& device,
|
||||
Dtype dtype,
|
||||
bool a_transposed,
|
||||
uint64_t a_rows,
|
||||
@@ -171,7 +175,7 @@ Matmul::Matmul(
|
||||
int64_t a_batch_stride,
|
||||
int64_t b_batch_stride,
|
||||
int64_t c_batch_stride)
|
||||
: Matmul(
|
||||
: CublasGemm(
|
||||
device,
|
||||
dtype,
|
||||
a_transposed,
|
||||
@@ -190,7 +194,7 @@ Matmul::Matmul(
|
||||
type, a_rows, b_cols, false, ldc, batch_count, c_batch_stride);
|
||||
}
|
||||
|
||||
Matmul::~Matmul() {
|
||||
CublasGemm::~CublasGemm() {
|
||||
CHECK_CUBLAS_ERROR(cublasLtMatrixLayoutDestroy(a_desc_));
|
||||
CHECK_CUBLAS_ERROR(cublasLtMatrixLayoutDestroy(b_desc_));
|
||||
CHECK_CUBLAS_ERROR(cublasLtMatrixLayoutDestroy(c_desc_));
|
||||
@@ -198,7 +202,73 @@ Matmul::~Matmul() {
|
||||
CHECK_CUBLAS_ERROR(cublasLtMatmulDescDestroy(matmul_desc_));
|
||||
}
|
||||
|
||||
void Matmul::run_impl(
|
||||
void CublasGemm::run(
|
||||
cu::CommandEncoder& encoder,
|
||||
array& out,
|
||||
const array& a,
|
||||
const array& b,
|
||||
const Shape& batch_shape,
|
||||
const Strides& a_batch_strides,
|
||||
const Strides& b_batch_strides) {
|
||||
int batch_count = out.size() / (M_ * N_);
|
||||
if (batch_count / batch_shape.back() > 1) {
|
||||
run_batched(
|
||||
encoder, out, a, b, batch_shape, a_batch_strides, b_batch_strides);
|
||||
return;
|
||||
}
|
||||
|
||||
encoder.set_input_array(a);
|
||||
encoder.set_input_array(b);
|
||||
encoder.set_output_array(out);
|
||||
|
||||
execute(encoder, out.data<void>(), a.data<void>(), b.data<void>(), nullptr);
|
||||
}
|
||||
|
||||
void CublasGemm::run(
|
||||
cu::CommandEncoder& encoder,
|
||||
array& out,
|
||||
const array& a,
|
||||
const array& b,
|
||||
const array& c,
|
||||
const Shape& batch_shape,
|
||||
const Strides& a_batch_strides,
|
||||
const Strides& b_batch_strides,
|
||||
const Strides& c_batch_strides,
|
||||
float alpha,
|
||||
float beta) {
|
||||
int batch_count = out.size() / (M_ * N_);
|
||||
if (batch_count / batch_shape.back() > 1) {
|
||||
run_batched(
|
||||
encoder,
|
||||
out,
|
||||
a,
|
||||
b,
|
||||
c,
|
||||
batch_shape,
|
||||
a_batch_strides,
|
||||
b_batch_strides,
|
||||
c_batch_strides,
|
||||
alpha,
|
||||
beta);
|
||||
return;
|
||||
}
|
||||
|
||||
encoder.set_input_array(a);
|
||||
encoder.set_input_array(b);
|
||||
encoder.set_input_array(c);
|
||||
encoder.set_output_array(out);
|
||||
|
||||
execute(
|
||||
encoder,
|
||||
out.data<void>(),
|
||||
a.data<void>(),
|
||||
b.data<void>(),
|
||||
c.data<void>(),
|
||||
alpha,
|
||||
beta);
|
||||
}
|
||||
|
||||
void CublasGemm::execute(
|
||||
cu::CommandEncoder& encoder,
|
||||
void* out,
|
||||
const void* a,
|
||||
@@ -256,29 +326,4 @@ void Matmul::run_impl(
|
||||
encoder.stream()));
|
||||
}
|
||||
|
||||
void Matmul::run(
|
||||
cu::CommandEncoder& encoder,
|
||||
array& out,
|
||||
const array& a,
|
||||
const array& b,
|
||||
const std::optional<array>& c /* = std::nullopt */,
|
||||
float alpha /* = 1 */,
|
||||
float beta /* = 0 */) {
|
||||
encoder.set_input_array(a);
|
||||
encoder.set_input_array(b);
|
||||
if (c) {
|
||||
encoder.set_input_array(*c);
|
||||
}
|
||||
encoder.set_output_array(out);
|
||||
|
||||
run_impl(
|
||||
encoder,
|
||||
out.data<void>(),
|
||||
a.data<void>(),
|
||||
b.data<void>(),
|
||||
c ? c->data<void>() : nullptr,
|
||||
alpha,
|
||||
beta);
|
||||
}
|
||||
|
||||
} // namespace mlx::core::cu
|
||||
} // namespace mlx::core
|
||||
|
||||
@@ -5,13 +5,13 @@
|
||||
#include "mlx/backend/cuda/device.h"
|
||||
|
||||
#include <cublasLt.h>
|
||||
#include <optional>
|
||||
|
||||
namespace mlx::core::cu {
|
||||
class Matmul {
|
||||
namespace mlx::core {
|
||||
|
||||
class CublasGemm {
|
||||
public:
|
||||
Matmul(
|
||||
Device& device,
|
||||
CublasGemm(
|
||||
cu::Device& device,
|
||||
Dtype dtype,
|
||||
bool a_transposed,
|
||||
uint64_t a_rows,
|
||||
@@ -25,8 +25,8 @@ class Matmul {
|
||||
int64_t a_batch_stride,
|
||||
int64_t b_batch_stride);
|
||||
|
||||
Matmul(
|
||||
Device& device,
|
||||
CublasGemm(
|
||||
cu::Device& device,
|
||||
Dtype dtype,
|
||||
bool a_transposed,
|
||||
uint64_t a_rows,
|
||||
@@ -42,25 +42,39 @@ class Matmul {
|
||||
int64_t b_batch_stride,
|
||||
int64_t c_batch_stride);
|
||||
|
||||
~Matmul();
|
||||
~CublasGemm();
|
||||
|
||||
void run(
|
||||
cu::CommandEncoder& encoder,
|
||||
array& out,
|
||||
const array& a,
|
||||
const array& b,
|
||||
const std::optional<array>& c = std::nullopt,
|
||||
float alpha = 1,
|
||||
float beta = 0);
|
||||
const Shape& batch_shape,
|
||||
const Strides& a_batch_strides,
|
||||
const Strides& b_batch_strides);
|
||||
|
||||
void run(
|
||||
cu::CommandEncoder& encoder,
|
||||
array& out,
|
||||
const array& a,
|
||||
const array& b,
|
||||
const array& c,
|
||||
const Shape& batch_shape,
|
||||
const Strides& a_batch_strides,
|
||||
const Strides& b_batch_strides,
|
||||
const Strides& c_batch_strides,
|
||||
float alpha,
|
||||
float beta);
|
||||
|
||||
private:
|
||||
void run_batched(
|
||||
cu::CommandEncoder& encoder,
|
||||
array& out,
|
||||
const array& a,
|
||||
const array& b,
|
||||
const mlx::core::Shape& batch_shape,
|
||||
const mlx::core::Strides& a_batch_strides,
|
||||
const mlx::core::Strides& b_batch_strides);
|
||||
const Shape& batch_shape,
|
||||
const Strides& a_batch_strides,
|
||||
const Strides& b_batch_strides);
|
||||
|
||||
void run_batched(
|
||||
cu::CommandEncoder& encoder,
|
||||
@@ -68,15 +82,14 @@ class Matmul {
|
||||
const array& a,
|
||||
const array& b,
|
||||
const array& c,
|
||||
const mlx::core::Shape& batch_shape,
|
||||
const mlx::core::Strides& a_batch_strides,
|
||||
const mlx::core::Strides& b_batch_strides,
|
||||
const mlx::core::Strides& c_batch_strides,
|
||||
const Shape& batch_shape,
|
||||
const Strides& a_batch_strides,
|
||||
const Strides& b_batch_strides,
|
||||
const Strides& c_batch_strides,
|
||||
float alpha,
|
||||
float beta);
|
||||
|
||||
private:
|
||||
void run_impl(
|
||||
void execute(
|
||||
cu::CommandEncoder& encoder,
|
||||
void* out,
|
||||
const void* a,
|
||||
@@ -97,4 +110,4 @@ class Matmul {
|
||||
cublasLtMatmulHeuristicResult_t heuristic_;
|
||||
};
|
||||
|
||||
} // namespace mlx::core::cu
|
||||
} // namespace mlx::core
|
||||
|
||||
@@ -4,16 +4,16 @@
|
||||
#include "mlx/backend/cuda/device.h"
|
||||
#include "mlx/backend/cuda/gemms/cublas_gemm.h"
|
||||
|
||||
namespace mlx::core::cu {
|
||||
namespace mlx::core {
|
||||
|
||||
void Matmul::run_batched(
|
||||
void CublasGemm::run_batched(
|
||||
cu::CommandEncoder& encoder,
|
||||
array& out,
|
||||
const array& a,
|
||||
const array& b,
|
||||
const mlx::core::Shape& batch_shape,
|
||||
const mlx::core::Strides& a_batch_strides,
|
||||
const mlx::core::Strides& b_batch_strides) {
|
||||
const Shape& batch_shape,
|
||||
const Strides& a_batch_strides,
|
||||
const Strides& b_batch_strides) {
|
||||
encoder.set_input_array(a);
|
||||
encoder.set_input_array(b);
|
||||
encoder.set_output_array(out);
|
||||
@@ -22,7 +22,7 @@ void Matmul::run_batched(
|
||||
ContiguousIterator b_it(batch_shape, b_batch_strides, batch_shape.size() - 1);
|
||||
auto concurrent = encoder.concurrent_context();
|
||||
for (size_t i = 0; i < nbatch; ++i) {
|
||||
run_impl(
|
||||
execute(
|
||||
encoder,
|
||||
out.data<int8_t>() + out.itemsize() * i * batch_shape.back() * M_ * N_,
|
||||
a.data<int8_t>() + a.itemsize() * a_it.loc,
|
||||
@@ -33,16 +33,16 @@ void Matmul::run_batched(
|
||||
}
|
||||
}
|
||||
|
||||
void Matmul::run_batched(
|
||||
void CublasGemm::run_batched(
|
||||
cu::CommandEncoder& encoder,
|
||||
array& out,
|
||||
const array& a,
|
||||
const array& b,
|
||||
const array& c,
|
||||
const mlx::core::Shape& batch_shape,
|
||||
const mlx::core::Strides& a_batch_strides,
|
||||
const mlx::core::Strides& b_batch_strides,
|
||||
const mlx::core::Strides& c_batch_strides,
|
||||
const Shape& batch_shape,
|
||||
const Strides& a_batch_strides,
|
||||
const Strides& b_batch_strides,
|
||||
const Strides& c_batch_strides,
|
||||
float alpha,
|
||||
float beta) {
|
||||
encoder.set_input_array(a);
|
||||
@@ -56,7 +56,7 @@ void Matmul::run_batched(
|
||||
ContiguousIterator c_it(batch_shape, c_batch_strides, batch_shape.size() - 1);
|
||||
auto concurrent = encoder.concurrent_context();
|
||||
for (size_t i = 0; i < nbatch; ++i) {
|
||||
run_impl(
|
||||
execute(
|
||||
encoder,
|
||||
out.data<int8_t>() + out.itemsize() * i * batch_shape.back() * M_ * N_,
|
||||
a.data<int8_t>() + a.itemsize() * a_it.loc,
|
||||
@@ -70,4 +70,4 @@ void Matmul::run_batched(
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace mlx::core::cu
|
||||
} // namespace mlx::core
|
||||
327
mlx/backend/cuda/gemms/cublas_gemm_batched_12_9.cu
Normal file
327
mlx/backend/cuda/gemms/cublas_gemm_batched_12_9.cu
Normal file
@@ -0,0 +1,327 @@
|
||||
// Copyright © 2025 Apple Inc.
|
||||
|
||||
#include "mlx/backend/cuda/device.h"
|
||||
#include "mlx/backend/cuda/gemms/cublas_gemm.h"
|
||||
#include "mlx/backend/cuda/kernel_utils.cuh"
|
||||
|
||||
#include <cooperative_groups.h>
|
||||
|
||||
namespace mlx::core {
|
||||
|
||||
namespace cu {
|
||||
|
||||
namespace cg = cooperative_groups;
|
||||
|
||||
template <int NDIM>
|
||||
__global__ void set_mm_device_pointers_nd(
|
||||
int8_t** pointers,
|
||||
int8_t* a_start,
|
||||
int8_t* b_start,
|
||||
int8_t* out_start,
|
||||
int item_size,
|
||||
const __grid_constant__ cuda::std::array<int32_t, NDIM> batch_shape,
|
||||
const __grid_constant__ cuda::std::array<int64_t, NDIM> a_batch_strides,
|
||||
const __grid_constant__ cuda::std::array<int64_t, NDIM> b_batch_strides,
|
||||
int64_t batch_stride,
|
||||
int batch_count) {
|
||||
auto index = cg::this_grid().thread_rank();
|
||||
if (index >= batch_count) {
|
||||
return;
|
||||
}
|
||||
auto [a_offset, b_offset] = elem_to_loc_nd<NDIM>(
|
||||
index,
|
||||
batch_shape.data(),
|
||||
a_batch_strides.data(),
|
||||
b_batch_strides.data());
|
||||
pointers[index] = a_start + item_size * a_offset;
|
||||
pointers[index + batch_count] = b_start + item_size * b_offset;
|
||||
pointers[index + 2 * batch_count] =
|
||||
out_start + item_size * index * batch_stride;
|
||||
}
|
||||
|
||||
__global__ void set_mm_device_pointers_g(
|
||||
int8_t** pointers,
|
||||
int8_t* a_start,
|
||||
int8_t* b_start,
|
||||
int8_t* out_start,
|
||||
int item_size,
|
||||
const __grid_constant__ Shape batch_shape,
|
||||
const __grid_constant__ Strides a_batch_strides,
|
||||
const __grid_constant__ Strides b_batch_strides,
|
||||
int64_t batch_stride,
|
||||
int batch_ndim,
|
||||
int batch_count) {
|
||||
auto index = cg::this_grid().thread_rank();
|
||||
if (index >= batch_count) {
|
||||
return;
|
||||
}
|
||||
auto [a_offset, b_offset] = elem_to_loc(
|
||||
index,
|
||||
batch_shape.data(),
|
||||
a_batch_strides.data(),
|
||||
b_batch_strides.data(),
|
||||
batch_ndim);
|
||||
pointers[index] = a_start + item_size * a_offset;
|
||||
pointers[index + batch_count] = b_start + item_size * b_offset;
|
||||
pointers[index + 2 * batch_count] =
|
||||
out_start + item_size * index * batch_stride;
|
||||
}
|
||||
|
||||
template <int NDIM>
|
||||
__global__ void set_addmm_device_pointers_nd(
|
||||
int8_t** pointers,
|
||||
int8_t* a_start,
|
||||
int8_t* b_start,
|
||||
int8_t* c_start,
|
||||
int8_t* out_start,
|
||||
int item_size,
|
||||
const __grid_constant__ cuda::std::array<int32_t, NDIM> batch_shape,
|
||||
const __grid_constant__ cuda::std::array<int64_t, NDIM> a_batch_strides,
|
||||
const __grid_constant__ cuda::std::array<int64_t, NDIM> b_batch_strides,
|
||||
const __grid_constant__ cuda::std::array<int64_t, NDIM> c_batch_strides,
|
||||
int64_t batch_stride,
|
||||
int batch_count) {
|
||||
auto index = cg::this_grid().thread_rank();
|
||||
if (index >= batch_count) {
|
||||
return;
|
||||
}
|
||||
auto [a_offset, b_offset, c_offset] = elem_to_loc_nd<NDIM>(
|
||||
index,
|
||||
batch_shape.data(),
|
||||
a_batch_strides.data(),
|
||||
b_batch_strides.data(),
|
||||
c_batch_strides.data());
|
||||
pointers[index] = a_start + item_size * a_offset;
|
||||
pointers[index + batch_count] = b_start + item_size * b_offset;
|
||||
pointers[index + 2 * batch_count] = c_start + item_size * c_offset;
|
||||
pointers[index + 3 * batch_count] =
|
||||
out_start + item_size * index * batch_stride;
|
||||
}
|
||||
|
||||
__global__ void set_addmm_device_pointers_g(
|
||||
int8_t** pointers,
|
||||
int8_t* a_start,
|
||||
int8_t* b_start,
|
||||
int8_t* c_start,
|
||||
int8_t* out_start,
|
||||
int item_size,
|
||||
const __grid_constant__ Shape batch_shape,
|
||||
const __grid_constant__ Strides a_batch_strides,
|
||||
const __grid_constant__ Strides b_batch_strides,
|
||||
const __grid_constant__ Strides c_batch_strides,
|
||||
int64_t batch_stride,
|
||||
int batch_ndim,
|
||||
int batch_count) {
|
||||
auto index = cg::this_grid().thread_rank();
|
||||
if (index >= batch_count) {
|
||||
return;
|
||||
}
|
||||
auto [a_offset, b_offset, c_offset] = elem_to_loc(
|
||||
index,
|
||||
batch_shape.data(),
|
||||
a_batch_strides.data(),
|
||||
b_batch_strides.data(),
|
||||
c_batch_strides.data(),
|
||||
batch_ndim);
|
||||
pointers[index] = a_start + item_size * a_offset;
|
||||
pointers[index + batch_count] = b_start + item_size * b_offset;
|
||||
pointers[index + 2 * batch_count] = c_start + item_size * c_offset;
|
||||
pointers[index + 3 * batch_count] =
|
||||
out_start + item_size * index * batch_stride;
|
||||
}
|
||||
|
||||
} // namespace cu
|
||||
|
||||
namespace {
|
||||
|
||||
void set_pointer_mode(cublasLtMatrixLayout_t desc, int batch_count) {
|
||||
auto batch_mode = CUBLASLT_BATCH_MODE_POINTER_ARRAY;
|
||||
CHECK_CUBLAS_ERROR(cublasLtMatrixLayoutSetAttribute(
|
||||
desc,
|
||||
CUBLASLT_MATRIX_LAYOUT_BATCH_MODE,
|
||||
&batch_mode,
|
||||
sizeof(batch_mode)));
|
||||
CHECK_CUBLAS_ERROR(cublasLtMatrixLayoutSetAttribute(
|
||||
desc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batch_count, sizeof(int32_t)));
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
void CublasGemm::run_batched(
|
||||
cu::CommandEncoder& encoder,
|
||||
array& out,
|
||||
const array& a,
|
||||
const array& b,
|
||||
const Shape& batch_shape,
|
||||
const Strides& a_batch_strides,
|
||||
const Strides& b_batch_strides) {
|
||||
int batch_count = out.size() / (M_ * N_);
|
||||
set_pointer_mode(a_desc_, batch_count);
|
||||
set_pointer_mode(b_desc_, batch_count);
|
||||
set_pointer_mode(out_desc_, batch_count);
|
||||
|
||||
// Launch kernel to set device offsets
|
||||
auto pointers = array(
|
||||
allocator::malloc(batch_count * sizeof(void*) * 3),
|
||||
{batch_count * 3},
|
||||
uint64);
|
||||
|
||||
encoder.add_temporary(pointers);
|
||||
encoder.set_output_array(pointers);
|
||||
|
||||
int block_dims = std::min(batch_count, 256);
|
||||
int num_blocks = cuda::ceil_div(batch_count, block_dims);
|
||||
int64_t batch_stride = M_ * N_;
|
||||
int item_size = out.itemsize();
|
||||
|
||||
int ndim = batch_shape.size();
|
||||
if (ndim <= 3) {
|
||||
dispatch_1_2_3(ndim, [&](auto ndim_constant) {
|
||||
encoder.add_kernel_node(
|
||||
cu::set_mm_device_pointers_nd<ndim_constant()>,
|
||||
num_blocks,
|
||||
block_dims,
|
||||
0,
|
||||
pointers.data<int8_t*>(),
|
||||
a.data<int8_t>(),
|
||||
b.data<int8_t>(),
|
||||
out.data<int8_t>(),
|
||||
item_size,
|
||||
const_param<ndim_constant()>(batch_shape),
|
||||
const_param<ndim_constant()>(a_batch_strides),
|
||||
const_param<ndim_constant()>(b_batch_strides),
|
||||
batch_stride,
|
||||
batch_count);
|
||||
});
|
||||
} else {
|
||||
encoder.add_kernel_node(
|
||||
cu::set_mm_device_pointers_g,
|
||||
num_blocks,
|
||||
block_dims,
|
||||
0,
|
||||
pointers.data<int8_t*>(),
|
||||
a.data<int8_t>(),
|
||||
b.data<int8_t>(),
|
||||
out.data<int8_t>(),
|
||||
item_size,
|
||||
const_param(batch_shape),
|
||||
const_param(a_batch_strides),
|
||||
const_param(b_batch_strides),
|
||||
batch_stride,
|
||||
ndim,
|
||||
batch_count);
|
||||
}
|
||||
|
||||
// Run matmul
|
||||
encoder.set_input_array(pointers);
|
||||
encoder.set_input_array(a);
|
||||
encoder.set_input_array(b);
|
||||
encoder.set_output_array(out);
|
||||
|
||||
auto a_pointers = pointers.data<int8_t*>();
|
||||
auto b_pointers = a_pointers + batch_count;
|
||||
auto out_pointers = b_pointers + batch_count;
|
||||
execute(
|
||||
encoder,
|
||||
reinterpret_cast<void*>(out_pointers),
|
||||
reinterpret_cast<void*>(a_pointers),
|
||||
reinterpret_cast<void*>(b_pointers),
|
||||
nullptr);
|
||||
}
|
||||
|
||||
void CublasGemm::run_batched(
|
||||
cu::CommandEncoder& encoder,
|
||||
array& out,
|
||||
const array& a,
|
||||
const array& b,
|
||||
const array& c,
|
||||
const Shape& batch_shape,
|
||||
const Strides& a_batch_strides,
|
||||
const Strides& b_batch_strides,
|
||||
const Strides& c_batch_strides,
|
||||
float alpha,
|
||||
float beta) {
|
||||
int batch_count = out.size() / (M_ * N_);
|
||||
set_pointer_mode(a_desc_, batch_count);
|
||||
set_pointer_mode(b_desc_, batch_count);
|
||||
set_pointer_mode(c_desc_, batch_count);
|
||||
set_pointer_mode(out_desc_, batch_count);
|
||||
|
||||
// Launch kernel to set device offsets
|
||||
auto pointers = array(
|
||||
allocator::malloc(batch_count * sizeof(uint64_t) * 4),
|
||||
{batch_count * 4},
|
||||
uint64);
|
||||
|
||||
encoder.add_temporary(pointers);
|
||||
encoder.set_output_array(pointers);
|
||||
|
||||
int block_dims = std::min(batch_count, 256);
|
||||
int num_blocks = cuda::ceil_div(batch_count, block_dims);
|
||||
int64_t batch_stride = M_ * N_;
|
||||
int item_size = out.itemsize();
|
||||
|
||||
int ndim = batch_shape.size();
|
||||
if (ndim <= 3) {
|
||||
dispatch_1_2_3(ndim, [&](auto ndim_constant) {
|
||||
encoder.add_kernel_node(
|
||||
cu::set_addmm_device_pointers_nd<ndim_constant()>,
|
||||
num_blocks,
|
||||
block_dims,
|
||||
0,
|
||||
pointers.data<int8_t*>(),
|
||||
a.data<int8_t>(),
|
||||
b.data<int8_t>(),
|
||||
c.data<int8_t>(),
|
||||
out.data<int8_t>(),
|
||||
item_size,
|
||||
const_param<ndim_constant()>(batch_shape),
|
||||
const_param<ndim_constant()>(a_batch_strides),
|
||||
const_param<ndim_constant()>(b_batch_strides),
|
||||
const_param<ndim_constant()>(c_batch_strides),
|
||||
batch_stride,
|
||||
batch_count);
|
||||
});
|
||||
} else {
|
||||
encoder.add_kernel_node(
|
||||
cu::set_addmm_device_pointers_g,
|
||||
num_blocks,
|
||||
block_dims,
|
||||
0,
|
||||
pointers.data<int8_t*>(),
|
||||
a.data<int8_t>(),
|
||||
b.data<int8_t>(),
|
||||
c.data<int8_t>(),
|
||||
out.data<int8_t>(),
|
||||
item_size,
|
||||
const_param(batch_shape),
|
||||
const_param(a_batch_strides),
|
||||
const_param(b_batch_strides),
|
||||
const_param(c_batch_strides),
|
||||
batch_stride,
|
||||
ndim,
|
||||
batch_count);
|
||||
}
|
||||
|
||||
// Run matmul
|
||||
encoder.set_input_array(pointers);
|
||||
encoder.set_input_array(a);
|
||||
encoder.set_input_array(b);
|
||||
encoder.set_input_array(c);
|
||||
encoder.set_output_array(out);
|
||||
|
||||
auto a_pointers = pointers.data<int8_t*>();
|
||||
auto b_pointers = a_pointers + batch_count;
|
||||
auto c_pointers = b_pointers + batch_count;
|
||||
auto out_pointers = c_pointers + batch_count;
|
||||
execute(
|
||||
encoder,
|
||||
reinterpret_cast<void*>(out_pointers),
|
||||
reinterpret_cast<void*>(a_pointers),
|
||||
reinterpret_cast<void*>(b_pointers),
|
||||
reinterpret_cast<void*>(c_pointers),
|
||||
alpha,
|
||||
beta);
|
||||
}
|
||||
|
||||
} // namespace mlx::core
|
||||
@@ -97,7 +97,7 @@ void Matmul::eval_gpu(const std::vector<array>& inputs, array& out) {
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////
|
||||
// Invoke cublasLt
|
||||
cu::Matmul matmul(
|
||||
CublasGemm gemm(
|
||||
cu::device(s.device),
|
||||
a.dtype(),
|
||||
a_transposed,
|
||||
@@ -111,14 +111,7 @@ void Matmul::eval_gpu(const std::vector<array>& inputs, array& out) {
|
||||
batch_shape.back(),
|
||||
a_batch_strides.back(),
|
||||
b_batch_strides.back());
|
||||
|
||||
if ((batch_count / batch_shape.back()) == 1) {
|
||||
matmul.run(encoder, out, a, b);
|
||||
return;
|
||||
}
|
||||
|
||||
matmul.run_batched(
|
||||
encoder, out, a, b, batch_shape, a_batch_strides, b_batch_strides);
|
||||
gemm.run(encoder, out, a, b, batch_shape, a_batch_strides, b_batch_strides);
|
||||
}
|
||||
|
||||
void AddMM::eval_gpu(const std::vector<array>& inputs, array& out) {
|
||||
@@ -186,7 +179,7 @@ void AddMM::eval_gpu(const std::vector<array>& inputs, array& out) {
|
||||
/////////////////////////////////////////////////////////////////////////////
|
||||
// Invoke cublasLt
|
||||
|
||||
cu::Matmul matmul(
|
||||
CublasGemm gemm(
|
||||
cu::device(s.device),
|
||||
a.dtype(),
|
||||
a_transposed,
|
||||
@@ -202,12 +195,7 @@ void AddMM::eval_gpu(const std::vector<array>& inputs, array& out) {
|
||||
a_batch_strides.back(),
|
||||
b_batch_strides.back(),
|
||||
c_batch_strides.back());
|
||||
|
||||
if ((batch_count / batch_shape.back()) == 1) {
|
||||
matmul.run(encoder, out, a, b, c, alpha_, beta_);
|
||||
return;
|
||||
}
|
||||
matmul.run_batched(
|
||||
gemm.run(
|
||||
encoder,
|
||||
out,
|
||||
a,
|
||||
|
||||
@@ -39,52 +39,98 @@ ternary_v(const bool* a, const T* b, const T* c, T* out, IdxT size) {
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Op, typename T, typename IdxT, int NDIM>
|
||||
template <typename Op, typename T, typename IdxT, int NDIM, int N_READS>
|
||||
__global__ void ternary_g_nd(
|
||||
const bool* a,
|
||||
const T* b,
|
||||
const T* c,
|
||||
T* out,
|
||||
IdxT size,
|
||||
IdxT size_rest,
|
||||
const __grid_constant__ cuda::std::array<int32_t, NDIM> shape,
|
||||
const __grid_constant__ cuda::std::array<int64_t, NDIM> a_strides,
|
||||
const __grid_constant__ cuda::std::array<int64_t, NDIM> b_strides,
|
||||
const __grid_constant__ cuda::std::array<int64_t, NDIM> c_strides) {
|
||||
IdxT index = cg::this_grid().thread_rank();
|
||||
if (index < size) {
|
||||
auto [a_idx, b_idx, c_idx] = elem_to_loc_nd<NDIM>(
|
||||
index,
|
||||
shape.data(),
|
||||
a_strides.data(),
|
||||
b_strides.data(),
|
||||
c_strides.data());
|
||||
out[index] = Op{}(a[a_idx], b[b_idx], c[c_idx]);
|
||||
auto block = cg::this_thread_block();
|
||||
auto grid = cg::this_grid();
|
||||
IdxT index_rest =
|
||||
grid.block_index().y * block.dim_threads().y + block.thread_index().y;
|
||||
if (index_rest >= size_rest) {
|
||||
return;
|
||||
}
|
||||
|
||||
auto shape_x = shape[NDIM - 1];
|
||||
auto a_stride_x = a_strides[NDIM - 1];
|
||||
auto b_stride_x = b_strides[NDIM - 1];
|
||||
auto c_stride_x = c_strides[NDIM - 1];
|
||||
IdxT index_x =
|
||||
grid.block_index().x * block.dim_threads().x + block.thread_index().x;
|
||||
auto [a_idx, b_idx, c_idx] = elem_to_loc_nd<NDIM>(
|
||||
index_rest * shape_x,
|
||||
shape.data(),
|
||||
a_strides.data(),
|
||||
b_strides.data(),
|
||||
c_strides.data());
|
||||
auto a_vec =
|
||||
load_vector<N_READS>(a + a_idx, index_x, shape_x, a_stride_x, false);
|
||||
auto b_vec =
|
||||
load_vector<N_READS>(b + b_idx, index_x, shape_x, b_stride_x, T(0));
|
||||
auto c_vec =
|
||||
load_vector<N_READS>(c + c_idx, index_x, shape_x, c_stride_x, T(0));
|
||||
|
||||
AlignedVector<T, N_READS> out_vec;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < N_READS; ++i) {
|
||||
out_vec[i] = Op{}(a_vec[i], b_vec[i], c_vec[i]);
|
||||
}
|
||||
store_vector(out + shape_x * index_rest, index_x, out_vec, shape_x);
|
||||
}
|
||||
|
||||
template <typename Op, typename T, typename IdxT>
|
||||
template <typename Op, typename T, typename IdxT, int N_READS>
|
||||
__global__ void ternary_g(
|
||||
const bool* a,
|
||||
const T* b,
|
||||
const T* c,
|
||||
T* out,
|
||||
IdxT size,
|
||||
IdxT size_rest,
|
||||
const __grid_constant__ Shape shape,
|
||||
const __grid_constant__ Strides a_strides,
|
||||
const __grid_constant__ Strides b_strides,
|
||||
const __grid_constant__ Strides c_strides,
|
||||
int ndim) {
|
||||
IdxT index = cg::this_grid().thread_rank();
|
||||
if (index < size) {
|
||||
auto [a_idx, b_idx, c_idx] = elem_to_loc(
|
||||
index,
|
||||
shape.data(),
|
||||
a_strides.data(),
|
||||
b_strides.data(),
|
||||
c_strides.data(),
|
||||
ndim);
|
||||
out[index] = Op{}(a[a_idx], b[b_idx], c[c_idx]);
|
||||
auto block = cg::this_thread_block();
|
||||
auto grid = cg::this_grid();
|
||||
IdxT index_rest =
|
||||
grid.block_index().y * block.dim_threads().y + block.thread_index().y;
|
||||
if (index_rest >= size_rest) {
|
||||
return;
|
||||
}
|
||||
|
||||
auto shape_x = shape[ndim - 1];
|
||||
auto a_stride_x = a_strides[ndim - 1];
|
||||
auto b_stride_x = b_strides[ndim - 1];
|
||||
auto c_stride_x = c_strides[ndim - 1];
|
||||
IdxT index_x =
|
||||
grid.block_index().x * block.dim_threads().x + block.thread_index().x;
|
||||
auto [a_idx, b_idx, c_idx] = elem_to_loc(
|
||||
index_rest * shape_x,
|
||||
shape.data(),
|
||||
a_strides.data(),
|
||||
b_strides.data(),
|
||||
c_strides.data(),
|
||||
ndim);
|
||||
auto a_vec =
|
||||
load_vector<N_READS>(a + a_idx, index_x, shape_x, a_stride_x, false);
|
||||
auto b_vec =
|
||||
load_vector<N_READS>(b + b_idx, index_x, shape_x, b_stride_x, T(0));
|
||||
auto c_vec =
|
||||
load_vector<N_READS>(c + c_idx, index_x, shape_x, c_stride_x, T(0));
|
||||
|
||||
AlignedVector<T, N_READS> out_vec;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < N_READS; ++i) {
|
||||
out_vec[i] = Op{}(a_vec[i], b_vec[i], c_vec[i]);
|
||||
}
|
||||
store_vector(out + shape_x * index_rest, index_x, out_vec, shape_x);
|
||||
}
|
||||
|
||||
} // namespace cu
|
||||
@@ -123,36 +169,55 @@ void ternary_op_gpu_inplace(
|
||||
auto& b_strides = strides[1];
|
||||
auto& c_strides = strides[2];
|
||||
int ndim = shape.size();
|
||||
int work_per_thread = 1;
|
||||
auto dim0 = ndim > 0 ? shape.back() : 1;
|
||||
auto rest = out.size() / dim0;
|
||||
if (dim0 >= 4) {
|
||||
work_per_thread = 4;
|
||||
}
|
||||
dim0 = (dim0 + work_per_thread - 1) / work_per_thread;
|
||||
auto block_dims = get_block_dims(dim0, rest, 1);
|
||||
uint32_t num_blocks_x = cuda::ceil_div(dim0, block_dims.x);
|
||||
uint32_t num_blocks_y = cuda::ceil_div(rest, block_dims.y);
|
||||
|
||||
if (ndim <= 3) {
|
||||
dispatch_1_2_3(ndim, [&](auto dims_constant) {
|
||||
auto [num_blocks, block_dims] = get_launch_args(out, large());
|
||||
auto kernel =
|
||||
cu::ternary_g_nd<Op, DType, IdxT, dims_constant(), 1>;
|
||||
if (work_per_thread == 4) {
|
||||
kernel =
|
||||
cu::ternary_g_nd<Op, DType, IdxT, dims_constant(), 4>;
|
||||
}
|
||||
encoder.add_kernel_node(
|
||||
cu::ternary_g_nd<Op, DType, IdxT, dims_constant()>,
|
||||
num_blocks,
|
||||
kernel,
|
||||
{num_blocks_x, num_blocks_y},
|
||||
block_dims,
|
||||
0,
|
||||
a.data<bool>(),
|
||||
b.data<DType>(),
|
||||
c.data<DType>(),
|
||||
out.data<DType>(),
|
||||
out.size(),
|
||||
rest,
|
||||
const_param<dims_constant()>(shape),
|
||||
const_param<dims_constant()>(a_strides),
|
||||
const_param<dims_constant()>(b_strides),
|
||||
const_param<dims_constant()>(c_strides));
|
||||
});
|
||||
} else {
|
||||
auto [num_blocks, block_dims] = get_launch_args(out, large());
|
||||
auto kernel = cu::ternary_g<Op, DType, IdxT, 1>;
|
||||
if (work_per_thread == 4) {
|
||||
kernel = cu::ternary_g<Op, DType, IdxT, 4>;
|
||||
}
|
||||
encoder.add_kernel_node(
|
||||
cu::ternary_g<Op, DType, IdxT>,
|
||||
num_blocks,
|
||||
kernel,
|
||||
{num_blocks_x, num_blocks_y},
|
||||
block_dims,
|
||||
0,
|
||||
a.data<bool>(),
|
||||
b.data<DType>(),
|
||||
c.data<DType>(),
|
||||
out.data<DType>(),
|
||||
out.data_size(),
|
||||
rest,
|
||||
const_param(shape),
|
||||
const_param(a_strides),
|
||||
const_param(b_strides),
|
||||
|
||||
@@ -37,19 +37,36 @@ __global__ void unary_v(const In* in, Out* out, IdxT size) {
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Op, typename In, typename Out, typename IdxT>
|
||||
template <typename Op, typename In, typename Out, typename IdxT, int N_READS>
|
||||
__global__ void unary_g(
|
||||
const In* in,
|
||||
Out* out,
|
||||
IdxT size,
|
||||
IdxT size_rest,
|
||||
const __grid_constant__ Shape shape,
|
||||
const __grid_constant__ Strides strides,
|
||||
int ndim) {
|
||||
IdxT index = cg::this_grid().thread_rank();
|
||||
if (index < size) {
|
||||
auto idx = elem_to_loc(index, shape.data(), strides.data(), ndim);
|
||||
out[index] = Op{}(in[idx]);
|
||||
auto block = cg::this_thread_block();
|
||||
auto grid = cg::this_grid();
|
||||
IdxT index_rest =
|
||||
grid.block_index().y * block.dim_threads().y + block.thread_index().y;
|
||||
if (index_rest >= size_rest) {
|
||||
return;
|
||||
}
|
||||
|
||||
auto shape_x = shape[ndim - 1];
|
||||
auto stride_x = strides[ndim - 1];
|
||||
IdxT index_x =
|
||||
grid.block_index().x * block.dim_threads().x + block.thread_index().x;
|
||||
auto idx =
|
||||
elem_to_loc(index_rest * shape_x, shape.data(), strides.data(), ndim);
|
||||
auto in_vec =
|
||||
load_vector<N_READS>(in + idx, index_x, shape_x, stride_x, In(0));
|
||||
AlignedVector<Out, N_READS> out_vec;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < N_READS; ++i) {
|
||||
out_vec[i] = Op{}(in_vec[i]);
|
||||
}
|
||||
store_vector(out + shape_x * index_rest, index_x, out_vec, shape_x);
|
||||
}
|
||||
|
||||
template <typename Op, typename In, typename Out>
|
||||
@@ -127,8 +144,7 @@ void unary_op_gpu_inplace(
|
||||
using OutType = cuda_type_t<CTYPE_OUT>;
|
||||
if (contig) {
|
||||
using IdxT = std::conditional_t<large(), int64_t, uint32_t>;
|
||||
// TODO: Choose optimized value based on type size.
|
||||
constexpr int N_READS = 4;
|
||||
constexpr int N_READS = 16 / sizeof(OutType);
|
||||
auto [num_blocks, block_dims] = get_launch_args(
|
||||
out.data_size(), out.shape(), out.strides(), large, N_READS);
|
||||
encoder.add_kernel_node(
|
||||
@@ -142,18 +158,30 @@ void unary_op_gpu_inplace(
|
||||
} else {
|
||||
using IdxT = std::conditional_t<large(), int64_t, int32_t>;
|
||||
auto [shape, strides] = collapse_contiguous_dims(in);
|
||||
auto [num_blocks, block_dims] = get_launch_args(out, large);
|
||||
auto ndim = shape.size();
|
||||
int work_per_thread = 1;
|
||||
auto kernel = cu::unary_g<Op, InType, OutType, IdxT, 1>;
|
||||
auto dim0 = ndim > 0 ? shape.back() : 1;
|
||||
auto rest = out.size() / dim0;
|
||||
if (dim0 >= 4) {
|
||||
kernel = cu::unary_g<Op, InType, OutType, IdxT, 4>;
|
||||
work_per_thread = 4;
|
||||
}
|
||||
dim0 = (dim0 + work_per_thread - 1) / work_per_thread;
|
||||
auto block_dims = get_block_dims(dim0, rest, 1);
|
||||
uint32_t num_blocks_x = cuda::ceil_div(dim0, block_dims.x);
|
||||
uint32_t num_blocks_y = cuda::ceil_div(rest, block_dims.y);
|
||||
encoder.add_kernel_node(
|
||||
cu::unary_g<Op, InType, OutType, IdxT>,
|
||||
num_blocks,
|
||||
kernel,
|
||||
{num_blocks_x, num_blocks_y},
|
||||
block_dims,
|
||||
0,
|
||||
in.data<InType>(),
|
||||
out.data<OutType>(),
|
||||
out.data_size(),
|
||||
rest,
|
||||
const_param(shape),
|
||||
const_param(strides),
|
||||
shape.size());
|
||||
ndim);
|
||||
}
|
||||
});
|
||||
} else {
|
||||
|
||||
34
mlx/backend/cuda/unary/CMakeLists.txt
Normal file
34
mlx/backend/cuda/unary/CMakeLists.txt
Normal file
@@ -0,0 +1,34 @@
|
||||
target_sources(
|
||||
mlx
|
||||
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/abs.cu
|
||||
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/arccos.cu
|
||||
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/arccosh.cu
|
||||
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/arcsin.cu
|
||||
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/arcsinh.cu
|
||||
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/arctan.cu
|
||||
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/arctanh.cu
|
||||
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/bitwise_invert.cu
|
||||
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/ceil.cu
|
||||
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/conjugate.cu
|
||||
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/cos.cu
|
||||
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/cosh.cu
|
||||
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/erf.cu
|
||||
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/erf_inv.cu
|
||||
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/exp.cu
|
||||
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/expm1.cu
|
||||
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/floor.cu
|
||||
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/imag.cu
|
||||
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/log.cu
|
||||
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/log1p.cu
|
||||
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/logical_not.cu
|
||||
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/negative.cu
|
||||
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/real.cu
|
||||
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/round.cu
|
||||
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/sigmoid.cu
|
||||
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/sign.cu
|
||||
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/sin.cu
|
||||
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/sinh.cu
|
||||
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/sqrt.cu
|
||||
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/square.cu
|
||||
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/tan.cu
|
||||
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/tanh.cu)
|
||||
7
mlx/backend/cuda/unary/abs.cu
Normal file
7
mlx/backend/cuda/unary/abs.cu
Normal file
@@ -0,0 +1,7 @@
|
||||
// Copyright © 2025 Apple Inc.
|
||||
|
||||
#include "mlx/backend/cuda/unary/unary.cuh"
|
||||
|
||||
namespace mlx::core {
|
||||
UNARY_GPU(Abs)
|
||||
} // namespace mlx::core
|
||||
7
mlx/backend/cuda/unary/arccos.cu
Normal file
7
mlx/backend/cuda/unary/arccos.cu
Normal file
@@ -0,0 +1,7 @@
|
||||
// Copyright © 2025 Apple Inc.
|
||||
|
||||
#include "mlx/backend/cuda/unary/unary.cuh"
|
||||
|
||||
namespace mlx::core {
|
||||
UNARY_GPU(ArcCos)
|
||||
} // namespace mlx::core
|
||||
7
mlx/backend/cuda/unary/arccosh.cu
Normal file
7
mlx/backend/cuda/unary/arccosh.cu
Normal file
@@ -0,0 +1,7 @@
|
||||
// Copyright © 2025 Apple Inc.
|
||||
|
||||
#include "mlx/backend/cuda/unary/unary.cuh"
|
||||
|
||||
namespace mlx::core {
|
||||
UNARY_GPU(ArcCosh)
|
||||
} // namespace mlx::core
|
||||
7
mlx/backend/cuda/unary/arcsin.cu
Normal file
7
mlx/backend/cuda/unary/arcsin.cu
Normal file
@@ -0,0 +1,7 @@
|
||||
// Copyright © 2025 Apple Inc.
|
||||
|
||||
#include "mlx/backend/cuda/unary/unary.cuh"
|
||||
|
||||
namespace mlx::core {
|
||||
UNARY_GPU(ArcSin)
|
||||
} // namespace mlx::core
|
||||
7
mlx/backend/cuda/unary/arcsinh.cu
Normal file
7
mlx/backend/cuda/unary/arcsinh.cu
Normal file
@@ -0,0 +1,7 @@
|
||||
// Copyright © 2025 Apple Inc.
|
||||
|
||||
#include "mlx/backend/cuda/unary/unary.cuh"
|
||||
|
||||
namespace mlx::core {
|
||||
UNARY_GPU(ArcSinh)
|
||||
} // namespace mlx::core
|
||||
7
mlx/backend/cuda/unary/arctan.cu
Normal file
7
mlx/backend/cuda/unary/arctan.cu
Normal file
@@ -0,0 +1,7 @@
|
||||
// Copyright © 2025 Apple Inc.
|
||||
|
||||
#include "mlx/backend/cuda/unary/unary.cuh"
|
||||
|
||||
namespace mlx::core {
|
||||
UNARY_GPU(ArcTan)
|
||||
} // namespace mlx::core
|
||||
7
mlx/backend/cuda/unary/arctanh.cu
Normal file
7
mlx/backend/cuda/unary/arctanh.cu
Normal file
@@ -0,0 +1,7 @@
|
||||
// Copyright © 2025 Apple Inc.
|
||||
|
||||
#include "mlx/backend/cuda/unary/unary.cuh"
|
||||
|
||||
namespace mlx::core {
|
||||
UNARY_GPU(ArcTanh)
|
||||
} // namespace mlx::core
|
||||
7
mlx/backend/cuda/unary/bitwise_invert.cu
Normal file
7
mlx/backend/cuda/unary/bitwise_invert.cu
Normal file
@@ -0,0 +1,7 @@
|
||||
// Copyright © 2025 Apple Inc.
|
||||
|
||||
#include "mlx/backend/cuda/unary/unary.cuh"
|
||||
|
||||
namespace mlx::core {
|
||||
UNARY_GPU(BitwiseInvert)
|
||||
} // namespace mlx::core
|
||||
7
mlx/backend/cuda/unary/ceil.cu
Normal file
7
mlx/backend/cuda/unary/ceil.cu
Normal file
@@ -0,0 +1,7 @@
|
||||
// Copyright © 2025 Apple Inc.
|
||||
|
||||
#include "mlx/backend/cuda/unary/unary.cuh"
|
||||
|
||||
namespace mlx::core {
|
||||
UNARY_GPU(Ceil)
|
||||
} // namespace mlx::core
|
||||
7
mlx/backend/cuda/unary/conjugate.cu
Normal file
7
mlx/backend/cuda/unary/conjugate.cu
Normal file
@@ -0,0 +1,7 @@
|
||||
// Copyright © 2025 Apple Inc.
|
||||
|
||||
#include "mlx/backend/cuda/unary/unary.cuh"
|
||||
|
||||
namespace mlx::core {
|
||||
UNARY_GPU(Conjugate)
|
||||
} // namespace mlx::core
|
||||
7
mlx/backend/cuda/unary/cos.cu
Normal file
7
mlx/backend/cuda/unary/cos.cu
Normal file
@@ -0,0 +1,7 @@
|
||||
// Copyright © 2025 Apple Inc.
|
||||
|
||||
#include "mlx/backend/cuda/unary/unary.cuh"
|
||||
|
||||
namespace mlx::core {
|
||||
UNARY_GPU(Cos)
|
||||
} // namespace mlx::core
|
||||
7
mlx/backend/cuda/unary/cosh.cu
Normal file
7
mlx/backend/cuda/unary/cosh.cu
Normal file
@@ -0,0 +1,7 @@
|
||||
// Copyright © 2025 Apple Inc.
|
||||
|
||||
#include "mlx/backend/cuda/unary/unary.cuh"
|
||||
|
||||
namespace mlx::core {
|
||||
UNARY_GPU(Cosh)
|
||||
} // namespace mlx::core
|
||||
7
mlx/backend/cuda/unary/erf.cu
Normal file
7
mlx/backend/cuda/unary/erf.cu
Normal file
@@ -0,0 +1,7 @@
|
||||
// Copyright © 2025 Apple Inc.
|
||||
|
||||
#include "mlx/backend/cuda/unary/unary.cuh"
|
||||
|
||||
namespace mlx::core {
|
||||
UNARY_GPU(Erf)
|
||||
} // namespace mlx::core
|
||||
7
mlx/backend/cuda/unary/erf_inv.cu
Normal file
7
mlx/backend/cuda/unary/erf_inv.cu
Normal file
@@ -0,0 +1,7 @@
|
||||
// Copyright © 2025 Apple Inc.
|
||||
|
||||
#include "mlx/backend/cuda/unary/unary.cuh"
|
||||
|
||||
namespace mlx::core {
|
||||
UNARY_GPU(ErfInv)
|
||||
} // namespace mlx::core
|
||||
7
mlx/backend/cuda/unary/exp.cu
Normal file
7
mlx/backend/cuda/unary/exp.cu
Normal file
@@ -0,0 +1,7 @@
|
||||
// Copyright © 2025 Apple Inc.
|
||||
|
||||
#include "mlx/backend/cuda/unary/unary.cuh"
|
||||
|
||||
namespace mlx::core {
|
||||
UNARY_GPU(Exp)
|
||||
} // namespace mlx::core
|
||||
7
mlx/backend/cuda/unary/expm1.cu
Normal file
7
mlx/backend/cuda/unary/expm1.cu
Normal file
@@ -0,0 +1,7 @@
|
||||
// Copyright © 2025 Apple Inc.
|
||||
|
||||
#include "mlx/backend/cuda/unary/unary.cuh"
|
||||
|
||||
namespace mlx::core {
|
||||
UNARY_GPU(Expm1)
|
||||
} // namespace mlx::core
|
||||
7
mlx/backend/cuda/unary/floor.cu
Normal file
7
mlx/backend/cuda/unary/floor.cu
Normal file
@@ -0,0 +1,7 @@
|
||||
// Copyright © 2025 Apple Inc.
|
||||
|
||||
#include "mlx/backend/cuda/unary/unary.cuh"
|
||||
|
||||
namespace mlx::core {
|
||||
UNARY_GPU(Floor)
|
||||
} // namespace mlx::core
|
||||
7
mlx/backend/cuda/unary/imag.cu
Normal file
7
mlx/backend/cuda/unary/imag.cu
Normal file
@@ -0,0 +1,7 @@
|
||||
// Copyright © 2025 Apple Inc.
|
||||
|
||||
#include "mlx/backend/cuda/unary/unary.cuh"
|
||||
|
||||
namespace mlx::core {
|
||||
UNARY_GPU(Imag)
|
||||
} // namespace mlx::core
|
||||
21
mlx/backend/cuda/unary/log.cu
Normal file
21
mlx/backend/cuda/unary/log.cu
Normal file
@@ -0,0 +1,21 @@
|
||||
// Copyright © 2025 Apple Inc.
|
||||
|
||||
#include "mlx/backend/cuda/unary/unary.cuh"
|
||||
|
||||
namespace mlx::core {
|
||||
void Log::eval_gpu(const std::vector<array>& inputs, array& out) {
|
||||
nvtx3::scoped_range r("Log::eval_gpu");
|
||||
auto& s = out.primitive().stream();
|
||||
switch (base_) {
|
||||
case Base::e:
|
||||
unary_op_gpu<cu::Log>(inputs, out, name(), s);
|
||||
break;
|
||||
case Base::two:
|
||||
unary_op_gpu<cu::Log2>(inputs, out, name(), s);
|
||||
break;
|
||||
case Base::ten:
|
||||
unary_op_gpu<cu::Log10>(inputs, out, name(), s);
|
||||
break;
|
||||
}
|
||||
}
|
||||
} // namespace mlx::core
|
||||
7
mlx/backend/cuda/unary/log1p.cu
Normal file
7
mlx/backend/cuda/unary/log1p.cu
Normal file
@@ -0,0 +1,7 @@
|
||||
// Copyright © 2025 Apple Inc.
|
||||
|
||||
#include "mlx/backend/cuda/unary/unary.cuh"
|
||||
|
||||
namespace mlx::core {
|
||||
UNARY_GPU(Log1p)
|
||||
} // namespace mlx::core
|
||||
7
mlx/backend/cuda/unary/logical_not.cu
Normal file
7
mlx/backend/cuda/unary/logical_not.cu
Normal file
@@ -0,0 +1,7 @@
|
||||
// Copyright © 2025 Apple Inc.
|
||||
|
||||
#include "mlx/backend/cuda/unary/unary.cuh"
|
||||
|
||||
namespace mlx::core {
|
||||
UNARY_GPU(LogicalNot)
|
||||
} // namespace mlx::core
|
||||
7
mlx/backend/cuda/unary/negative.cu
Normal file
7
mlx/backend/cuda/unary/negative.cu
Normal file
@@ -0,0 +1,7 @@
|
||||
// Copyright © 2025 Apple Inc.
|
||||
|
||||
#include "mlx/backend/cuda/unary/unary.cuh"
|
||||
|
||||
namespace mlx::core {
|
||||
UNARY_GPU(Negative)
|
||||
} // namespace mlx::core
|
||||
7
mlx/backend/cuda/unary/real.cu
Normal file
7
mlx/backend/cuda/unary/real.cu
Normal file
@@ -0,0 +1,7 @@
|
||||
// Copyright © 2025 Apple Inc.
|
||||
|
||||
#include "mlx/backend/cuda/unary/unary.cuh"
|
||||
|
||||
namespace mlx::core {
|
||||
UNARY_GPU(Real)
|
||||
} // namespace mlx::core
|
||||
18
mlx/backend/cuda/unary/round.cu
Normal file
18
mlx/backend/cuda/unary/round.cu
Normal file
@@ -0,0 +1,18 @@
|
||||
// Copyright © 2025 Apple Inc.
|
||||
|
||||
#include "mlx/backend/cuda/unary/unary.cuh"
|
||||
|
||||
namespace mlx::core {
|
||||
void Round::eval_gpu(const std::vector<array>& inputs, array& out) {
|
||||
nvtx3::scoped_range r("Round::eval_gpu");
|
||||
assert(inputs.size() == 1);
|
||||
const auto& in = inputs[0];
|
||||
auto& s = out.primitive().stream();
|
||||
if (issubdtype(in.dtype(), inexact)) {
|
||||
unary_op_gpu<cu::Round>(inputs, out, name(), s);
|
||||
} else {
|
||||
// No-op integer types
|
||||
out.copy_shared_buffer(in);
|
||||
}
|
||||
}
|
||||
} // namespace mlx::core
|
||||
7
mlx/backend/cuda/unary/sigmoid.cu
Normal file
7
mlx/backend/cuda/unary/sigmoid.cu
Normal file
@@ -0,0 +1,7 @@
|
||||
// Copyright © 2025 Apple Inc.
|
||||
|
||||
#include "mlx/backend/cuda/unary/unary.cuh"
|
||||
|
||||
namespace mlx::core {
|
||||
UNARY_GPU(Sigmoid)
|
||||
} // namespace mlx::core
|
||||
7
mlx/backend/cuda/unary/sign.cu
Normal file
7
mlx/backend/cuda/unary/sign.cu
Normal file
@@ -0,0 +1,7 @@
|
||||
// Copyright © 2025 Apple Inc.
|
||||
|
||||
#include "mlx/backend/cuda/unary/unary.cuh"
|
||||
|
||||
namespace mlx::core {
|
||||
UNARY_GPU(Sign)
|
||||
} // namespace mlx::core
|
||||
7
mlx/backend/cuda/unary/sin.cu
Normal file
7
mlx/backend/cuda/unary/sin.cu
Normal file
@@ -0,0 +1,7 @@
|
||||
// Copyright © 2025 Apple Inc.
|
||||
|
||||
#include "mlx/backend/cuda/unary/unary.cuh"
|
||||
|
||||
namespace mlx::core {
|
||||
UNARY_GPU(Sin)
|
||||
} // namespace mlx::core
|
||||
7
mlx/backend/cuda/unary/sinh.cu
Normal file
7
mlx/backend/cuda/unary/sinh.cu
Normal file
@@ -0,0 +1,7 @@
|
||||
// Copyright © 2025 Apple Inc.
|
||||
|
||||
#include "mlx/backend/cuda/unary/unary.cuh"
|
||||
|
||||
namespace mlx::core {
|
||||
UNARY_GPU(Sinh)
|
||||
} // namespace mlx::core
|
||||
15
mlx/backend/cuda/unary/sqrt.cu
Normal file
15
mlx/backend/cuda/unary/sqrt.cu
Normal file
@@ -0,0 +1,15 @@
|
||||
// Copyright © 2025 Apple Inc.
|
||||
|
||||
#include "mlx/backend/cuda/unary/unary.cuh"
|
||||
|
||||
namespace mlx::core {
|
||||
void Sqrt::eval_gpu(const std::vector<array>& inputs, array& out) {
|
||||
nvtx3::scoped_range r("Sqrt::eval_gpu");
|
||||
auto& s = out.primitive().stream();
|
||||
if (recip_) {
|
||||
unary_op_gpu<cu::Rsqrt>(inputs, out, "Rsqrt", s);
|
||||
} else {
|
||||
unary_op_gpu<cu::Sqrt>(inputs, out, "Sqrt", s);
|
||||
}
|
||||
}
|
||||
} // namespace mlx::core
|
||||
7
mlx/backend/cuda/unary/square.cu
Normal file
7
mlx/backend/cuda/unary/square.cu
Normal file
@@ -0,0 +1,7 @@
|
||||
// Copyright © 2025 Apple Inc.
|
||||
|
||||
#include "mlx/backend/cuda/unary/unary.cuh"
|
||||
|
||||
namespace mlx::core {
|
||||
UNARY_GPU(Square)
|
||||
} // namespace mlx::core
|
||||
7
mlx/backend/cuda/unary/tan.cu
Normal file
7
mlx/backend/cuda/unary/tan.cu
Normal file
@@ -0,0 +1,7 @@
|
||||
// Copyright © 2025 Apple Inc.
|
||||
|
||||
#include "mlx/backend/cuda/unary/unary.cuh"
|
||||
|
||||
namespace mlx::core {
|
||||
UNARY_GPU(Tan)
|
||||
} // namespace mlx::core
|
||||
7
mlx/backend/cuda/unary/tanh.cu
Normal file
7
mlx/backend/cuda/unary/tanh.cu
Normal file
@@ -0,0 +1,7 @@
|
||||
// Copyright © 2025 Apple Inc.
|
||||
|
||||
#include "mlx/backend/cuda/unary/unary.cuh"
|
||||
|
||||
namespace mlx::core {
|
||||
UNARY_GPU(Tanh)
|
||||
} // namespace mlx::core
|
||||
215
mlx/backend/cuda/unary/unary.cuh
Normal file
215
mlx/backend/cuda/unary/unary.cuh
Normal file
@@ -0,0 +1,215 @@
|
||||
// Copyright © 2025 Apple Inc.
|
||||
|
||||
#include "mlx/backend/common/unary.h"
|
||||
#include "mlx/backend/cuda/device.h"
|
||||
#include "mlx/backend/cuda/device/unary_ops.cuh"
|
||||
#include "mlx/backend/cuda/kernel_utils.cuh"
|
||||
#include "mlx/dtype_utils.h"
|
||||
#include "mlx/primitives.h"
|
||||
|
||||
#include <cooperative_groups.h>
|
||||
#include <nvtx3/nvtx3.hpp>
|
||||
|
||||
namespace mlx::core {
|
||||
|
||||
namespace cu {
|
||||
|
||||
namespace cg = cooperative_groups;
|
||||
|
||||
template <typename Op, typename In, typename Out, typename IdxT, int N_READS>
|
||||
__global__ void unary_v(const In* in, Out* out, IdxT size) {
|
||||
IdxT index = cg::this_grid().thread_rank();
|
||||
|
||||
if ((index + 1) * N_READS > size) {
|
||||
for (IdxT i = index * N_READS; i < size; ++i) {
|
||||
out[i] = Op{}(in[i]);
|
||||
}
|
||||
} else {
|
||||
auto in_vec = load_vector<N_READS>(in, index);
|
||||
|
||||
AlignedVector<Out, N_READS> out_vec;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < N_READS; ++i) {
|
||||
out_vec[i] = Op{}(in_vec[i]);
|
||||
}
|
||||
|
||||
store_vector<N_READS>(out, index, out_vec);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Op, typename In, typename Out, typename IdxT, int N_READS>
|
||||
__global__ void unary_g(
|
||||
const In* in,
|
||||
Out* out,
|
||||
IdxT size_rest,
|
||||
const __grid_constant__ Shape shape,
|
||||
const __grid_constant__ Strides strides,
|
||||
int ndim) {
|
||||
auto block = cg::this_thread_block();
|
||||
auto grid = cg::this_grid();
|
||||
IdxT index_rest =
|
||||
grid.block_index().y * block.dim_threads().y + block.thread_index().y;
|
||||
if (index_rest >= size_rest) {
|
||||
return;
|
||||
}
|
||||
|
||||
auto shape_x = shape[ndim - 1];
|
||||
auto stride_x = strides[ndim - 1];
|
||||
IdxT index_x =
|
||||
grid.block_index().x * block.dim_threads().x + block.thread_index().x;
|
||||
auto idx =
|
||||
elem_to_loc(index_rest * shape_x, shape.data(), strides.data(), ndim);
|
||||
auto in_vec =
|
||||
load_vector<N_READS>(in + idx, index_x, shape_x, stride_x, In(0));
|
||||
AlignedVector<Out, N_READS> out_vec;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < N_READS; ++i) {
|
||||
out_vec[i] = Op{}(in_vec[i]);
|
||||
}
|
||||
store_vector(out + shape_x * index_rest, index_x, out_vec, shape_x);
|
||||
}
|
||||
|
||||
template <typename Op, typename In, typename Out>
|
||||
constexpr bool supports_unary_op() {
|
||||
if (std::is_same_v<Op, Abs> || std::is_same_v<Op, Negative> ||
|
||||
std::is_same_v<Op, Sign> || std::is_same_v<Op, Square>) {
|
||||
return std::is_same_v<In, Out>;
|
||||
}
|
||||
if (std::is_same_v<Op, ArcCosh> || std::is_same_v<Op, ArcSinh> ||
|
||||
std::is_same_v<Op, ArcTanh> || std::is_same_v<Op, Erf> ||
|
||||
std::is_same_v<Op, ErfInv> || std::is_same_v<Op, Expm1> ||
|
||||
std::is_same_v<Op, Sigmoid>) {
|
||||
return std::is_same_v<In, Out> && is_floating_v<In>;
|
||||
}
|
||||
if (std::is_same_v<Op, BitwiseInvert>) {
|
||||
return std::is_same_v<In, Out> && std::is_integral_v<In> &&
|
||||
!std::is_same_v<In, bool>;
|
||||
}
|
||||
if (std::is_same_v<Op, Ceil> || std::is_same_v<Op, Floor>) {
|
||||
return std::is_same_v<In, Out> && !mlx::core::is_complex_v<In>;
|
||||
}
|
||||
if (std::is_same_v<Op, Conjugate>) {
|
||||
return std::is_same_v<In, Out> && mlx::core::is_complex_v<In>;
|
||||
}
|
||||
if (std::is_same_v<Op, ArcCos> || std::is_same_v<Op, ArcSin> ||
|
||||
std::is_same_v<Op, ArcTan> || std::is_same_v<Op, Cos> ||
|
||||
std::is_same_v<Op, Cosh> || std::is_same_v<Op, Exp> ||
|
||||
std::is_same_v<Op, Log> || std::is_same_v<Op, Log2> ||
|
||||
std::is_same_v<Op, Log10> || std::is_same_v<Op, Log1p> ||
|
||||
std::is_same_v<Op, Round> || std::is_same_v<Op, Rsqrt> ||
|
||||
std::is_same_v<Op, Sqrt> || std::is_same_v<Op, Sin> ||
|
||||
std::is_same_v<Op, Sinh> || std::is_same_v<Op, Tan> ||
|
||||
std::is_same_v<Op, Tanh>) {
|
||||
return std::is_same_v<In, Out> && is_inexact_v<In>;
|
||||
}
|
||||
if (std::is_same_v<Op, Imag> || std::is_same_v<Op, Real>) {
|
||||
return mlx::core::is_complex_v<In> && std::is_same_v<Out, float>;
|
||||
}
|
||||
if (std::is_same_v<Op, LogicalNot>) {
|
||||
return std::is_same_v<In, Out> && std::is_same_v<In, bool>;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
} // namespace cu
|
||||
|
||||
// Launch the elementwise unary op `Op` on the GPU, writing into `out`,
// whose storage must already be set (see unary_op_gpu for the allocating
// wrapper). `op` is the human-readable op name used only for error text.
template <typename Op>
void unary_op_gpu_inplace(
    const std::vector<array>& inputs,
    array& out,
    const char* op,
    const Stream& s) {
  auto& in = inputs[0];
  // Nothing to do for empty arrays; avoids launching a zero-sized kernel.
  if (in.size() == 0) {
    return;
  }
  bool contig = in.flags().contiguous;
  // "large" selects a 64-bit index type when element counts could overflow
  // the 32-bit index. NOTE(review): the non-contiguous path guards against
  // INT32_MAX (signed indices), the contiguous path against UINT32_MAX
  // (unsigned indices) — matching the IdxT choices below.
  bool large;
  if (!contig) {
    large = in.data_size() > INT32_MAX || out.size() > INT32_MAX;
  } else {
    large = in.data_size() > UINT32_MAX;
  }

  auto& encoder = cu::get_command_encoder(s);
  encoder.set_input_array(in);
  encoder.set_output_array(out);
  // Double dtype dispatch: resolve both the input and output C++ types at
  // compile time, then reject unsupported (Op, in, out) combinations.
  dispatch_all_types(in.dtype(), [&](auto in_type_tag) {
    dispatch_all_types(out.dtype(), [&](auto out_type_tag) {
      using CTYPE_IN = MLX_GET_TYPE(in_type_tag);
      using CTYPE_OUT = MLX_GET_TYPE(out_type_tag);
      if constexpr (cu::supports_unary_op<Op, CTYPE_IN, CTYPE_OUT>()) {
        dispatch_bool(large, [&](auto large) {
          using InType = cuda_type_t<CTYPE_IN>;
          using OutType = cuda_type_t<CTYPE_OUT>;
          if (contig) {
            // Contiguous fast path: vectorized kernel reading N_READS
            // elements per thread (16 bytes' worth of OutType).
            using IdxT = std::conditional_t<large(), int64_t, uint32_t>;
            constexpr int N_READS = 16 / sizeof(OutType);
            auto [num_blocks, block_dims] = get_launch_args(
                out.data_size(), out.shape(), out.strides(), large, N_READS);
            encoder.add_kernel_node(
                cu::unary_v<Op, InType, OutType, IdxT, N_READS>,
                num_blocks,
                block_dims,
                0,
                in.data<InType>(),
                out.data<OutType>(),
                out.data_size());
          } else {
            // General (strided) path: collapse contiguous dims first, then
            // launch a 2D grid of (innermost dim, remaining elements).
            using IdxT = std::conditional_t<large(), int64_t, int32_t>;
            auto [shape, strides] = collapse_contiguous_dims(in);
            auto ndim = shape.size();
            int work_per_thread = 1;
            auto kernel = cu::unary_g<Op, InType, OutType, IdxT, 1>;
            auto dim0 = ndim > 0 ? shape.back() : 1;
            auto rest = out.size() / dim0;
            // Process 4 elements per thread along the innermost dim when it
            // is wide enough to amortize the indexing cost.
            if (dim0 >= 4) {
              kernel = cu::unary_g<Op, InType, OutType, IdxT, 4>;
              work_per_thread = 4;
            }
            dim0 = (dim0 + work_per_thread - 1) / work_per_thread;
            auto block_dims = get_block_dims(dim0, rest, 1);
            uint32_t num_blocks_x = cuda::ceil_div(dim0, block_dims.x);
            uint32_t num_blocks_y = cuda::ceil_div(rest, block_dims.y);
            encoder.add_kernel_node(
                kernel,
                {num_blocks_x, num_blocks_y},
                block_dims,
                0,
                in.data<InType>(),
                out.data<OutType>(),
                rest,
                const_param(shape),
                const_param(strides),
                ndim);
          }
        });
      } else {
        // Unsupported dtype combination for this op: surface a clear error
        // rather than silently producing garbage.
        throw std::runtime_error(fmt::format(
            "Can not do unary op {} on input of {} with output of {}.",
            op,
            dtype_to_string(in.dtype()),
            dtype_to_string(out.dtype())));
      }
    });
  });
}
|
||||
|
||||
template <typename Op>
|
||||
void unary_op_gpu(
|
||||
const std::vector<array>& inputs,
|
||||
array& out,
|
||||
const char* op,
|
||||
const Stream& s) {
|
||||
set_unary_output_data(inputs[0], out);
|
||||
unary_op_gpu_inplace<Op>(inputs, out, op, s);
|
||||
}
|
||||
|
||||
// Generates `func::eval_gpu` for a unary primitive: opens an NVTX profiling
// range named after the primitive and forwards to unary_op_gpu with the
// matching device functor cu::func.
#define UNARY_GPU(func)                                               \
  void func::eval_gpu(const std::vector<array>& inputs, array& out) { \
    nvtx3::scoped_range r(#func "::eval_gpu");                        \
    auto& s = out.primitive().stream();                               \
    unary_op_gpu<cu::func>(inputs, out, name(), s);                   \
  }
|
||||
|
||||
} // namespace mlx::core
|
||||
@@ -134,6 +134,10 @@ instantiate_and_or(and, And)
|
||||
instantiate_and_or(or, Or)
|
||||
|
||||
#define instantiate_sum_prod(name, op) \
|
||||
instantiate_reduce_functions(name, uint8, uint8_t, int32_t, op) \
|
||||
instantiate_reduce_functions(name, uint16, uint16_t, uint32_t, op) \
|
||||
instantiate_reduce_functions(name, uint32, uint32_t, uint32_t, op) \
|
||||
instantiate_reduce_functions(name, uint64, uint64_t, uint64_t, op) \
|
||||
instantiate_reduce_functions(name, int8, int8_t, int32_t, op) \
|
||||
instantiate_reduce_functions(name, int16, int16_t, int32_t, op) \
|
||||
instantiate_reduce_functions(name, int32, int32_t, int32_t, op) \
|
||||
|
||||
@@ -247,15 +247,25 @@ std::pair<Dtype, Dtype> remap_reduce_types(
|
||||
const std::string& op_name) {
|
||||
if (op_name == "sum" || op_name == "prod") {
|
||||
if (issubdtype(in.dtype(), integer)) {
|
||||
switch (in.dtype().size()) {
|
||||
case 1:
|
||||
switch (in.dtype()) {
|
||||
case uint8:
|
||||
return {uint8, uint32};
|
||||
case uint16:
|
||||
return {uint16, uint32};
|
||||
case uint32:
|
||||
return {uint32, uint32};
|
||||
case uint64:
|
||||
return {uint64, uint64};
|
||||
case int8:
|
||||
return {int8, int32};
|
||||
case 2:
|
||||
case int16:
|
||||
return {int16, int32};
|
||||
case 4:
|
||||
case int32:
|
||||
return {int32, int32};
|
||||
case 8:
|
||||
case int64:
|
||||
return {int64, int64};
|
||||
default:
|
||||
throw std::runtime_error("Unsupported integer type");
|
||||
}
|
||||
}
|
||||
if (in.dtype() == bool_) {
|
||||
|
||||
31
mlx/ops.cpp
31
mlx/ops.cpp
@@ -2381,9 +2381,20 @@ array logsumexp(
|
||||
throw std::invalid_argument(
|
||||
"[logsumexp] Received non-empty axes for array with 0 dimensions.");
|
||||
}
|
||||
bool reduce_last_dim =
|
||||
!axes.empty() && (axes.back() == a.ndim() - 1 || axes.back() == -1);
|
||||
if (reduce_last_dim) {
|
||||
// For more than 2 axes check if axes is [0, 1, ..., NDIM - 1] and shape
|
||||
// is [1, 1, ..., N].
|
||||
for (int i = axes.size() - 2; i >= 0; --i) {
|
||||
if ((axes[i] + 1 != axes[i + 1]) || (a.shape(axes[i]) != 1)) {
|
||||
reduce_last_dim = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
bool is_complex = issubdtype(a.dtype(), complexfloating);
|
||||
if (!is_complex && axes.size() == 1 &&
|
||||
(a.ndim() == axes[0] + 1 || axes[0] == -1)) {
|
||||
if (!is_complex && reduce_last_dim) {
|
||||
auto dtype = at_least_float(a.dtype());
|
||||
auto out_shape = a.shape();
|
||||
out_shape.back() = 1;
|
||||
@@ -3403,10 +3414,20 @@ array softmax(
|
||||
throw std::invalid_argument(
|
||||
"[softmax] Received non-empty axes for array with 0 dimensions.");
|
||||
}
|
||||
|
||||
bool reduce_last_dim =
|
||||
!axes.empty() && (axes.back() == a.ndim() - 1 || axes.back() == -1);
|
||||
if (reduce_last_dim) {
|
||||
// For more than 2 axes check if axes is [0, 1, ..., NDIM - 1] and shape
|
||||
// is [1, 1, ..., N].
|
||||
for (int i = axes.size() - 2; i >= 0; --i) {
|
||||
if ((axes[i] + 1 != axes[i + 1]) || (a.shape(axes[i]) != 1)) {
|
||||
reduce_last_dim = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
bool is_complex = issubdtype(a.dtype(), complexfloating);
|
||||
if (!is_complex && axes.size() == 1 &&
|
||||
(a.ndim() == axes[0] + 1 || axes[0] == -1)) {
|
||||
if (!is_complex && reduce_last_dim) {
|
||||
auto dtype = at_least_float(a.dtype());
|
||||
return array(
|
||||
a.shape(),
|
||||
|
||||
@@ -2,6 +2,6 @@
|
||||
requires = [
|
||||
"setuptools>=80",
|
||||
"nanobind==2.4.0",
|
||||
"cmake>=3.25",
|
||||
"cmake>=3.25,<4.1",
|
||||
]
|
||||
build-backend = "setuptools.build_meta"
|
||||
|
||||
@@ -155,6 +155,19 @@ TEST_CASE("test gpu reduce") {
|
||||
CHECK_EQ(prod(a, Device::gpu).item<int32_t>(), 1);
|
||||
}
|
||||
|
||||
// sum and prod overflow
|
||||
{
|
||||
auto a = full({256, 2, 2}, 1u, uint8);
|
||||
CHECK_EQ(sum(a, Device::gpu).item<uint32_t>(), 256 * 4);
|
||||
CHECK_EQ(prod(a, Device::gpu).item<uint32_t>(), 1);
|
||||
|
||||
a = full({65535, 2, 2}, 1u, uint16);
|
||||
CHECK_EQ(sum(a, Device::gpu).item<uint32_t>(), 65535 * 4);
|
||||
CHECK_EQ(prod(a, Device::gpu).item<uint32_t>(), 1);
|
||||
}
|
||||
}
|
||||
|
||||
TEST_CASE("test gpu reduce with axes") {
|
||||
// reducing only some axes and irregular layouts
|
||||
{
|
||||
array a(1.0f);
|
||||
|
||||
@@ -915,6 +915,23 @@ TEST_CASE("test reduction ops") {
|
||||
CHECK(array_equal(sum(x, 1), array({3.0f, 6.0f}, {2})).item<bool>());
|
||||
}
|
||||
|
||||
// Test unsigned sum
|
||||
{
|
||||
const int num_elems = 1000;
|
||||
|
||||
auto x = astype(full({num_elems}, 255), uint8);
|
||||
CHECK_EQ(sum(x, Device::cpu).item<uint32_t>(), 255 * num_elems);
|
||||
|
||||
x = astype(full({num_elems}, 65535), uint16);
|
||||
CHECK_EQ(sum(x, Device::cpu).item<uint32_t>(), 65535 * num_elems);
|
||||
|
||||
x = full({3, 3, 3}, 10000, uint32);
|
||||
CHECK_EQ(sum(x, Device::cpu).item<uint32_t>(), 270000);
|
||||
|
||||
x = full({3, 3, 3}, 10000, uint64);
|
||||
CHECK_EQ(sum(x, Device::cpu).item<uint64_t>(), 270000);
|
||||
}
|
||||
|
||||
// Test prod
|
||||
{
|
||||
auto x = array({});
|
||||
@@ -947,6 +964,21 @@ TEST_CASE("test reduction ops") {
|
||||
CHECK(array_equal(prod(x, 1), array({true, false})).item<bool>());
|
||||
}
|
||||
|
||||
// Test unsigned prod
|
||||
{
|
||||
auto x = array({255, 255}, {2}, uint8);
|
||||
CHECK_EQ(prod(x, Device::cpu).item<uint32_t>(), 65025);
|
||||
|
||||
x = array({65535, 2}, {2}, uint16);
|
||||
CHECK_EQ(prod(x, Device::cpu).item<uint32_t>(), 131070);
|
||||
|
||||
x = array({100000, 2}, {2}, uint32);
|
||||
CHECK_EQ(prod(x, Device::cpu).item<uint32_t>(), 200000);
|
||||
|
||||
x = array({100000, 2}, {2}, uint64);
|
||||
CHECK_EQ(prod(x, Device::cpu).item<uint64_t>(), 200000);
|
||||
}
|
||||
|
||||
// Test all
|
||||
{
|
||||
auto x = array({});
|
||||
|
||||
Reference in New Issue
Block a user