From 3eb59aab6ec5d418965f978783a6c70051bf695b Mon Sep 17 00:00:00 2001
From: Cheng <git@zcbenz.com>
Date: Tue, 8 Jul 2025 00:22:12 +0000
Subject: [PATCH] Do vectorized store/load in copy ops

---
 mlx/backend/cuda/copy/copy_contiguous.cu | 59 ++++++++++++++++++++----
 1 file changed, 50 insertions(+), 9 deletions(-)
diff --git a/mlx/backend/cuda/copy/copy_contiguous.cu b/mlx/backend/cuda/copy/copy_contiguous.cu
index 408350129..60f66f984 100644
--- a/mlx/backend/cuda/copy/copy_contiguous.cu
+++ b/mlx/backend/cuda/copy/copy_contiguous.cu
@@ -10,19 +10,53 @@ namespace cu {
 
 namespace cg = cooperative_groups;
 
+// Broadcast copy: each of the `size` outputs receives the cast of in[0].
+// Thread `index` handles the index-th group of N_READS consecutive outputs.
-template <typename In, typename Out, typename IdxT>
+template <typename In, typename Out, typename IdxT, int N_READS>
 __global__ void copy_s(const In* in, Out* out, IdxT size) {
   IdxT index = cg::this_grid().thread_rank();
-  if (index < size) {
-    out[index] = CastOp<In, Out>{}(in[0]);
+  // Bounds math stays in IdxT: narrowing to `int` would truncate 64-bit
+  // sizes and make 32-bit sizes above INT_MAX look negative.
+  const IdxT full_vecs = size / N_READS;
+  if (index < full_vecs) {
+    AlignedVector<Out, N_READS> out_vec;
+#pragma unroll
+    for (int i = 0; i < N_READS; ++i) {
+      out_vec.val[i] = CastOp<In, Out>{}(in[0]);
+    }
+    // Presumably writes out[index * N_READS ...] as one wide store.
+    store_vector<N_READS>(out, index, out_vec);
+  } else if (index == full_vecs) {
+    // Tail (size % N_READS elements); index * N_READS <= size, no overflow.
+    for (IdxT i = index * N_READS; i < size; ++i) {
+      out[i] = CastOp<In, Out>{}(in[0]);
+    }
   }
 }
 
+// Elementwise copy: out[i] receives the cast of in[i] for i in [0, size).
+// Thread `index` handles the index-th group of N_READS consecutive elements.
-template <typename In, typename Out, typename IdxT>
+template <typename In, typename Out, typename IdxT, int N_READS>
 __global__ void copy_v(const In* in, Out* out, IdxT size) {
   IdxT index = cg::this_grid().thread_rank();
-  if (index < size) {
-    out[index] = CastOp<In, Out>{}(in[index]);
+  // Bounds math stays in IdxT: narrowing to `int` would truncate 64-bit
+  // sizes and make 32-bit sizes above INT_MAX look negative.
+  const IdxT full_vecs = size / N_READS;
+  if (index < full_vecs) {
+    // Presumably wide accesses to the index-th group of N_READS elements.
+    auto in_vec = load_vector<N_READS>(in, index);
+
+    AlignedVector<Out, N_READS> out_vec;
+#pragma unroll
+    for (int i = 0; i < N_READS; ++i) {
+      out_vec.val[i] = CastOp<In, Out>{}(in_vec.val[i]);
+    }
+    store_vector<N_READS>(out, index, out_vec);
+  } else if (index == full_vecs) {
+    // Tail (size % N_READS elements); index * N_READS <= size, no overflow.
+    for (IdxT i = index * N_READS; i < size; ++i) {
+      out[i] = CastOp<In, Out>{}(in[i]);
+    }
   }
 }
 
@@ -41,12 +75,19 @@ void copy_contiguous(
         using InType = cuda_type_t<MLX_GET_TYPE(in_type_tag)>;
         using OutType = cuda_type_t<MLX_GET_TYPE(out_type_tag)>;
         using IdxT = std::conditional_t<large(), int64_t, uint32_t>;
-        auto kernel = cu::copy_s<InType, OutType, IdxT>;
+        // TODO: Choose optimized value based on type size.
+        constexpr int N_READS = 4;
+        auto kernel = cu::copy_s<InType, OutType, IdxT, N_READS>;
         if (ctype == CopyType::Vector) {
-          kernel = cu::copy_v<InType, OutType, IdxT>;
+          kernel = cu::copy_v<InType, OutType, IdxT, N_READS>;
         }
         auto [num_blocks, block_dims] = get_launch_args(
-            kernel, out.data_size(), out.shape(), out.strides(), large());
+            kernel,
+            out.data_size(),
+            out.shape(),
+            out.strides(),
+            large(),
+            N_READS);
         encoder.add_kernel_node(
             kernel,
             num_blocks,