[CUDA] Remove thrust in arange (#2535)

2025-12-07 03:18:15 +08:00 · 2025-08-24 16:22:36 +09:00
parent f55b6f1f2f
commit 333ffea273
2 changed files with 33 additions and 31 deletions
--- a/mlx/backend/cuda/arange.cu
+++ b/mlx/backend/cuda/arange.cu
@@ -6,23 +6,33 @@
 #include "mlx/dtype_utils.h"
 #include "mlx/primitives.h"

+#include <cooperative_groups.h>
 #include <nvtx3/nvtx3.hpp>
-#include <thrust/device_ptr.h>
-#include <thrust/transform.h>

 namespace mlx::core {

 namespace cu {

-template <typename T>
-struct Arange {
-  const T start;
-  const T step;
+namespace cg = cooperative_groups;

-  __device__ T operator()(uint32_t i) const {
-    return start + i * step;
+template <typename T, typename IdxT, int N_WRITES> // out[i] = start + i * step
+__global__ void arange(T* out, IdxT size, T start, T step) {
+  IdxT index = cg::this_grid().thread_rank();
+  // Compare against size / N_WRITES: (index + 1) * N_WRITES can overflow IdxT.
+  if (index == size / N_WRITES) { // rank owning the partial tail, if any
+    for (IdxT i = index * N_WRITES; i < size; ++i) {
+      out[i] = start + i * step;
+    }
+  } else if (index < size / N_WRITES) { // ranks past the tail write nothing
+    AlignedVector<T, N_WRITES> out_vec;
+#pragma unroll
+    for (int i = 0; i < N_WRITES; ++i) {
+      out_vec[i] = start + (index * N_WRITES + i) * step;
+    }
+    // Full chunk: hand all N_WRITES values for this rank to store_vector.
+    store_vector<N_WRITES>(out, index, out_vec);
  }
-};
+}

 } // namespace cu

@@ -36,19 +46,23 @@ void Arange::eval_gpu(const std::vector<array>& inputs, array& out) {
  auto& encoder = cu::get_command_encoder(stream());
  encoder.set_output_array(out);

-  auto capture = encoder.capture_context();
  dispatch_int_float_types(out.dtype(), "Arange", [&](auto type_tag) {
    using CTYPE = MLX_GET_TYPE(type_tag);
    using OutType = cuda_type_t<CTYPE>;
-    CTYPE step =
-        static_cast<CTYPE>(start_ + step_) - static_cast<CTYPE>(start_);
-    thrust::transform(
-        cu::thrust_policy(encoder.stream()),
-        thrust::counting_iterator<uint32_t>(0),
-        thrust::counting_iterator<uint32_t>(out.data_size()),
-        thrust::device_pointer_cast(out.data<OutType>()),
-        cu::Arange<OutType>{
-            static_cast<OutType>(start_), static_cast<OutType>(step)});
+    constexpr int N_WRITES = 16 / sizeof(OutType);
+    dispatch_bool(out.data_size() > INT32_MAX, [&](auto large) {
+      using IdxT = std::conditional_t<large(), int64_t, int32_t>;
+      auto [num_blocks, block_dims] = get_launch_args(out, large(), N_WRITES);
+      encoder.add_kernel_node(
+          cu::arange<OutType, IdxT, N_WRITES>,
+          num_blocks,
+          block_dims,
+          0,
+          out.data<OutType>(),
+          out.data_size(),
+          static_cast<CTYPE>(start_),
+          static_cast<CTYPE>(start_ + step_) - static_cast<CTYPE>(start_));
+    });
  });
 }

--- a/mlx/backend/cuda/device/cast_op.cuh
+++ b/mlx/backend/cuda/device/cast_op.cuh
@@ -6,7 +6,6 @@

 #include <cuda_bf16.h>
 #include <cuda_fp16.h>
-#include <thrust/iterator/transform_iterator.h>

 namespace mlx::core::cu {

@@ -116,15 +115,4 @@ inline __host__ __device__ auto cast_to(SrcT x) {
  return CastOp<SrcT, DstT>{}(x);
 }

-// Return an iterator that cast the value to DstT using CastOp.
-template <typename DstT, typename Iterator>
-inline __host__ __device__ auto make_cast_iterator(Iterator it) {
-  using SrcT = typename cuda::std::iterator_traits<Iterator>::value_type;
-  if constexpr (std::is_same_v<SrcT, DstT>) {
-    return it;
-  } else {
-    return thrust::make_transform_iterator(it, CastOp<SrcT, DstT>{});
-  }
-}
-
 } // namespace mlx::core::cu