Move arange to its own file (#2438)

2025-11-01 00:28:11 +08:00 · 2025-07-30 13:05:51 +09:00
parent 254476718b
commit a0ae49d397
4 changed files with 57 additions and 52 deletions
--- a/mlx/backend/cuda/CMakeLists.txt
+++ b/mlx/backend/cuda/CMakeLists.txt
@@ -6,6 +6,7 @@
 target_sources(
  mlx
  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/allocator.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/arange.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/arg_reduce.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/binary.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/binary_two.cu
@@ -29,7 +30,7 @@ target_sources(
          ${CMAKE_CURRENT_SOURCE_DIR}/matmul.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/layer_norm.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/logsumexp.cu
-          ${CMAKE_CURRENT_SOURCE_DIR}/primitives.cu
+          ${CMAKE_CURRENT_SOURCE_DIR}/primitives.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/random.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/reduce.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/reduce/all_reduce.cu
--- a/mlx/backend/cuda/arange.cu
+++ b/mlx/backend/cuda/arange.cu
@@ -0,0 +1,55 @@
+// Copyright © 2025 Apple Inc.
+
+#include "mlx/backend/cuda/device.h"
+#include "mlx/backend/cuda/device/fp16_math.cuh"
+#include "mlx/backend/cuda/kernel_utils.cuh"
+#include "mlx/dtype_utils.h"
+#include "mlx/primitives.h"
+
+#include <nvtx3/nvtx3.hpp>
+#include <thrust/device_ptr.h>
+#include <thrust/transform.h>
+
+namespace mlx::core {
+
+namespace cu {
+
+// Functor that maps an element index to the corresponding term of an
+// arithmetic sequence, i.e. `start + i * step`.
+template <typename T>
+struct Arange {
+  const T start; // term produced for index 0
+  const T step; // increment applied per index
+
+  // Device-only call operator; `i` is a 32-bit unsigned element index.
+  __device__ T operator()(uint32_t i) const {
+    return start + i * step;
+  }
+};
+
+} // namespace cu
+
+// Fills `out` with an arithmetic sequence built from the members `start_`
+// and `step_`, by running `thrust::transform` over a counting range on the
+// command encoder's stream.
+//
+// `inputs` is not read anywhere in this function.
+// `out` receives freshly allocated storage unless it is empty.
+void Arange::eval_gpu(const std::vector<array>& inputs, array& out) {
+  nvtx3::scoped_range r("Arange::eval_gpu");
+  // Empty output: return before allocating or touching the encoder.
+  if (out.size() == 0) {
+    return;
+  }
+  out.set_data(allocator::malloc(out.nbytes()));
+
+  auto& encoder = cu::get_command_encoder(stream());
+  encoder.set_output_array(out);
+
+  // `capture` is not referenced again below; presumably a scope guard whose
+  // lifetime brackets the thrust call -- TODO confirm against the encoder API.
+  auto capture = encoder.capture_context();
+  dispatch_int_float_types(out.dtype(), "Arange", [&](auto type_tag) {
+    using CTYPE = MLX_GET_TYPE(type_tag);
+    using OutType = cuda_type_t<CTYPE>;
+    // The step is taken as the difference between the first two terms after
+    // each has been converted to CTYPE, rather than converting `step_`
+    // directly. NOTE(review): presumably so the step reflects rounding in the
+    // output type -- confirm intent.
+    CTYPE step =
+        static_cast<CTYPE>(start_ + step_) - static_cast<CTYPE>(start_);
+    // Indices are generated as uint32_t, so the end bound `out.data_size()`
+    // is converted to 32 bits here. NOTE(review): verify that outputs larger
+    // than the uint32_t range cannot reach this path.
+    thrust::transform(
+        cu::thrust_policy(encoder.stream()),
+        thrust::counting_iterator<uint32_t>(0),
+        thrust::counting_iterator<uint32_t>(out.data_size()),
+        thrust::device_pointer_cast(out.data<OutType>()),
+        cu::Arange<OutType>{
+            static_cast<OutType>(start_), static_cast<OutType>(step)});
+  });
+}
+
+} // namespace mlx::core
--- a/mlx/backend/cuda/device/arange.cuh
+++ b/mlx/backend/cuda/device/arange.cuh
@@ -1,15 +0,0 @@
-// Copyright © 2025 Apple Inc.
-
-namespace mlx::core::cu {
-
-template <typename T>
-struct Arange {
-  const T start;
-  const T step;
-
-  __device__ T operator()(uint32_t i) const {
-    return start + i * step;
-  }
-};
-
-} // namespace mlx::core::cu
--- a/mlx/backend/cuda/primitives.cpp
+++ b/mlx/backend/cuda/primitives.cpp
@@ -1,47 +1,11 @@
 // Copyright © 2025 Apple Inc.

-#include "mlx/backend/cuda/device.h"
-#include "mlx/backend/cuda/device/arange.cuh"
-#include "mlx/backend/cuda/device/fp16_math.cuh"
-#include "mlx/backend/cuda/kernel_utils.cuh"
 #include "mlx/distributed/primitives.h"
-#include "mlx/dtype_utils.h"
 #include "mlx/fast_primitives.h"
 #include "mlx/primitives.h"

-#include <nvtx3/nvtx3.hpp>
-#include <thrust/device_ptr.h>
-#include <thrust/transform.h>
-
-#include <cassert>
-
 namespace mlx::core {

-void Arange::eval_gpu(const std::vector<array>& inputs, array& out) {
-  nvtx3::scoped_range r("Arange::eval_gpu");
-  assert(inputs.size() == 0);
-  out.set_data(allocator::malloc(out.nbytes()));
-  if (out.size() == 0) {
-    return;
-  }
-  auto& encoder = cu::get_command_encoder(stream());
-  encoder.set_output_array(out);
-  auto capture = encoder.capture_context();
-  dispatch_int_float_types(out.dtype(), "Arange", [&](auto type_tag) {
-    using CTYPE = MLX_GET_TYPE(type_tag);
-    using OutType = cuda_type_t<CTYPE>;
-    CTYPE step =
-        static_cast<CTYPE>(start_ + step_) - static_cast<CTYPE>(start_);
-    thrust::transform(
-        cu::thrust_policy(encoder.stream()),
-        thrust::counting_iterator<uint32_t>(0),
-        thrust::counting_iterator<uint32_t>(out.data_size()),
-        thrust::device_pointer_cast(out.data<OutType>()),
-        cu::Arange<OutType>{
-            static_cast<OutType>(start_), static_cast<OutType>(step)});
-  });
-}
-
 bool fast::ScaledDotProductAttention::use_fallback(
    const array& q,
    const array& k,