Remove segmented reduce and fix row reduce

2025-06-24 09:21:16 +08:00 · 2025-06-19 02:53:41 -07:00 · 2025-06-19 02:53:41 -07:00 · 0ce20290b9
commit 0ce20290b9
parent 6a59c92457
3 changed files with 0 additions and 87 deletions
--- a/mlx/backend/cuda/CMakeLists.txt
+++ b/mlx/backend/cuda/CMakeLists.txt
@ -32,7 +32,6 @@ target_sources(
          ${CMAKE_CURRENT_SOURCE_DIR}/reduce/all_reduce.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/reduce/col_reduce.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/reduce/row_reduce.cu
-          ${CMAKE_CURRENT_SOURCE_DIR}/reduce/segmented_reduce.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/rms_norm.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/rope.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/slicing.cpp
--- a/mlx/backend/cuda/reduce/row_reduce.cu
+++ b/mlx/backend/cuda/reduce/row_reduce.cu
@ -249,8 +249,6 @@ __global__ void row_reduce_looped(
  size_t full_blocks = args.row_size / (BLOCK_DIM_X * N_READS);
  size_t final_offset = full_blocks * BLOCK_DIM_X * N_READS;

-  in += elem_to_loc(out_idx, args.shape.data(), args.strides.data(), args.ndim);
-
  for (size_t n = 0; n < args.non_row_reductions; n++) {
    for (size_t r = 0; r < full_blocks; r++) {
      T vals[N_READS];
--- a/mlx/backend/cuda/reduce/segmented_reduce.cu
+++ b/mlx/backend/cuda/reduce/segmented_reduce.cu
@ -1,84 +0,0 @@
-// Copyright © 2025 Apple Inc.
-
-#include "mlx/backend/cuda/device.h"
-#include "mlx/backend/cuda/device/cast_op.cuh"
-#include "mlx/backend/cuda/reduce/reduce.cuh"
-
-#include <thrust/device_ptr.h>
-#include <cub/device/device_reduce.cuh>
-#include <cub/device/device_segmented_reduce.cuh>
-
-namespace mlx::core {
-
-template <typename... Args>
-void cub_all_reduce(cu::CommandEncoder& encoder, Args&&... args) {
-  // Allocate temporary storage.
-  size_t size;
-  CHECK_CUDA_ERROR(cub::DeviceReduce::Reduce(nullptr, size, args...));
-  array temp(allocator::malloc(size), {static_cast<int>(size)}, uint8);
-  encoder.add_temporary(temp);
-  // Run op.
-  CHECK_CUDA_ERROR(cub::DeviceReduce::Reduce(temp.data<void>(), size, args...));
-}
-
-template <typename... Args>
-void cub_segmented_reduce(cu::CommandEncoder& encoder, Args&&... args) {
-  // Allocate temporary storage.
-  size_t size;
-  CHECK_CUDA_ERROR(cub::DeviceSegmentedReduce::Reduce(nullptr, size, args...));
-  array temp(allocator::malloc(size), {static_cast<int>(size)}, uint8);
-  encoder.add_temporary(temp);
-  // Run op.
-  CHECK_CUDA_ERROR(
-      cub::DeviceSegmentedReduce::Reduce(temp.data<void>(), size, args...));
-}
-
-struct MultiplyOp {
-  int factor;
-  __device__ int operator()(int i) {
-    return i * factor;
-  }
-};
-
-void segmented_reduce(
-    cu::CommandEncoder& encoder,
-    const array& in,
-    array& out,
-    Reduce::ReduceType reduce_type,
-    const std::vector<int>& axes,
-    const ReductionPlan& plan) {
-  encoder.launch_kernel([&](cudaStream_t stream) {
-    MLX_SWITCH_ALL_TYPES(in.dtype(), CTYPE, {
-      MLX_SWITCH_REDUCE_OPS(reduce_type, OP, {
-        using InType = cuda_type_t<CTYPE>;
-        using OutType = cu::ReduceResult<OP, InType>::type;
-        auto in_iter = cu::make_cast_iterator<OutType>(
-            thrust::device_pointer_cast(in.data<InType>()));
-        auto out_ptr = thrust::device_pointer_cast(out.data<OutType>());
-        auto init = cu::ReduceInit<OP, InType>::value();
-
-        if (plan.type == ContiguousAllReduce) {
-          cub_all_reduce(
-              encoder, in_iter, out_ptr, in.data_size(), OP(), init, stream);
-        } else if (plan.type == ContiguousReduce) {
-          auto offsets = thrust::make_transform_iterator(
-              thrust::make_counting_iterator(0), MultiplyOp{plan.shape.back()});
-          cub_segmented_reduce(
-              encoder,
-              in_iter,
-              out_ptr,
-              out.size(),
-              offsets,
-              offsets + 1,
-              OP(),
-              init,
-              stream);
-        } else {
-          throw std::runtime_error("Unsupported plan in segmented_reduce.");
-        }
-      });
-    });
-  });
-}
-
-} // namespace mlx::core