reduce binary size (#1952)

2025-12-16 01:49:05 +08:00 · 2025-03-11 06:30:44 -07:00
parent 117e1355a2
commit 736a340478
16 changed files with 2145 additions and 2386 deletions
--- a/mlx/backend/cpu/arg_reduce.cpp
+++ b/mlx/backend/cpu/arg_reduce.cpp
@@ -11,12 +11,7 @@ namespace mlx::core {
 namespace {

 template <typename InT, typename OpT>
-void arg_reduce(
-    const array& in,
-    array& out,
-    const OpT& op,
-    int axis,
-    Stream stream) {
+void arg_reduce(const array& in, array& out, const OpT& op, int axis) {
  auto axis_size = in.shape()[axis];
  auto axis_stride = in.strides()[axis];
  Strides strides = in.strides();
@@ -26,28 +21,16 @@ void arg_reduce(
  auto in_ptr = in.data<InT>();
  auto out_ptr = out.data<uint32_t>();

-  auto& encoder = cpu::get_command_encoder(stream);
-  encoder.set_input_array(in);
-  encoder.set_output_array(out);
-  encoder.dispatch([in_ptr,
-                    out_ptr,
-                    axis_size,
-                    axis_stride,
-                    op = std::move(op),
-                    shape = std::move(shape),
-                    strides = std::move(strides),
-                    size = out.size()]() {
-    for (uint32_t i = 0; i < size; ++i) {
-      auto loc = elem_to_loc(i, shape, strides);
-      auto local_in_ptr = in_ptr + loc;
-      uint32_t ind_v = 0;
-      InT v = (*local_in_ptr);
-      for (uint32_t j = 0; j < axis_size; ++j, local_in_ptr += axis_stride) {
-        op(j, (*local_in_ptr), &ind_v, &v);
-      }
-      out_ptr[i] = ind_v;
+  for (uint32_t i = 0; i < out.size(); ++i) { // one pass per output element
+    auto loc = elem_to_loc(i, shape, strides); // NOTE(review): presumably the input offset for output i -- confirm
+    auto local_in_ptr = in_ptr + loc;
+    uint32_t ind_v = 0; // running arg index, starts at position 0 on the axis
+    InT v = (*local_in_ptr); // running value, seeded from the first element read
+    for (uint32_t j = 0; j < axis_size; ++j, local_in_ptr += axis_stride) { // step along the reduced axis by its stride
+      op(j, (*local_in_ptr), &ind_v, &v); // op gets ind_v and v by pointer so it can update them
    }
-  });
+    out_ptr[i] = ind_v;
+  }
 }

 template <typename InT>
@@ -55,8 +38,7 @@ void arg_reduce_dispatch(
    const array& in,
    array& out,
    ArgReduce::ReduceType rtype,
-    int axis,
-    Stream stream) {
+    int axis) {
  switch (rtype) {
    case ArgReduce::ArgMin: {
      auto op = [](auto ind_x, auto x, auto ind_y, auto y) {
@@ -65,7 +47,7 @@ void arg_reduce_dispatch(
          (*ind_y) = ind_x;
        }
      };
-      arg_reduce<InT>(in, out, op, axis, stream);
+      arg_reduce<InT>(in, out, op, axis);
      break;
    }
    case ArgReduce::ArgMax: {
@@ -75,7 +57,7 @@ void arg_reduce_dispatch(
          (*ind_y) = ind_x;
        }
      };
-      arg_reduce<InT>(in, out, op, axis, stream);
+      arg_reduce<InT>(in, out, op, axis);
      break;
    }
  }
@@ -87,51 +69,58 @@ void ArgReduce::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 1);
  auto& in = inputs[0];
  out.set_data(allocator::malloc_or_wait(out.nbytes()));
-
-  switch (in.dtype()) {
-    case bool_:
-      arg_reduce_dispatch<bool>(in, out, reduce_type_, axis_, stream());
-      break;
-    case uint8:
-      arg_reduce_dispatch<uint8_t>(in, out, reduce_type_, axis_, stream());
-      break;
-    case uint16:
-      arg_reduce_dispatch<uint16_t>(in, out, reduce_type_, axis_, stream());
-      break;
-    case uint32:
-      arg_reduce_dispatch<uint32_t>(in, out, reduce_type_, axis_, stream());
-      break;
-    case uint64:
-      arg_reduce_dispatch<uint64_t>(in, out, reduce_type_, axis_, stream());
-      break;
-    case int8:
-      arg_reduce_dispatch<int8_t>(in, out, reduce_type_, axis_, stream());
-      break;
-    case int16:
-      arg_reduce_dispatch<int16_t>(in, out, reduce_type_, axis_, stream());
-      break;
-    case int32:
-      arg_reduce_dispatch<int32_t>(in, out, reduce_type_, axis_, stream());
-      break;
-    case int64:
-      arg_reduce_dispatch<int64_t>(in, out, reduce_type_, axis_, stream());
-      break;
-    case float16:
-      arg_reduce_dispatch<float16_t>(in, out, reduce_type_, axis_, stream());
-      break;
-    case float32:
-      arg_reduce_dispatch<float>(in, out, reduce_type_, axis_, stream());
-      break;
-    case bfloat16:
-      arg_reduce_dispatch<bfloat16_t>(in, out, reduce_type_, axis_, stream());
-      break;
-    case float64:
-      arg_reduce_dispatch<double>(in, out, reduce_type_, axis_, stream());
-      break;
-    case complex64:
-      arg_reduce_dispatch<complex64_t>(in, out, reduce_type_, axis_, stream());
-      break;
-  }
+  auto& encoder = cpu::get_command_encoder(stream());
+  encoder.set_input_array(in);
+  encoder.set_output_array(out);
+  encoder.dispatch([in = array::unsafe_weak_copy(in), // NOTE(review): weak copies presumably do not own the buffers -- confirm lifetime
+                    out = array::unsafe_weak_copy(out),
+                    reduce_type_ = reduce_type_, // copy the members by value; `this` is not captured
+                    axis_ = axis_]() mutable { // mutable: arg_reduce_dispatch takes `out` by non-const reference
+    switch (in.dtype()) { // element type is selected inside the dispatched callable
+      case bool_:
+        arg_reduce_dispatch<bool>(in, out, reduce_type_, axis_);
+        break;
+      case uint8:
+        arg_reduce_dispatch<uint8_t>(in, out, reduce_type_, axis_);
+        break;
+      case uint16:
+        arg_reduce_dispatch<uint16_t>(in, out, reduce_type_, axis_);
+        break;
+      case uint32:
+        arg_reduce_dispatch<uint32_t>(in, out, reduce_type_, axis_);
+        break;
+      case uint64:
+        arg_reduce_dispatch<uint64_t>(in, out, reduce_type_, axis_);
+        break;
+      case int8:
+        arg_reduce_dispatch<int8_t>(in, out, reduce_type_, axis_);
+        break;
+      case int16:
+        arg_reduce_dispatch<int16_t>(in, out, reduce_type_, axis_);
+        break;
+      case int32:
+        arg_reduce_dispatch<int32_t>(in, out, reduce_type_, axis_);
+        break;
+      case int64:
+        arg_reduce_dispatch<int64_t>(in, out, reduce_type_, axis_);
+        break;
+      case float16:
+        arg_reduce_dispatch<float16_t>(in, out, reduce_type_, axis_);
+        break;
+      case float32:
+        arg_reduce_dispatch<float>(in, out, reduce_type_, axis_);
+        break;
+      case bfloat16:
+        arg_reduce_dispatch<bfloat16_t>(in, out, reduce_type_, axis_);
+        break;
+      case float64:
+        arg_reduce_dispatch<double>(in, out, reduce_type_, axis_);
+        break;
+      case complex64:
+        arg_reduce_dispatch<complex64_t>(in, out, reduce_type_, axis_);
+        break;
+    }
+  });
 }

 } // namespace mlx::core