Fix a couple of slicing bugs (#1827)

* fix a few bugs

* fix conv grad

* speedup test

* comment
Author: Awni Hannun
Date: 2025-02-05 19:50:08 -08:00
Committed by: GitHub
Parent: 9174606d4c
Commit: af1b725fda
14 changed files with 170 additions and 107 deletions

View File

@@ -35,4 +35,29 @@ void shared_buffer_slice(
   move_or_copy(in, out, out_strides, flags, data_size, data_offset);
 }
 
+void slice(
+    const array& in,
+    array& out,
+    const Shape& start_indices,
+    const Shape& strides) {
+  if (out.size() == 0) {
+    out.set_data(nullptr);
+    return;
+  }
+
+  // Calculate out strides, initial offset
+  auto [data_offset, inp_strides] = prepare_slice(in, start_indices, strides);
+  int64_t data_end = 1;
+  for (int i = 0; i < start_indices.size(); ++i) {
+    if (in.shape()[i] > 1) {
+      auto end_idx = start_indices[i] + out.shape()[i] * strides[i] - 1;
+      data_end += end_idx * in.strides()[i];
+    }
+  }
+  // data_end can be -1
+  size_t data_size =
+      data_end < 0 ? (data_offset - data_end) : (data_end - data_offset);
+  shared_buffer_slice(in, inp_strides, data_offset, data_size, out);
+}
+
 } // namespace mlx::core
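
The new slice() handles negative slice strides (e.g. a reversed slice): the last element a slice touches can sit before the starting offset, so data_end can go negative (hence the "// data_end can be -1" comment), and the old unsigned data_end - data_offset wrapped around to a huge data_size. A minimal standalone sketch of that arithmetic for a reversed 1-D slice (plain C++, not MLX code; the start/stride/length values are only an example):

#include <cstddef>
#include <cstdint>
#include <cstdio>

int main() {
  // Reversed slice of a contiguous length-8 vector: start 7, stride -1, 8 outputs.
  int64_t in_stride = 1;
  int64_t start = 7;
  int64_t stride = -1;
  int64_t out_len = 8;

  int64_t data_offset = start * in_stride;         // 7
  int64_t end_idx = start + out_len * stride - 1;  // 7 - 8 - 1 = -2
  int64_t data_end = 1 + end_idx * in_stride;      // -1, i.e. "data_end can be -1"

  // Old computation: the unsigned subtraction wraps when data_end < data_offset.
  std::size_t old_size = static_cast<std::size_t>(data_end - data_offset);
  // New computation: measure the extent in the right direction.
  std::size_t new_size =
      data_end < 0 ? (data_offset - data_end) : (data_end - data_offset);

  std::printf("old data_size = %zu (wrapped)\n", old_size);
  std::printf("new data_size = %zu\n", new_size);  // 8 elements are touched
}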

View File

@@ -11,11 +11,10 @@ std::tuple<int64_t, Strides> prepare_slice(
     const Shape& start_indices,
     const Shape& strides);
 
-void shared_buffer_slice(
+void slice(
     const array& in,
-    const Strides& out_strides,
-    size_t data_offset,
-    size_t data_size,
-    array& out);
+    array& out,
+    const Shape& start_indices,
+    const Shape& strides);
 
 } // namespace mlx::core

View File

@@ -86,7 +86,7 @@ void NumberOfElements::eval_cpu(const std::vector<array>& inputs, array& out) {
   eval(inputs, out);
 }
 void Slice::eval_cpu(const std::vector<array>& inputs, array& out) {
-  eval(inputs, out);
+  slice(inputs[0], out, start_indices_, strides_);
 }
 void Split::eval_cpu(
     const std::vector<array>& inputs,
@@ -262,29 +262,6 @@ void Reshape::eval_cpu(const std::vector<array>& inputs, array& out) {
   reshape(inputs[0], out);
 }
 
-void Slice::eval(const std::vector<array>& inputs, array& out) {
-  assert(inputs.size() == 1);
-  if (out.size() == 0) {
-    out.set_data(nullptr);
-    return;
-  }
-
-  auto& in = inputs[0];
-
-  // Calculate out strides, initial offset and if copy needs to be made
-  auto [data_offset, inp_strides] = prepare_slice(in, start_indices_, strides_);
-  size_t data_end = 1;
-  for (int i = 0; i < end_indices_.size(); ++i) {
-    if (in.shape()[i] > 1) {
-      auto end_idx = start_indices_[i] + out.shape()[i] * strides_[i] - 1;
-      data_end += end_idx * in.strides()[i];
-    }
-  }
-  size_t data_size = data_end - data_offset;
-  Strides ostrides{inp_strides.begin(), inp_strides.end()};
-  shared_buffer_slice(in, ostrides, data_offset, data_size, out);
-}
-
 void DynamicSlice::eval_cpu(const std::vector<array>& inputs, array& out) {
   if (out.size() == 0) {
     out.set_data(nullptr);
@@ -355,7 +332,8 @@ void SliceUpdate::eval_cpu(const std::vector<array>& inputs, array& out) {
   copy(in, out, in.data_size() == 1 ? CopyType::Scalar : ctype);
 
   // Calculate out strides, initial offset and if copy needs to be made
-  auto [data_offset, out_strides] = prepare_slice(in, start_indices_, strides_);
+  auto [data_offset, out_strides] =
+      prepare_slice(out, start_indices_, strides_);
 
   // Do copy
   copy_inplace(
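
The SliceUpdate fix computes the destination view from out rather than in: prepare_slice(out, ...) gives the offset and strides of the region being written, which is a strided view of out's buffer, so the in-place copy has to be described in out's layout (the Metal SliceUpdate below gets the same change). A standalone sketch of writing an update through the destination's own offset and strides (plain C++, not MLX code; the 4x4 row-major example is illustrative):

#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  // Destination: 4x4 row-major buffer, initially zero.
  std::vector<float> out(16, 0.0f);
  int64_t out_strides[2] = {4, 1};

  // Slice update out[1:3, 1:3] = upd, where upd is a contiguous 2x2 block.
  std::vector<float> upd = {1.0f, 2.0f, 3.0f, 4.0f};
  int64_t start[2] = {1, 1};

  // Offset and strides of the written view come from out's layout.
  int64_t data_offset = start[0] * out_strides[0] + start[1] * out_strides[1];
  for (int64_t i = 0; i < 2; ++i) {
    for (int64_t j = 0; j < 2; ++j) {
      out[data_offset + i * out_strides[0] + j * out_strides[1]] =
          upd[i * 2 + j];
    }
  }

  for (int i = 0; i < 4; ++i) {
    for (int j = 0; j < 4; ++j) {
      std::printf("%g ", out[4 * i + j]);
    }
    std::printf("\n");
  }
}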

View File

@@ -11,7 +11,7 @@
   instantiate_kernel(                                                           \
       "gn1_" #op #in_tname #out_tname, unary_g, in_type, out_type, op, 1, int)  \
   instantiate_kernel(                                                           \
-      "gn4large" #op #in_tname #out_tname, unary_g, in_type, out_type, op, 4)
+      "gn4large_" #op #in_tname #out_tname, unary_g, in_type, out_type, op, 4)
 
 #define instantiate_unary_all_same(op, tname, type) \
   instantiate_unary_all(op, tname, tname, type, type)
View File

@@ -499,8 +499,8 @@ void SliceUpdate::eval_gpu(const std::vector<array>& inputs, array& out) {
           ? CopyType::Vector
           : CopyType::General;
   copy_gpu(in, out, in.data_size() == 1 ? CopyType::Scalar : ctype, stream());
 
-  auto [data_offset, out_strides] = prepare_slice(in, start_indices_, strides_);
-
+  auto [data_offset, out_strides] =
+      prepare_slice(out, start_indices_, strides_);
   // Do copy
   copy_gpu_inplace(

View File

@@ -14,18 +14,7 @@ void slice_gpu(
     const Shape& start_indices,
     const Shape& strides,
     const Stream& s) {
-  // Calculate out strides and initial offset
-  auto [data_offset, inp_strides] = prepare_slice(in, start_indices, strides);
-
-  size_t data_end = 1;
-  for (int i = 0; i < strides.size(); ++i) {
-    if (in.shape()[i] > 1) {
-      auto end_idx = start_indices[i] + out.shape()[i] * strides[i] - 1;
-      data_end += end_idx * in.strides()[i];
-    }
-  }
-  size_t data_size = data_end - data_offset;
-  shared_buffer_slice(in, inp_strides, data_offset, data_size, out);
+  slice(in, out, start_indices, strides);
 }
 
 void concatenate_gpu(

View File

@@ -1,5 +1,4 @@
 // Copyright © 2024 Apple Inc.
 
-#include "mlx/backend/common/utils.h"
 #include "mlx/backend/metal/device.h"
 #include "mlx/backend/metal/kernels.h"
@@ -49,7 +48,7 @@ void unary_op_gpu_inplace(
   } else {
     kernel_name = "gn" + std::to_string(work_per_thread);
     if (large) {
-      kernel_name += "_large";
+      kernel_name += "large";
     }
   }
   concatenate(kernel_name, "_", op, type_to_name(in), type_to_name(out));
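
The two unary changes keep the instantiated kernel name and the name built at dispatch in sync: the macro now ends the prefix with an underscore ("gn4large_") and the host appends "large" without one, so joining the base name, "_", and the op/type names reproduces exactly what was instantiated. A standalone sketch of the name construction (plain C++, not MLX code; the op and type strings are only illustrative):

#include <iostream>
#include <string>

int main() {
  int work_per_thread = 4;
  bool large = true;
  std::string op = "Abs", in_t = "float32", out_t = "float32";

  // Host side: build the base name, then join it with "_" and the op/type names.
  std::string kernel_name = "gn" + std::to_string(work_per_thread);
  if (large) {
    kernel_name += "large";  // was "_large" before the fix, giving "gn4_large_..."
  }
  std::string dispatched = kernel_name + "_" + op + in_t + out_t;

  // Instantiation side: the fixed macro bakes in "gn4large_" before the op name.
  std::string instantiated = std::string("gn4large_") + op + in_t + out_t;

  std::cout << dispatched << "\n" << instantiated << "\n";
  std::cout << (dispatched == instantiated ? "match" : "MISMATCH") << "\n";
}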