rebase + nit (#2260)

Co-authored-by: Awni Hannun <awni@apple.com>
2025-12-16 01:49:05 +08:00 · 2025-06-11 02:51:51 +09:00
parent 62fecf3e13
commit 99c33d011d
10 changed files with 604 additions and 28 deletions
--- a/mlx/backend/cuda/copy.cu
+++ b/mlx/backend/cuda/copy.cu
@@ -0,0 +1,89 @@
+// Copyright © 2025 Apple Inc.
+
+#include "mlx/backend/common/utils.h"
+#include "mlx/backend/cuda/copy/copy.cuh"
+
+namespace mlx::core {
+
+// Encodes a copy from `in_` into `out` on stream `s`, choosing the kernel
+// family from `ctype`:
+//   - Scalar / Vector: copy_contiguous, using only the two static offsets.
+//   - General: copy_general_input, with the collapsed shape and the first
+//     collapsed stride set.
+//   - GeneralGeneral: copy_general with both collapsed stride sets, or
+//     copy_general_dynamic when at least one dynamic offset is provided.
+//
+// Returns immediately when `out` is empty. If `in_.data_shared_ptr()` is
+// null, `out` is used as the source array instead. The dynamic offsets are
+// only consulted on the GeneralGeneral path; any other `ctype` value falls
+// through without encoding a kernel.
+void copy_gpu_inplace(
+    const array& in_,
+    array& out,
+    const Shape& shape,
+    const Strides& strides_in,
+    const Strides& strides_out,
+    int64_t offset_in,
+    int64_t offset_out,
+    CopyType ctype,
+    const Stream& s,
+    const std::optional<array>& dynamic_offset_in,
+    const std::optional<array>& dynamic_offset_out) {
+  if (out.size() == 0) {
+    return;
+  }
+
+  // Read from `out` itself when `in_` carries no data pointer.
+  const array& in = !in_.data_shared_ptr() ? out : in_;
+
+  auto& enc = cu::get_command_encoder(s);
+  enc.set_input_array(in);
+  enc.set_output_array(out);
+
+  switch (ctype) {
+    case CopyType::Scalar:
+    case CopyType::Vector:
+      copy_contiguous(enc, ctype, in, out, offset_in, offset_out);
+      return;
+    case CopyType::General:
+    case CopyType::GeneralGeneral:
+      break; // Strided paths are handled below.
+    default:
+      return;
+  }
+
+  // Collapse contiguous dimensions for both stride sets at once. The result
+  // keeps the order they were passed in: [0] from strides_in, [1] from
+  // strides_out.
+  auto [dims, strides] = collapse_contiguous_dims(
+      shape, std::vector{strides_in, strides_out}, INT32_MAX);
+
+  if (ctype == CopyType::General) {
+    copy_general_input(
+        enc, ctype, in, out, offset_in, offset_out, dims, strides[0]);
+    return;
+  }
+
+  // From here on ctype is GeneralGeneral.
+  const bool has_dynamic_offset = dynamic_offset_in || dynamic_offset_out;
+  if (!has_dynamic_offset) {
+    copy_general(
+        enc,
+        ctype,
+        in,
+        out,
+        offset_in,
+        offset_out,
+        dims,
+        strides[0],
+        strides[1]);
+    return;
+  }
+
+  // A side without a dynamic offset gets `array(0, int64)` in its place.
+  // NOTE(review): those fallback arrays are temporaries that end with this
+  // statement -- confirm copy_general_dynamic does not need them to outlive
+  // the call.
+  copy_general_dynamic(
+      enc,
+      ctype,
+      in,
+      out,
+      offset_in,
+      offset_out,
+      dims,
+      strides[0],
+      strides[1],
+      dynamic_offset_in ? *dynamic_offset_in : array(0, int64),
+      dynamic_offset_out ? *dynamic_offset_out : array(0, int64));
+}
+
+// Allocates storage for `out` and encodes a CopyType::Scalar contiguous copy
+// from `in` into it on stream `s`, with both offsets set to zero. Returns
+// without allocating or encoding anything when `out` has no elements.
+void fill_gpu(const array& in, array& out, const Stream& s) {
+  const bool has_elements = out.size() != 0;
+  if (has_elements) {
+    // Give `out` a fresh buffer of out.nbytes() bytes before encoding.
+    out.set_data(allocator::malloc(out.nbytes()));
+
+    auto& enc = cu::get_command_encoder(s);
+    enc.set_input_array(in);
+    enc.set_output_array(out);
+    copy_contiguous(enc, CopyType::Scalar, in, out, 0, 0);
+  }
+}
+
+} // namespace mlx::core