[CUDA] Implement DynamicSlice/DynamicSliceUpdate (#2533)

* Move DynamicSlice to gpu/primitives
* Implement compute_dynamic_offset in CUDA
2025-12-16 01:49:05 +08:00 · 2025-08-26 07:31:39 +09:00
parent 2ca75bb529
commit 4822c3dbe9
12 changed files with 226 additions and 134 deletions
--- a/mlx/backend/cuda/slicing.cpp
+++ b/mlx/backend/cuda/slicing.cpp
@@ -1,8 +1,11 @@
 // Copyright © 2025 Apple Inc.

 #include "mlx/backend/common/slicing.h"
+#include "mlx/backend/cuda/device.h"
+#include "mlx/backend/cuda/jit_module.h"
 #include "mlx/backend/gpu/copy.h"
 #include "mlx/backend/gpu/slicing.h"
+#include "mlx/dtype_utils.h"

 #include <numeric>

@@ -38,4 +41,71 @@ void concatenate_gpu(
  }
 }

+array compute_dynamic_offset( // Returns a 1-element int64 array that a kernel fills with sum_i indices[i] * strides[axes[i]].
+    const array& indices, // passed to the kernel as `const T*`; its first axes.size() elements are read
+    const Strides& strides, // looked up in the kernel as strides[axes[i]]
+    const std::vector<int>& axes, // one entry per index; its size becomes the NIDX template argument
+    const Stream& s) { // supplies the device for JIT lookup and the command encoder
+  Dtype dtype = indices.dtype(); // element type of the indices; becomes template argument T
+  int nidx = axes.size(); // number of indices; becomes template argument NIDX
+
+  std::string module_name = // JIT module key: distinct per (dtype, nidx) pair
+      fmt::format("compute_dynamic_offset_{}_{}", dtype_to_string(dtype), nidx);
+  std::string kernel_name = fmt::format( // fully qualified template instantiation, reused for lookup below
+      "mlx::core::cu::compute_dynamic_offset<{}, {}>",
+      dtype_to_cuda_type(dtype),
+      nidx);
+
+  cu::JitModule& mod = cu::get_jit_module(s.device, module_name, [&]() { // lambda builds the source; presumably only run when the module is not cached - confirm
+    std::string source = R"(
+        #include "mlx/backend/cuda/device/utils.cuh"
+
+        namespace mlx::core::cu {
+
+        template <typename T, int NIDX>
+        __global__ void compute_dynamic_offset(
+            const T* indices,
+            int64_t* offset,
+            const __grid_constant__ Strides strides,
+            const __grid_constant__ cuda::std::array<int, NIDX> axes) {
+          int64_t acc = 0;
+          #pragma unroll
+          for (int i = 0; i < NIDX; ++i) {
+            acc += indices[i] * strides[axes[i]];
+          }
+          *offset = acc;
+        }
+
+        } // namespace mlx::core::cu
+    )";
+    return std::make_tuple(false, std::move(source), std::vector{kernel_name}); // NOTE(review): meaning of the leading `false` is defined by get_jit_module - confirm
+  });
+
+  // Prepare output: one int64, reusing the indices buffer when it is donatable and big enough.
+  array offset({1}, int64, nullptr, {}); // shape {1}, dtype int64; storage is attached below
+  bool donate = indices.is_donatable() && // reuse only if indices may be donated...
+      (indices.data_size() * indices.itemsize()) >= offset.itemsize(); // ...and its byte size can hold one int64
+  if (donate) {
+    offset.copy_shared_buffer(indices); // aliasing is fine: the kernel accumulates in a local and writes *offset once, after all reads
+  } else {
+    offset.set_data(allocator::malloc(offset.itemsize())); // fresh allocation sized for exactly one element
+  }
+
+  auto& encoder = cu::get_command_encoder(s); // encoder associated with stream s
+  encoder.add_temporary(offset); // NOTE(review): presumably keeps the buffer alive until the queued work finishes - confirm
+  encoder.set_input_array(indices); // declare what the kernel reads
+  encoder.set_output_array(offset); // declare what the kernel writes
+
+  cu::KernelArgs args; // appended in the kernel's parameter order: indices, offset, strides, axes
+  args.append(indices);
+  args.append(offset);
+  args.append_ndim(strides); // NOTE(review): presumably packs strides into the kernel's fixed-size `Strides` parameter - confirm
+  args.append(axes);
+
+  auto kernel = mod.get_kernel(kernel_name); // same instantiation name that was registered with the module
+  encoder.add_kernel_node(kernel, 1, 1, 0, args.args()); // NOTE(review): 1, 1, 0 presumably grid=1, block=1, no shared memory; the kernel does no thread indexing
+
+  return offset; // contents are produced by the kernel, not computed on the host here
+}
+
 } // namespace mlx::core