diff --git a/mlx/backend/cuda/copy.cu b/mlx/backend/cuda/copy.cu index 321806720..158d3de6e 100644 --- a/mlx/backend/cuda/copy.cu +++ b/mlx/backend/cuda/copy.cu @@ -15,8 +15,8 @@ void copy_gpu_inplace( int64_t offset_out, CopyType ctype, const Stream& s, - const std::optional& dynamic_offset_in, - const std::optional& dynamic_offset_out) { + std::optional dynamic_offset_in, + std::optional dynamic_offset_out) { if (out.size() == 0) { return; } @@ -44,6 +44,16 @@ void copy_gpu_inplace( strides_vec[0]); } else { if (dynamic_offset_in || dynamic_offset_out) { + if (!dynamic_offset_in) { + dynamic_offset_in = array(0, int64); + encoder.add_temporary(*dynamic_offset_in); + } + if (!dynamic_offset_out) { + dynamic_offset_out = array(0, int64); + encoder.add_temporary(*dynamic_offset_out); + } + encoder.set_input_array(*dynamic_offset_in); + encoder.set_input_array(*dynamic_offset_out); copy_general_dynamic( encoder, ctype, @@ -54,8 +64,8 @@ void copy_gpu_inplace( shape_collapsed, strides_vec[0], strides_vec[1], - dynamic_offset_in ? *dynamic_offset_in : array(0, int64), - dynamic_offset_out ? *dynamic_offset_out : array(0, int64)); + *dynamic_offset_in, + *dynamic_offset_out); } else { copy_general( encoder, diff --git a/mlx/backend/cuda/indexing.cpp b/mlx/backend/cuda/indexing.cpp index 829529609..1d867e063 100644 --- a/mlx/backend/cuda/indexing.cpp +++ b/mlx/backend/cuda/indexing.cpp @@ -110,7 +110,7 @@ void Gather::eval_gpu(const std::vector& inputs, array& out) { args.append(src.ndim()); args.append_ndim(slice_sizes_); args.append(slice_size); - args.append(SmallVector(axes_.begin(), axes_.end())); + args.append(axes_); append_indices_arg(args, inputs, nidx, idx_ndim); std::string kernel_name = fmt::format( @@ -211,7 +211,7 @@ void Scatter::eval_gpu(const std::vector& inputs, array& out) { args.append_ndim(out.shape()); args.append_ndim(out.strides()); args.append(out.ndim()); - args.append(SmallVector(axes_.begin(), axes_.end())); + args.append(axes_); append_indices_arg(args, inputs, nidx, idx_ndim); std::string kernel_name = fmt::format( diff --git a/mlx/backend/cuda/jit_module.h b/mlx/backend/cuda/jit_module.h index d919f9bc0..cc569690a 100644 --- a/mlx/backend/cuda/jit_module.h +++ b/mlx/backend/cuda/jit_module.h @@ -46,6 +46,11 @@ struct KernelArgs { append_ptr(std::get>(storage_.back()).data()); } + template + void append(const std::vector& vec) { + append(SmallVector(vec.begin(), vec.end())); + } + // Make sure the arg is copied to an array with size of NDIM. template void append_ndim(SmallVector vec) { diff --git a/mlx/backend/cuda/primitives.cpp b/mlx/backend/cuda/primitives.cpp index f9a594ab8..77c295665 100644 --- a/mlx/backend/cuda/primitives.cpp +++ b/mlx/backend/cuda/primitives.cpp @@ -24,8 +24,6 @@ namespace mlx::core { } NO_GPU(BlockMaskedMM) -NO_GPU(DynamicSlice) -NO_GPU(DynamicSliceUpdate) NO_GPU(FFT) NO_GPU(GatherMM) NO_GPU(GatherQMM) diff --git a/mlx/backend/cuda/slicing.cpp b/mlx/backend/cuda/slicing.cpp index af67fbbdd..18cc14bbd 100644 --- a/mlx/backend/cuda/slicing.cpp +++ b/mlx/backend/cuda/slicing.cpp @@ -1,8 +1,11 @@ // Copyright © 2025 Apple Inc. #include "mlx/backend/common/slicing.h" +#include "mlx/backend/cuda/device.h" +#include "mlx/backend/cuda/jit_module.h" #include "mlx/backend/gpu/copy.h" #include "mlx/backend/gpu/slicing.h" +#include "mlx/dtype_utils.h" #include @@ -38,4 +41,71 @@ void concatenate_gpu( } } +array compute_dynamic_offset( + const array& indices, + const Strides& strides, + const std::vector& axes, + const Stream& s) { + Dtype dtype = indices.dtype(); + int nidx = axes.size(); + + std::string module_name = + fmt::format("compute_dynamic_offset_{}_{}", dtype_to_string(dtype), nidx); + std::string kernel_name = fmt::format( + "mlx::core::cu::compute_dynamic_offset<{}, {}>", + dtype_to_cuda_type(dtype), + nidx); + + cu::JitModule& mod = cu::get_jit_module(s.device, module_name, [&]() { + std::string source = R"( + #include "mlx/backend/cuda/device/utils.cuh" + + namespace mlx::core::cu { + + template + __global__ void compute_dynamic_offset( + const T* indices, + int64_t* offset, + const __grid_constant__ Strides strides, + const __grid_constant__ cuda::std::array axes) { + int64_t acc = 0; + #pragma unroll + for (int i = 0; i < NIDX; ++i) { + acc += indices[i] * strides[axes[i]]; + } + *offset = acc; + } + + } // namespace mlx::core::cu + )"; + return std::make_tuple(false, std::move(source), std::vector{kernel_name}); + }); + + // Prepare output. + array offset({1}, int64, nullptr, {}); + bool donate = indices.is_donatable() && + (indices.data_size() * indices.itemsize()) >= offset.itemsize(); + if (donate) { + offset.copy_shared_buffer(indices); + } else { + offset.set_data(allocator::malloc(offset.itemsize())); + } + + auto& encoder = cu::get_command_encoder(s); + encoder.add_temporary(offset); + encoder.set_input_array(indices); + encoder.set_output_array(offset); + + cu::KernelArgs args; + args.append(indices); + args.append(offset); + args.append_ndim(strides); + args.append(axes); + + auto kernel = mod.get_kernel(kernel_name); + encoder.add_kernel_node(kernel, 1, 1, 0, args.args()); + + return offset; +} + } // namespace mlx::core diff --git a/mlx/backend/gpu/copy.h b/mlx/backend/gpu/copy.h index 274250202..6e6bc7978 100644 --- a/mlx/backend/gpu/copy.h +++ b/mlx/backend/gpu/copy.h @@ -20,8 +20,8 @@ void copy_gpu_inplace( int64_t o_offset, CopyType ctype, const Stream& s, - const std::optional& dynamic_i_offset = std::nullopt, - const std::optional& dynamic_o_offset = std::nullopt); + std::optional dynamic_i_offset = std::nullopt, + std::optional dynamic_o_offset = std::nullopt); void copy_gpu(const array& src, array& out, CopyType ctype, const Stream& s); void copy_gpu(const array& src, array& out, CopyType ctype); diff --git a/mlx/backend/gpu/primitives.cpp b/mlx/backend/gpu/primitives.cpp index 98e88ef04..ee40799df 100644 --- a/mlx/backend/gpu/primitives.cpp +++ b/mlx/backend/gpu/primitives.cpp @@ -81,6 +81,7 @@ void Depends::eval_gpu( } void DynamicSlice::eval_gpu(const std::vector& inputs, array& out) { + MLX_PROFILER_RANGE("DynamicSlice::eval_gpu"); if (out.size() == 0) { out.set_data(nullptr); return; @@ -102,13 +103,14 @@ void DynamicSlice::eval_gpu(const std::vector& inputs, array& out) { /* int64_t o_offset = */ 0, /* CopyType ctype = */ CopyType::GeneralGeneral, /* const Stream& s = */ s, - /* const std::optional& dynamic_i_offset = */ in_offset, - /* const std::optional& dynamic_o_offset = */ std::nullopt); + /* std::optional dynamic_i_offset = */ std::move(in_offset), + /* std::optional dynamic_o_offset = */ std::nullopt); } void DynamicSliceUpdate::eval_gpu( const std::vector& inputs, array& out) { + MLX_PROFILER_RANGE("DynamicSliceUpdate::eval_gpu"); if (out.size() == 0) { out.set_data(nullptr); return; @@ -142,8 +144,8 @@ void DynamicSliceUpdate::eval_gpu( /* int64_t o_offset = */ 0, /* CopyType ctype = */ CopyType::GeneralGeneral, /* const Stream& s = */ s, - /* const std::optional& dynamic_i_offset = */ std::nullopt, - /* const std::optional& dynamic_o_offset = */ out_offset); + /* std::optional dynamic_i_offset = */ std::nullopt, + /* std::optional dynamic_o_offset = */ std::move(out_offset)); } void ExpandDims::eval_gpu(const std::vector& inputs, array& out) { diff --git a/mlx/backend/metal/copy.cpp b/mlx/backend/metal/copy.cpp index 915fc69fd..d9c568e52 100644 --- a/mlx/backend/metal/copy.cpp +++ b/mlx/backend/metal/copy.cpp @@ -20,8 +20,8 @@ void copy_gpu_inplace( int64_t out_offset, CopyType ctype, const Stream& s, - const std::optional& dynamic_i_offset /* = std::nullopt */, - const std::optional& dynamic_o_offset /* = std::nullopt */) { + std::optional dynamic_i_offset /* = std::nullopt */, + std::optional dynamic_o_offset /* = std::nullopt */) { if (out.size() == 0) { return; } diff --git a/python/tests/cuda_skip.py b/python/tests/cuda_skip.py index 2cc4a6c17..af5bace9a 100644 --- a/python/tests/cuda_skip.py +++ b/python/tests/cuda_skip.py @@ -1,7 +1,6 @@ cuda_skip = { "TestLoad.test_load_f8_e4m3", "TestLayers.test_quantized_embedding", - "TestOps.test_dynamic_slicing", # Block masked matmul NYI "TestBlas.test_block_masked_matmul", # Gather matmul NYI