// Copyright © 2025 Apple Inc. #include "mlx/backend/common/utils.h" #include "mlx/backend/cuda/copy/copy.cuh" namespace mlx::core { void copy_gpu_inplace( const array& in, array& out, const Shape& shape, const Strides& strides_in, const Strides& strides_out, int64_t offset_in, int64_t offset_out, CopyType ctype, const Stream& s, std::optional dynamic_offset_in, std::optional dynamic_offset_out) { if (out.size() == 0) { return; } auto& encoder = cu::get_command_encoder(s); encoder.set_input_array(in); encoder.set_output_array(out); if (ctype == CopyType::Scalar || ctype == CopyType::Vector) { copy_contiguous(encoder, ctype, in, out, offset_in, offset_out); return; } if (ctype == CopyType::General || ctype == CopyType::GeneralGeneral) { auto [shape_collapsed, strides_vec] = collapse_contiguous_dims( shape, std::vector{strides_in, strides_out}, INT32_MAX); if (ctype == CopyType::General) { copy_general_input( encoder, ctype, in, out, offset_in, offset_out, shape_collapsed, strides_vec[0]); } else { if (dynamic_offset_in || dynamic_offset_out) { if (!dynamic_offset_in) { dynamic_offset_in = array(0, int64); encoder.add_temporary(*dynamic_offset_in); } if (!dynamic_offset_out) { dynamic_offset_out = array(0, int64); encoder.add_temporary(*dynamic_offset_out); } encoder.set_input_array(*dynamic_offset_in); encoder.set_input_array(*dynamic_offset_out); copy_general_dynamic( encoder, ctype, in, out, offset_in, offset_out, shape_collapsed, strides_vec[0], strides_vec[1], *dynamic_offset_in, *dynamic_offset_out); } else { copy_general( encoder, ctype, in, out, offset_in, offset_out, shape_collapsed, strides_vec[0], strides_vec[1]); } } return; } } void fill_gpu(const array& in, array& out, const Stream& s) { if (out.size() == 0) { return; } out.set_data(allocator::malloc(out.nbytes())); auto& encoder = cu::get_command_encoder(s); encoder.set_input_array(in); encoder.set_output_array(out); copy_contiguous(encoder, CopyType::Scalar, in, out, 0, 0); } } // namespace mlx::core