[CUDA] Use ConcurrentContext in concatenate_gpu (#2549)

Cheng
2025-08-28 09:30:08 +09:00
committed by GitHub
parent 584d48458e
commit 31c6f6e33f


@@ -30,8 +30,7 @@ void concatenate_gpu(
   flags.row_contiguous = false;
   flags.col_contiguous = false;
   flags.contiguous = false;
-  // TODO: Handle concurrent outputs:
-  // https://github.com/ml-explore/mlx/pull/2145#discussion_r2070753816
+  auto concurrent = cu::get_command_encoder(s).concurrent_context();
   for (int i = 0; i < inputs.size(); i++) {
     array out_slice(inputs[i].shape(), out.dtype(), nullptr, {});
     size_t data_offset = strides[axis] * sizes[i];
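For context (not part of the diff): the change replaces the TODO with an RAII-style concurrent context, so the per-input copies issued inside the loop are no longer ordered against one another. Below is a minimal, self-contained C++ sketch of that pattern; CommandEncoder, ConcurrentContext, and launch_copy here are illustrative stand-ins and not MLX's actual cu::CommandEncoder API.

#include <cstdio>

// Hypothetical encoder: work launched while a ConcurrentContext is alive is
// recorded without dependencies between launches, so it may run concurrently;
// the context's destructor restores ordering for subsequent work.
class CommandEncoder {
 public:
  class ConcurrentContext {
   public:
    explicit ConcurrentContext(CommandEncoder& enc) : enc_(enc) {
      enc_.concurrent_ = true;
    }
    ~ConcurrentContext() {
      enc_.concurrent_ = false;
      std::printf("barrier: wait for all concurrent launches\n");
    }
   private:
    CommandEncoder& enc_;
  };

  ConcurrentContext concurrent_context() { return ConcurrentContext(*this); }

  // Stand-in for enqueuing one per-input copy of concatenate_gpu's loop.
  void launch_copy(int slice) {
    std::printf("launch copy for slice %d (%s)\n", slice,
                concurrent_ ? "concurrent" : "serialized");
  }

 private:
  bool concurrent_ = false;
};

int main() {
  CommandEncoder enc;
  {
    // Mirrors the new line in concatenate_gpu: hold the context for the
    // duration of the per-input copy loop.
    auto concurrent = enc.concurrent_context();
    for (int i = 0; i < 3; i++) {
      enc.launch_copy(i);
    }
  }  // context destroyed -> copies synchronized before later work

  return 0;
}

The key point is that the context's lifetime brackets the loop: launches recorded while it is alive may overlap, and its destruction re-establishes ordering before anything that follows.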