// Mirror of https://github.com/ml-explore/mlx.git (synced 2025-06-30 05:31:15 +08:00) - 43 lines, 1.2 KiB, C++
// Copyright © 2024 Apple Inc.
|
|
|
|
#include <numeric>
|
|
|
|
#include "mlx/backend/gpu/copy.h"
|
|
#include "mlx/backend/gpu/slicing.h"
|
|
#include "mlx/backend/metal/device.h"
|
|
|
|
namespace mlx::core {
|
|
|
|
void concatenate_gpu(
|
|
const std::vector<array>& inputs,
|
|
array& out,
|
|
int axis,
|
|
const Stream& s) {
|
|
std::vector<int> sizes;
|
|
sizes.push_back(0);
|
|
for (auto& p : inputs) {
|
|
sizes.push_back(p.shape(axis));
|
|
}
|
|
std::partial_sum(sizes.cbegin(), sizes.cend(), sizes.begin());
|
|
|
|
out.set_data(allocator::malloc(out.nbytes()));
|
|
|
|
auto strides = out.strides();
|
|
auto flags = out.flags();
|
|
flags.row_contiguous = false;
|
|
flags.col_contiguous = false;
|
|
flags.contiguous = false;
|
|
auto& d = metal::device(s.device);
|
|
auto& compute_encoder = d.get_command_encoder(s.index);
|
|
auto concurrent_ctx = compute_encoder.start_concurrent();
|
|
for (int i = 0; i < inputs.size(); i++) {
|
|
array out_slice(inputs[i].shape(), out.dtype(), nullptr, {});
|
|
size_t data_offset = strides[axis] * sizes[i];
|
|
out_slice.copy_shared_buffer(
|
|
out, strides, flags, out_slice.size(), data_offset);
|
|
copy_gpu_inplace(inputs[i], out_slice, CopyType::GeneralGeneral, s);
|
|
}
|
|
}
|
|
|
|
} // namespace mlx::core
|