mirror of
https://github.com/ml-explore/mlx.git
synced 2025-12-16 01:49:05 +08:00
redesign for faster cpu/gpu synch (#1869)
* redesign for faster cpu/gpu synch
* load + more async CPU
* use command encoder API and move more ops to use it
* make fence back-end generic + CPU only fence
* faster build
* fix async eval
* fixes + handle temporaries
* fix / improve cpu conv
* remove unused status, fix siblings
* fix extensions
* fix
* fix no cpu build
* format
* comments
* fix perf regression, remove unnecessary abort
* fix events, task limit cpu
* fix waiting
* fix donation / temporaries in normalization
This commit is contained in:
@@ -3,18 +3,76 @@
|
||||
#include <cstring>
|
||||
#include "mlx/array.h"
|
||||
#include "mlx/backend/cpu/copy.h"
|
||||
#include "mlx/backend/cpu/encoder.h"
|
||||
#include "mlx/backend/cpu/gemm.h"
|
||||
#include "mlx/primitives.h"
|
||||
|
||||
namespace mlx::core {
|
||||
|
||||
template <typename T>
|
||||
void matmul_dispatch(
|
||||
const array& a,
|
||||
const array& b,
|
||||
array& out,
|
||||
bool a_transposed,
|
||||
bool b_transposed,
|
||||
size_t lda,
|
||||
size_t ldb,
|
||||
float alpha,
|
||||
float beta,
|
||||
Stream stream) {
|
||||
const T* a_ptr = a.data<T>();
|
||||
const T* b_ptr = b.data<T>();
|
||||
T* out_ptr = out.data<T>();
|
||||
size_t ldc = out.shape(-1);
|
||||
size_t batch_size = a.size() / (a.shape(-2) * a.shape(-1));
|
||||
auto& encoder = cpu::get_command_encoder(stream);
|
||||
encoder.set_input_array(a);
|
||||
encoder.set_input_array(b);
|
||||
encoder.set_output_array(out);
|
||||
encoder.dispatch([a_ptr,
|
||||
b_ptr,
|
||||
out_ptr,
|
||||
a_transposed,
|
||||
b_transposed,
|
||||
lda,
|
||||
ldb,
|
||||
ldc,
|
||||
alpha,
|
||||
beta,
|
||||
batch_size,
|
||||
a_shape = a.shape(),
|
||||
a_strides = a.strides(),
|
||||
b_shape = b.shape(),
|
||||
b_strides = b.strides()]() {
|
||||
matmul<T>(
|
||||
a_ptr,
|
||||
b_ptr,
|
||||
out_ptr,
|
||||
a_transposed,
|
||||
b_transposed,
|
||||
lda,
|
||||
ldb,
|
||||
ldc,
|
||||
alpha,
|
||||
beta,
|
||||
batch_size,
|
||||
a_shape,
|
||||
a_strides,
|
||||
b_shape,
|
||||
b_strides);
|
||||
});
|
||||
}
|
||||
|
||||
void matmul_general(
|
||||
const array& a_pre,
|
||||
const array& b_pre,
|
||||
array& out,
|
||||
Stream stream,
|
||||
float alpha = 1.0f,
|
||||
float beta = 0.0f) {
|
||||
auto check_transpose = [](const array& arr) {
|
||||
std::vector<array> temps;
|
||||
auto check_transpose = [stream, &temps](const array& arr) {
|
||||
auto stx = arr.strides()[arr.ndim() - 2];
|
||||
auto sty = arr.strides()[arr.ndim() - 1];
|
||||
if (stx == arr.shape(-1) && sty == 1) {
|
||||
@@ -22,10 +80,10 @@ void matmul_general(
|
||||
} else if (stx == 1 && sty == arr.shape(-2)) {
|
||||
return std::make_tuple(true, sty, arr);
|
||||
} else {
|
||||
array arr_copy(arr.shape(), arr.dtype(), nullptr, {});
|
||||
copy(arr, arr_copy, CopyType::General);
|
||||
temps.push_back(array(arr.shape(), arr.dtype(), nullptr, {}));
|
||||
copy(arr, temps.back(), CopyType::General, stream);
|
||||
stx = arr.shape(-1);
|
||||
return std::make_tuple(false, stx, arr_copy);
|
||||
return std::make_tuple(false, stx, temps.back());
|
||||
}
|
||||
};
|
||||
|
||||
@@ -39,28 +97,34 @@ void matmul_general(
|
||||
}
|
||||
|
||||
if (out.dtype() == float32) {
|
||||
matmul<float>(a, b, out, a_transposed, b_transposed, lda, ldb, alpha, beta);
|
||||
matmul_dispatch<float>(
|
||||
a, b, out, a_transposed, b_transposed, lda, ldb, alpha, beta, stream);
|
||||
} else if (out.dtype() == float16) {
|
||||
matmul<float16_t>(
|
||||
a, b, out, a_transposed, b_transposed, lda, ldb, alpha, beta);
|
||||
matmul_dispatch<float16_t>(
|
||||
a, b, out, a_transposed, b_transposed, lda, ldb, alpha, beta, stream);
|
||||
} else if (out.dtype() == bfloat16) {
|
||||
matmul<bfloat16_t>(
|
||||
a, b, out, a_transposed, b_transposed, lda, ldb, alpha, beta);
|
||||
matmul_dispatch<bfloat16_t>(
|
||||
a, b, out, a_transposed, b_transposed, lda, ldb, alpha, beta, stream);
|
||||
} else if (out.dtype() == float64) {
|
||||
matmul<double>(
|
||||
a, b, out, a_transposed, b_transposed, lda, ldb, alpha, beta);
|
||||
matmul_dispatch<double>(
|
||||
a, b, out, a_transposed, b_transposed, lda, ldb, alpha, beta, stream);
|
||||
} else {
|
||||
throw std::runtime_error("[Matmul::eval_cpu] Invalid type.");
|
||||
}
|
||||
cpu::get_command_encoder(stream).add_temporaries(std::move(temps));
|
||||
}
|
||||
|
||||
void Matmul::eval_cpu(const std::vector<array>& inputs, array& out) {
|
||||
out.set_data(allocator::malloc_or_wait(out.nbytes()));
|
||||
if (inputs[0].shape(-1) == 0) {
|
||||
std::memset(out.data<void>(), 0, out.nbytes());
|
||||
auto& encoder = cpu::get_command_encoder(stream());
|
||||
encoder.set_output_array(out);
|
||||
encoder.dispatch([out_ptr = out.data<void>(), nbytes = out.nbytes()]() {
|
||||
std::memset(out_ptr, 0, nbytes);
|
||||
});
|
||||
return;
|
||||
}
|
||||
return matmul_general(inputs[0], inputs[1], out);
|
||||
matmul_general(inputs[0], inputs[1], out, stream());
|
||||
}
|
||||
|
||||
void AddMM::eval_cpu(const std::vector<array>& inputs, array& out) {
|
||||
@@ -74,9 +138,9 @@ void AddMM::eval_cpu(const std::vector<array>& inputs, array& out) {
|
||||
CopyType ctype = c.data_size() == 1
|
||||
? CopyType::Scalar
|
||||
: (c.flags().row_contiguous ? CopyType::Vector : CopyType::General);
|
||||
copy(c, out, ctype);
|
||||
copy(c, out, ctype, stream());
|
||||
|
||||
return matmul_general(inputs[0], inputs[1], out, alpha_, beta_);
|
||||
matmul_general(inputs[0], inputs[1], out, stream(), alpha_, beta_);
|
||||
}
|
||||
|
||||
} // namespace mlx::core
|
||||
|
||||
Reference in New Issue
Block a user