QR factorization (#310)

* add qr factorization

Co-authored-by: Awni Hannun <awni@apple.com>
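In short, the commit adds a multi-output QRF primitive, a LAPACK-backed CPU implementation (sgeqrf_/sorgqr_), a linalg::qr entry point limited to float32 square matrices, and a Metal stub that throws NYI. A minimal usage sketch against the qr() signature this commit declares; the umbrella header name and the array constructor used here are assumptions of this sketch, not part of the commit:

#include <iostream>
#include "mlx/mlx.h"  // assumed umbrella header for the public API

using namespace mlx::core;

int main() {
  // float32 square input: the only case linalg::qr accepts after this commit.
  array a({2.0f, -1.0f, 4.0f, 3.0f}, {2, 2});
  // Pin to the CPU stream, since the Metal kernel is NYI.
  auto [q, r] = linalg::qr(a, Device::cpu);
  eval({q, r});  // force evaluation of both outputs
  std::cout << "Q = " << q << "\nR = " << r << std::endl;
  return 0;
}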
@@ -19,7 +19,7 @@ target_sources(
 add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backend/common)
 add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/io)
 if (MLX_BUILD_ACCELERATE)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backend/accelerate)
 else()
 target_sources(
@@ -65,6 +65,7 @@ DEFAULT(Sort)
 DEFAULT(StopGradient)
 DEFAULT(Transpose)
 DEFAULT_MULTI(DivMod)
+DEFAULT_MULTI(QRF)

 void Abs::eval_cpu(const std::vector<array>& inputs, array& out) {
   assert(inputs.size() == 1);
@@ -16,4 +16,5 @@ target_sources(
   ${CMAKE_CURRENT_SOURCE_DIR}/threefry.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/indexing.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/load.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/qrf.cpp
 )
@@ -97,6 +97,7 @@ DEFAULT(Tan)
 DEFAULT(Tanh)
 DEFAULT(Transpose)
 DEFAULT_MULTI(DivMod)
+DEFAULT_MULTI(QRF)

 namespace {
mlx/backend/common/qrf.cpp (new file, 153 lines)
@@ -0,0 +1,153 @@
// Copyright © 2023-2024 Apple Inc.

#include "mlx/allocator.h"
#include "mlx/backend/common/copy.h"
#include "mlx/primitives.h"

#ifdef ACCELERATE_NEW_LAPACK
#include <vecLib/lapack.h>
#else
#include <lapack.h>
#endif

namespace mlx::core {

template <typename T>
struct lpack;

template <>
struct lpack<float> {
  static void xgeqrf(
      const int* m,
      const int* n,
      float* a,
      const int* lda,
      float* tau,
      float* work,
      const int* lwork,
      int* info) {
    sgeqrf_(m, n, a, lda, tau, work, lwork, info);
  }
  static void xorgqr(
      const int* m,
      const int* n,
      const int* k,
      float* a,
      const int* lda,
      const float* tau,
      float* work,
      const int* lwork,
      int* info) {
    sorgqr_(m, n, k, a, lda, tau, work, lwork, info);
  }
};
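// lpack<T> maps the generic qrf_impl below onto the type-specific LAPACK
// symbols; a lpack<double> specialization forwarding to dgeqrf_/dorgqr_
// would extend this to float64 in the same way.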
template <typename T>
void qrf_impl(const array& a, array& q, array& r) {
  const int M = a.shape(-2);
  const int N = a.shape(-1);
  const int lda = std::max(M, N);
  size_t num_matrices = a.size() / (M * N);
  int num_reflectors = std::min(M, N);
  auto tau =
      allocator::malloc_or_wait(sizeof(T) * num_matrices * num_reflectors);

  // Copy A into a scratch array that the LAPACK routines can overwrite in
  // place, laid out column-contiguous
  array in(a.shape(), float32, nullptr, {});
  auto flags = in.flags();
  flags.col_contiguous = num_matrices == 1;
  flags.row_contiguous = false;
  std::vector<size_t> strides = in.strides();
  strides[in.ndim() - 2] = 1;
  strides[in.ndim() - 1] = M;
  in.set_data(
      allocator::malloc_or_wait(in.nbytes()), in.nbytes(), strides, flags);
  copy_inplace(a, in, CopyType::GeneralGeneral);
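  // LAPACK expects column-major (Fortran) storage; the {1, M} strides set
  // above lay each matrix out that way.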

  T optimal_work;
  int lwork = -1;
  int info;

  // Compute workspace size
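  // (lwork == -1 is LAPACK's workspace-query convention: the routine does
  // no factorization and instead writes the optimal buffer size into
  // optimal_work.)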
  lpack<T>::xgeqrf(
      &M, &N, nullptr, &lda, nullptr, &optimal_work, &lwork, &info);

  // Update workspace size
  lwork = optimal_work;
  auto work = allocator::malloc_or_wait(sizeof(T) * lwork);

  // Loop over matrices
  for (int i = 0; i < num_matrices; ++i) {
    // Factorize: overwrite this matrix with R and its Householder reflectors
    lpack<T>::xgeqrf(
        &M,
        &N,
        in.data<float>() + M * N * i,
        &lda,
        static_cast<T*>(tau.raw_ptr()) + num_reflectors * i,
        static_cast<T*>(work.raw_ptr()),
        &lwork,
        &info);
  }
  allocator::free(work);

  r.set_data(allocator::malloc_or_wait(r.nbytes()));
  copy_inplace(in, r, CopyType::General);
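
  // geqrf leaves R in the upper triangle and the Householder vectors below
  // the diagonal, so the stale reflector entries must be zeroed out of r: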
  for (int i = 0; i < num_matrices; ++i) {
    // Zero lower triangle
    for (int j = 0; j < r.shape(-2); ++j) {
      for (int k = 0; k < j; ++k) {
        r.data<T>()[i * N * M + j * N + k] = 0;
      }
    }
  }

  // Get work size
  lwork = -1;
  lpack<T>::xorgqr(
      &M,
      &N,
      &num_reflectors,
      nullptr,
      &lda,
      nullptr,
      &optimal_work,
      &lwork,
      &info);
  lwork = optimal_work;
  work = allocator::malloc_or_wait(sizeof(T) * lwork);

  // Loop over matrices
  for (int i = 0; i < num_matrices; ++i) {
    // Compute Q
    lpack<T>::xorgqr(
        &M,
        &N,
        &num_reflectors,
        in.data<float>() + M * N * i,
        &lda,
        static_cast<T*>(tau.raw_ptr()) + num_reflectors * i,
        static_cast<T*>(work.raw_ptr()),
        &lwork,
        &info);
  }

  q.set_data(allocator::malloc_or_wait(q.nbytes()));
  copy_inplace(in, q, CopyType::General);

  // Cleanup
  allocator::free(work);
  allocator::free(tau);
}

void QRF::eval(const std::vector<array>& inputs, std::vector<array>& outputs) {
  if (inputs[0].dtype() != float32) {
    throw std::runtime_error("[QRF::eval] only supports float32.");
  }
  qrf_impl<float>(inputs[0], outputs[0], outputs[1]);
}

} // namespace mlx::core
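The file factorizes in two LAPACK steps: sgeqrf_ overwrites the input with R in its upper triangle and the Householder reflectors below the diagonal (scaled by tau), then sorgqr_ expands those reflectors into an explicit Q. A standalone sketch of the same call sequence on one column-major matrix, with the Fortran prototypes declared by hand; this is an illustration against a system LAPACK (link with -llapack), not part of the commit:

#include <algorithm>
#include <cstdio>
#include <vector>

extern "C" {
void sgeqrf_(const int* m, const int* n, float* a, const int* lda,
             float* tau, float* work, const int* lwork, int* info);
void sorgqr_(const int* m, const int* n, const int* k, float* a,
             const int* lda, const float* tau, float* work,
             const int* lwork, int* info);
}

int main() {
  // 2x2 input [[3, 1], [4, 2]] in column-major order: {3, 4, 1, 2}.
  const int m = 2, n = 2, lda = 2, k = 2;
  std::vector<float> a = {3.f, 4.f, 1.f, 2.f};
  std::vector<float> tau(k);
  float size = 0.f;
  int lwork = -1, info = 0;

  // Workspace query, then the factorization proper.
  sgeqrf_(&m, &n, a.data(), &lda, tau.data(), &size, &lwork, &info);
  lwork = static_cast<int>(size);
  std::vector<float> work(std::max(lwork, 1));
  sgeqrf_(&m, &n, a.data(), &lda, tau.data(), work.data(), &lwork, &info);
  // a's upper triangle now holds R; the reflectors sit below the diagonal.
  std::printf("R = [[%g, %g], [0, %g]]\n", a[0], a[2], a[3]);

  // Expand the reflectors into Q in place (this overwrites R, which is why
  // qrf_impl above copies R out before calling xorgqr).
  lwork = -1;
  sorgqr_(&m, &n, &k, a.data(), &lda, tau.data(), &size, &lwork, &info);
  lwork = static_cast<int>(size);
  work.assign(std::max(lwork, 1), 0.f);
  sorgqr_(&m, &n, &k, a.data(), &lda, tau.data(), work.data(), &lwork, &info);
  std::printf("Q = [[%g, %g], [%g, %g]]\n", a[0], a[2], a[1], a[3]);
  return 0;
}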

@@ -769,4 +769,10 @@ void Transpose::eval_gpu(const std::vector<array>& inputs, array& out) {
   eval(inputs, out);
 }

+void QRF::eval_gpu(
+    const std::vector<array>& inputs,
+    std::vector<array>& outputs) {
+  throw std::runtime_error("[QRF::eval_gpu] Metal QR factorization NYI.");
+}
+
 } // namespace mlx::core
@@ -90,5 +90,5 @@ NO_GPU(Tan)
 NO_GPU(Tanh)
 NO_GPU(Transpose)
 NO_GPU_MULTI(DivMod)
-
+NO_GPU_MULTI(QRF)
 } // namespace mlx::core
@@ -4,8 +4,9 @@
 #include <ostream>
 #include <vector>

 #include "mlx/dtype.h"
 #include "mlx/linalg.h"
+#include "mlx/primitives.h"
 #include "mlx/utils.h"

 namespace mlx::core::linalg {
@@ -172,4 +173,31 @@ array norm(
   return matrix_norm(a, ord, ax, keepdims, s);
 }

+std::pair<array, array> qr(const array& a, StreamOrDevice s /* = {} */) {
+  if (a.dtype() != float32) {
+    std::ostringstream msg;
+    msg << "[linalg::qr] Arrays must have type float32. Received array "
+        << "with type " << a.dtype() << ".";
+    throw std::invalid_argument(msg.str());
+  }
+  if (a.ndim() < 2) {
+    std::ostringstream msg;
+    msg << "[linalg::qr] Arrays must have >= 2 dimensions. Received array "
+           "with "
+        << a.ndim() << " dimensions.";
+    throw std::invalid_argument(msg.str());
+  }
+  if (a.shape(-1) != a.shape(-2)) {
+    throw std::invalid_argument(
+        "[linalg::qr] Support for non-square matrices NYI.");
+  }
+
+  auto out = array::make_arrays(
+      {a.shape(), a.shape()},
+      {a.dtype(), a.dtype()},
+      std::make_unique<QRF>(to_stream(s)),
+      {astype(a, a.dtype(), s)});
+  return std::make_pair(out[0], out[1]);
+}
+
 } // namespace mlx::core::linalg
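Because array::make_arrays ties Q and R to the same QRF node, evaluating either output runs the factorization once and materializes both. Extending main() from the sketch near the top, a quick property check (matmul, transpose, and allclose are pre-existing mlx ops; their availability here is this note's assumption, not something the commit adds):

  // Q is orthonormal and the product reconstructs the input:
  auto recon = matmul(q, r);                    // ≈ a
  auto qt_q = matmul(transpose(q), q);          // ≈ identity
  std::cout << allclose(recon, a) << std::endl; // expect array(true)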
@@ -60,4 +60,6 @@ norm(const array& a, int axis, bool keepdims = false, StreamOrDevice s = {}) {
   return norm(a, std::vector<int>{axis}, keepdims, s);
 }

+std::pair<array, array> qr(const array& a, StreamOrDevice s = {});
+
 } // namespace mlx::core::linalg
@@ -252,7 +252,7 @@ array tri(int n, int m, int k, Dtype type, StreamOrDevice s /* = {} */) {
   return astype(greater_equal(l, r, s), type, s);
 }

-array tril(array x, int k, StreamOrDevice s /* = {} */) {
+array tril(array x, int k /* = 0 */, StreamOrDevice s /* = {} */) {
   if (x.ndim() < 2) {
     throw std::invalid_argument("[tril] array must be at least 2-D");
   }
@@ -260,7 +260,7 @@ array tril(array x, int k, StreamOrDevice s /* = {} */) {
   return where(mask, x, zeros_like(x, s), s);
 }

-array triu(array x, int k, StreamOrDevice s /* = {} */) {
+array triu(array x, int k /* = 0 */, StreamOrDevice s /* = {} */) {
   if (x.ndim() < 2) {
     throw std::invalid_argument("[triu] array must be at least 2-D");
   }
@@ -123,8 +123,8 @@ inline array tri(int n, Dtype type, StreamOrDevice s = {}) {
   return tri(n, n, 0, type, s);
 }

-array tril(array x, int k, StreamOrDevice s = {});
-array triu(array x, int k, StreamOrDevice s = {});
+array tril(array x, int k = 0, StreamOrDevice s = {});
+array triu(array x, int k = 0, StreamOrDevice s = {});

 /** array manipulation */
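A small companion cleanup rides along here: k now defaults to 0 in the tril/triu declarations, with the definitions documenting the default in comments, so callers can take the main-diagonal triangle with a bare tril(x) or triu(x).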
@@ -1602,4 +1602,20 @@ class Transpose : public UnaryPrimitive {
   void eval(const std::vector<array>& inputs, array& out);
 };

+/* QR Factorization primitive. */
+class QRF : public Primitive {
+ public:
+  explicit QRF(Stream stream) : Primitive(stream) {}
+
+  void eval_cpu(const std::vector<array>& inputs, std::vector<array>& outputs)
+      override;
+  void eval_gpu(const std::vector<array>& inputs, std::vector<array>& outputs)
+      override;
+
+  DEFINE_PRINT(QRF)
+
+ private:
+  void eval(const std::vector<array>& inputs, std::vector<array>& outputs);
+};
+
 } // namespace mlx::core
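QRF derives from Primitive rather than UnaryPrimitive because the factorization produces two outputs, Q and R; that is also why it registers through the DEFAULT_MULTI and NO_GPU_MULTI macros alongside DivMod, the existing multi-output primitive, instead of the single-output DEFAULT/NO_GPU variants.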