From e6a7ab967530866eb89c013f833f7c525bec10ca Mon Sep 17 00:00:00 2001 From: Awni Hannun Date: Tue, 21 Jan 2025 14:07:47 -0800 Subject: [PATCH] non square qr (#1783) --- mlx/backend/common/qrf.cpp | 25 ++++++++++++++++++------- mlx/linalg.cpp | 12 ++++++------ python/tests/test_linalg.py | 15 +++++++++++++-- 3 files changed, 37 insertions(+), 15 deletions(-) diff --git a/mlx/backend/common/qrf.cpp b/mlx/backend/common/qrf.cpp index c1f123aaf..21e9c71f1 100644 --- a/mlx/backend/common/qrf.cpp +++ b/mlx/backend/common/qrf.cpp @@ -41,7 +41,7 @@ template void qrf_impl(const array& a, array& q, array& r) { const int M = a.shape(-2); const int N = a.shape(-1); - const int lda = std::max(M, N); + const int lda = M; size_t num_matrices = a.size() / (M * N); int num_reflectors = std::min(M, N); auto tau = @@ -89,13 +89,16 @@ void qrf_impl(const array& a, array& q, array& r) { allocator::free(work); r.set_data(allocator::malloc_or_wait(r.nbytes())); - copy_inplace(in, r, CopyType::General); for (int i = 0; i < num_matrices; ++i) { - // Zero lower triangle + /// num_reflectors x N for (int j = 0; j < r.shape(-2); ++j) { for (int k = 0; k < j; ++k) { - r.data()[i * N * M + j * N + k] = 0; + r.data()[i * N * num_reflectors + j * N + k] = 0; + } + for (int k = j; k < r.shape(-1); ++k) { + r.data()[i * N * num_reflectors + j * N + k] = + in.data()[i * N * M + j + k * M]; } } } @@ -104,7 +107,7 @@ void qrf_impl(const array& a, array& q, array& r) { lwork = -1; lpack::xorgqr( &M, - &N, + &num_reflectors, &num_reflectors, nullptr, &lda, @@ -120,7 +123,7 @@ void qrf_impl(const array& a, array& q, array& r) { // Compute Q lpack::xorgqr( &M, - &N, + &num_reflectors, &num_reflectors, in.data() + M * N * i, &lda, @@ -131,7 +134,15 @@ void qrf_impl(const array& a, array& q, array& r) { } q.set_data(allocator::malloc_or_wait(q.nbytes())); - copy_inplace(in, q, CopyType::General); + for (int i = 0; i < num_matrices; ++i) { + // M x num_reflectors + for (int j = 0; j < q.shape(-2); ++j) { + for (int k = 0; k < q.shape(-1); ++k) { + q.data()[i * M * num_reflectors + j * num_reflectors + k] = + in.data()[i * N * M + j + k * M]; + } + } + } // Cleanup allocator::free(work); diff --git a/mlx/linalg.cpp b/mlx/linalg.cpp index 5a69f1eae..989dae52f 100644 --- a/mlx/linalg.cpp +++ b/mlx/linalg.cpp @@ -187,13 +187,13 @@ std::pair qr(const array& a, StreamOrDevice s /* = {} */) { << a.ndim() << " dimensions."; throw std::invalid_argument(msg.str()); } - if (a.shape(-1) != a.shape(-2)) { - throw std::invalid_argument( - "[linalg::qr] Support for non-square matrices NYI."); - } - + int k = std::min(a.shape(-2), a.shape(-1)); + auto q_shape = a.shape(); + q_shape.back() = k; + auto r_shape = a.shape(); + r_shape[r_shape.size() - 2] = k; auto out = array::make_arrays( - {a.shape(), a.shape()}, + {std::move(q_shape), std::move(r_shape)}, {a.dtype(), a.dtype()}, std::make_shared(to_stream(s)), {astype(a, a.dtype(), s)}); diff --git a/python/tests/test_linalg.py b/python/tests/test_linalg.py index 695d7704f..c4ac69178 100644 --- a/python/tests/test_linalg.py +++ b/python/tests/test_linalg.py @@ -103,7 +103,7 @@ class TestLinalg(mlx_tests.MLXTestCase): Q, R = mx.linalg.qr(A, stream=mx.cpu) out = Q @ R self.assertTrue(mx.allclose(out, A)) - out = Q @ Q + out = Q.T @ Q self.assertTrue(mx.allclose(out, mx.eye(2), rtol=1e-5, atol=1e-7)) self.assertTrue(mx.allclose(mx.tril(R, -1), mx.zeros_like(R))) self.assertEqual(Q.dtype, mx.float32) @@ -116,10 +116,21 @@ class TestLinalg(mlx_tests.MLXTestCase): for a, q, r in zip(A, Q, R): out = q @ r self.assertTrue(mx.allclose(out, a)) - out = q @ q + out = q.T @ q self.assertTrue(mx.allclose(out, mx.eye(2), rtol=1e-5, atol=1e-7)) self.assertTrue(mx.allclose(mx.tril(r, -1), mx.zeros_like(r))) + # Non square matrices + for shape in [(4, 8), (8, 4)]: + A = mx.random.uniform(shape=shape) + Q, R = mx.linalg.qr(A, stream=mx.cpu) + out = Q @ R + self.assertTrue(mx.allclose(out, A, rtol=1e-4, atol=1e-6)) + out = Q.T @ Q + self.assertTrue( + mx.allclose(out, mx.eye(min(A.shape)), rtol=1e-4, atol=1e-6) + ) + def test_svd_decomposition(self): A = mx.array([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]], dtype=mx.float32) U, S, Vt = mx.linalg.svd(A, stream=mx.cpu)