From e6a7ab967530866eb89c013f833f7c525bec10ca Mon Sep 17 00:00:00 2001
From: Awni Hannun <awni@apple.com>
Date: Tue, 21 Jan 2025 14:07:47 -0800
Subject: [PATCH] non square qr (#1783)

---
 mlx/backend/common/qrf.cpp  | 25 ++++++++++++++++++-------
 mlx/linalg.cpp              | 12 ++++++------
 python/tests/test_linalg.py | 15 +++++++++++++--
 3 files changed, 37 insertions(+), 15 deletions(-)
diff --git a/mlx/backend/common/qrf.cpp b/mlx/backend/common/qrf.cpp
index c1f123aaf..21e9c71f1 100644
--- a/mlx/backend/common/qrf.cpp
+++ b/mlx/backend/common/qrf.cpp
@@ -41,7 +41,7 @@ template <typename T>
 void qrf_impl(const array& a, array& q, array& r) {
   const int M = a.shape(-2);
   const int N = a.shape(-1);
-  const int lda = std::max(M, N);
+  const int lda = M;
   size_t num_matrices = a.size() / (M * N);
   int num_reflectors = std::min(M, N);
   auto tau =
@@ -89,13 +89,16 @@ void qrf_impl(const array& a, array& q, array& r) {
   allocator::free(work);
 
   r.set_data(allocator::malloc_or_wait(r.nbytes()));
-  copy_inplace(in, r, CopyType::General);
 
   for (int i = 0; i < num_matrices; ++i) {
-    // Zero lower triangle
+    /// num_reflectors x N
     for (int j = 0; j < r.shape(-2); ++j) {
       for (int k = 0; k < j; ++k) {
-        r.data<T>()[i * N * M + j * N + k] = 0;
+        r.data<T>()[i * N * num_reflectors + j * N + k] = 0;
+      }
+      for (int k = j; k < r.shape(-1); ++k) {
+        r.data<T>()[i * N * num_reflectors + j * N + k] =
+            in.data<T>()[i * N * M + j + k * M];
       }
     }
   }
@@ -104,7 +107,7 @@ void qrf_impl(const array& a, array& q, array& r) {
   lwork = -1;
   lpack<T>::xorgqr(
       &M,
-      &N,
+      &num_reflectors,
       &num_reflectors,
       nullptr,
       &lda,
@@ -120,7 +123,7 @@ void qrf_impl(const array& a, array& q, array& r) {
     // Compute Q
     lpack<T>::xorgqr(
         &M,
-        &N,
+        &num_reflectors,
         &num_reflectors,
         in.data<float>() + M * N * i,
         &lda,
@@ -131,7 +134,15 @@ void qrf_impl(const array& a, array& q, array& r) {
   }
 
   q.set_data(allocator::malloc_or_wait(q.nbytes()));
-  copy_inplace(in, q, CopyType::General);
+  for (int i = 0; i < num_matrices; ++i) {
+    // M x num_reflectors
+    for (int j = 0; j < q.shape(-2); ++j) {
+      for (int k = 0; k < q.shape(-1); ++k) {
+        q.data<T>()[i * M * num_reflectors + j * num_reflectors + k] =
+            in.data<T>()[i * N * M + j + k * M];
+      }
+    }
+  }
 
   // Cleanup
   allocator::free(work);
diff --git a/mlx/linalg.cpp b/mlx/linalg.cpp
index 5a69f1eae..989dae52f 100644
--- a/mlx/linalg.cpp
+++ b/mlx/linalg.cpp
@@ -187,13 +187,13 @@ std::pair<array, array> qr(const array& a, StreamOrDevice s /* = {} */) {
         << a.ndim() << " dimensions.";
     throw std::invalid_argument(msg.str());
   }
-  if (a.shape(-1) != a.shape(-2)) {
-    throw std::invalid_argument(
-        "[linalg::qr] Support for non-square matrices NYI.");
-  }
-
+  int k = std::min(a.shape(-2), a.shape(-1));
+  auto q_shape = a.shape();
+  q_shape.back() = k;
+  auto r_shape = a.shape();
+  r_shape[r_shape.size() - 2] = k;
   auto out = array::make_arrays(
-      {a.shape(), a.shape()},
+      {std::move(q_shape), std::move(r_shape)},
       {a.dtype(), a.dtype()},
       std::make_shared<QRF>(to_stream(s)),
       {astype(a, a.dtype(), s)});
diff --git a/python/tests/test_linalg.py b/python/tests/test_linalg.py
index 695d7704f..c4ac69178 100644
--- a/python/tests/test_linalg.py
+++ b/python/tests/test_linalg.py
@@ -103,7 +103,7 @@ class TestLinalg(mlx_tests.MLXTestCase):
         Q, R = mx.linalg.qr(A, stream=mx.cpu)
         out = Q @ R
         self.assertTrue(mx.allclose(out, A))
-        out = Q @ Q
+        out = Q.T @ Q
         self.assertTrue(mx.allclose(out, mx.eye(2), rtol=1e-5, atol=1e-7))
         self.assertTrue(mx.allclose(mx.tril(R, -1), mx.zeros_like(R)))
         self.assertEqual(Q.dtype, mx.float32)
@@ -116,10 +116,21 @@ class TestLinalg(mlx_tests.MLXTestCase):
         for a, q, r in zip(A, Q, R):
             out = q @ r
             self.assertTrue(mx.allclose(out, a))
-            out = q @ q
+            out = q.T @ q
             self.assertTrue(mx.allclose(out, mx.eye(2), rtol=1e-5, atol=1e-7))
             self.assertTrue(mx.allclose(mx.tril(r, -1), mx.zeros_like(r)))
 
+        # Non square matrices
+        for shape in [(4, 8), (8, 4)]:
+            A = mx.random.uniform(shape=shape)
+            Q, R = mx.linalg.qr(A, stream=mx.cpu)
+            out = Q @ R
+            self.assertTrue(mx.allclose(out, A, rtol=1e-4, atol=1e-6))
+            out = Q.T @ Q
+            self.assertTrue(
+                mx.allclose(out, mx.eye(min(A.shape)), rtol=1e-4, atol=1e-6)
+            )
+
     def test_svd_decomposition(self):
         A = mx.array([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]], dtype=mx.float32)
         U, S, Vt = mx.linalg.svd(A, stream=mx.cpu)