mirror of
https://github.com/ml-explore/mlx.git
synced 2025-06-24 17:31:16 +08:00
Add matrix inversion primitive (#822)
This commit is contained in:
parent
19ec023256
commit
eaba55c9bf
@ -74,6 +74,7 @@ DEFAULT(Sort)
|
|||||||
DEFAULT(StopGradient)
|
DEFAULT(StopGradient)
|
||||||
DEFAULT_MULTI(SVD)
|
DEFAULT_MULTI(SVD)
|
||||||
DEFAULT(Transpose)
|
DEFAULT(Transpose)
|
||||||
|
DEFAULT(Inverse)
|
||||||
|
|
||||||
void Abs::eval_cpu(const std::vector<array>& inputs, array& out) {
|
void Abs::eval_cpu(const std::vector<array>& inputs, array& out) {
|
||||||
assert(inputs.size() == 1);
|
assert(inputs.size() == 1);
|
||||||
|
@ -54,6 +54,7 @@ target_sources(
|
|||||||
${CMAKE_CURRENT_SOURCE_DIR}/load.cpp
|
${CMAKE_CURRENT_SOURCE_DIR}/load.cpp
|
||||||
${CMAKE_CURRENT_SOURCE_DIR}/qrf.cpp
|
${CMAKE_CURRENT_SOURCE_DIR}/qrf.cpp
|
||||||
${CMAKE_CURRENT_SOURCE_DIR}/svd.cpp
|
${CMAKE_CURRENT_SOURCE_DIR}/svd.cpp
|
||||||
|
${CMAKE_CURRENT_SOURCE_DIR}/inverse.cpp
|
||||||
${CMAKE_CURRENT_BINARY_DIR}/compiled_preamble.cpp
|
${CMAKE_CURRENT_BINARY_DIR}/compiled_preamble.cpp
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -105,6 +105,7 @@ DEFAULT_MULTI(SVD)
|
|||||||
DEFAULT(Tan)
|
DEFAULT(Tan)
|
||||||
DEFAULT(Tanh)
|
DEFAULT(Tanh)
|
||||||
DEFAULT(Transpose)
|
DEFAULT(Transpose)
|
||||||
|
DEFAULT(Inverse)
|
||||||
|
|
||||||
namespace {
|
namespace {
|
||||||
|
|
||||||
|
95
mlx/backend/common/inverse.cpp
Normal file
95
mlx/backend/common/inverse.cpp
Normal file
@ -0,0 +1,95 @@
|
|||||||
|
// Copyright © 2023-2024 Apple Inc.
|
||||||
|
|
||||||
|
#include "mlx/allocator.h"
|
||||||
|
#include "mlx/backend/common/copy.h"
|
||||||
|
#include "mlx/primitives.h"
|
||||||
|
|
||||||
|
#ifdef ACCELERATE_NEW_LAPACK
|
||||||
|
#include <Accelerate/Accelerate.h>
|
||||||
|
#else
|
||||||
|
#include <lapack.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
namespace mlx::core {
|
||||||
|
|
||||||
|
void inverse_impl(const array& a, array& inv) {
|
||||||
|
// Lapack uses the column-major convention. We take advantage of the following
|
||||||
|
// identity to avoid transposing (see
|
||||||
|
// https://math.stackexchange.com/a/340234):
|
||||||
|
// (A⁻¹)ᵀ = (Aᵀ)⁻¹
|
||||||
|
|
||||||
|
// The inverse is computed in place, so just copy the input to the output.
|
||||||
|
copy(a, inv, a.flags().row_contiguous ? CopyType::Vector : CopyType::General);
|
||||||
|
|
||||||
|
const int N = a.shape(-1);
|
||||||
|
const size_t num_matrices = a.size() / (N * N);
|
||||||
|
|
||||||
|
int info;
|
||||||
|
auto ipiv = array::Data{allocator::malloc_or_wait(sizeof(int) * N)};
|
||||||
|
|
||||||
|
for (int i = 0; i < num_matrices; i++) {
|
||||||
|
// Compute LU factorization.
|
||||||
|
sgetrf_(
|
||||||
|
/* m = */ &N,
|
||||||
|
/* n = */ &N,
|
||||||
|
/* a = */ inv.data<float>() + N * N * i,
|
||||||
|
/* lda = */ &N,
|
||||||
|
/* ipiv = */ static_cast<int*>(ipiv.buffer.raw_ptr()),
|
||||||
|
/* info = */ &info);
|
||||||
|
|
||||||
|
if (info != 0) {
|
||||||
|
std::stringstream ss;
|
||||||
|
ss << "inverse_impl: LU factorization failed with error code " << info;
|
||||||
|
throw std::runtime_error(ss.str());
|
||||||
|
}
|
||||||
|
|
||||||
|
static const int lwork_query = -1;
|
||||||
|
float workspace_size = 0;
|
||||||
|
|
||||||
|
// Compute workspace size.
|
||||||
|
sgetri_(
|
||||||
|
/* m = */ &N,
|
||||||
|
/* a = */ nullptr,
|
||||||
|
/* lda = */ &N,
|
||||||
|
/* ipiv = */ nullptr,
|
||||||
|
/* work = */ &workspace_size,
|
||||||
|
/* lwork = */ &lwork_query,
|
||||||
|
/* info = */ &info);
|
||||||
|
|
||||||
|
if (info != 0) {
|
||||||
|
std::stringstream ss;
|
||||||
|
ss << "inverse_impl: LU workspace calculation failed with error code "
|
||||||
|
<< info;
|
||||||
|
throw std::runtime_error(ss.str());
|
||||||
|
}
|
||||||
|
|
||||||
|
const int lwork = workspace_size;
|
||||||
|
auto scratch =
|
||||||
|
array::Data{allocator::malloc_or_wait(sizeof(float) * lwork)};
|
||||||
|
|
||||||
|
// Compute inverse.
|
||||||
|
sgetri_(
|
||||||
|
/* m = */ &N,
|
||||||
|
/* a = */ inv.data<float>() + N * N * i,
|
||||||
|
/* lda = */ &N,
|
||||||
|
/* ipiv = */ static_cast<int*>(ipiv.buffer.raw_ptr()),
|
||||||
|
/* work = */ static_cast<float*>(scratch.buffer.raw_ptr()),
|
||||||
|
/* lwork = */ &lwork,
|
||||||
|
/* info = */ &info);
|
||||||
|
|
||||||
|
if (info != 0) {
|
||||||
|
std::stringstream ss;
|
||||||
|
ss << "inverse_impl: inversion failed with error code " << info;
|
||||||
|
throw std::runtime_error(ss.str());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void Inverse::eval(const std::vector<array>& inputs, array& output) {
|
||||||
|
if (inputs[0].dtype() != float32) {
|
||||||
|
throw std::runtime_error("[Inverse::eval] only supports float32.");
|
||||||
|
}
|
||||||
|
inverse_impl(inputs[0], output);
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace mlx::core
|
@ -49,8 +49,7 @@ void svd_impl(const array& a, array& u, array& s, array& vt) {
|
|||||||
|
|
||||||
// Will contain the indices of eigenvectors that failed to converge (not used
|
// Will contain the indices of eigenvectors that failed to converge (not used
|
||||||
// here but required by lapack).
|
// here but required by lapack).
|
||||||
std::vector<int> iwork;
|
auto iwork = array::Data{allocator::malloc_or_wait(sizeof(int) * 12 * K)};
|
||||||
iwork.resize(12 * K);
|
|
||||||
|
|
||||||
static const int lwork_query = -1;
|
static const int lwork_query = -1;
|
||||||
|
|
||||||
@ -82,7 +81,7 @@ void svd_impl(const array& a, array& u, array& s, array& vt) {
|
|||||||
/* ldvt = */ &ldvt,
|
/* ldvt = */ &ldvt,
|
||||||
/* work = */ &workspace_dimension,
|
/* work = */ &workspace_dimension,
|
||||||
/* lwork = */ &lwork_query,
|
/* lwork = */ &lwork_query,
|
||||||
/* iwork = */ iwork.data(),
|
/* iwork = */ static_cast<int*>(iwork.buffer.raw_ptr()),
|
||||||
/* info = */ &info);
|
/* info = */ &info);
|
||||||
|
|
||||||
if (info != 0) {
|
if (info != 0) {
|
||||||
@ -120,7 +119,7 @@ void svd_impl(const array& a, array& u, array& s, array& vt) {
|
|||||||
/* ldvt = */ &ldvt,
|
/* ldvt = */ &ldvt,
|
||||||
/* work = */ static_cast<float*>(scratch.buffer.raw_ptr()),
|
/* work = */ static_cast<float*>(scratch.buffer.raw_ptr()),
|
||||||
/* lwork = */ &lwork,
|
/* lwork = */ &lwork,
|
||||||
/* iwork = */ iwork.data(),
|
/* iwork = */ static_cast<int*>(iwork.buffer.raw_ptr()),
|
||||||
/* info = */ &info);
|
/* info = */ &info);
|
||||||
|
|
||||||
if (info != 0) {
|
if (info != 0) {
|
||||||
|
@ -900,4 +900,8 @@ void SVD::eval_gpu(
|
|||||||
throw std::runtime_error("[SVD::eval_gpu] Metal SVD NYI.");
|
throw std::runtime_error("[SVD::eval_gpu] Metal SVD NYI.");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Metal (GPU) kernel for matrix inversion is not implemented yet; the
// primitive must be evaluated on a CPU stream.
void Inverse::eval_gpu(const std::vector<array>& inputs, array& output) {
  throw std::runtime_error("[Inverse::eval_gpu] Metal inversion NYI.");
}
|
||||||
|
|
||||||
} // namespace mlx::core
|
} // namespace mlx::core
|
||||||
|
@ -98,6 +98,7 @@ NO_GPU_MULTI(SVD)
|
|||||||
NO_GPU(Tan)
|
NO_GPU(Tan)
|
||||||
NO_GPU(Tanh)
|
NO_GPU(Tanh)
|
||||||
NO_GPU(Transpose)
|
NO_GPU(Transpose)
|
||||||
|
NO_GPU(Inverse)
|
||||||
|
|
||||||
namespace fast {
|
namespace fast {
|
||||||
NO_GPU_MULTI(RoPE)
|
NO_GPU_MULTI(RoPE)
|
||||||
|
@ -238,4 +238,27 @@ std::vector<array> svd(const array& a, StreamOrDevice s /* = {} */) {
|
|||||||
{a});
|
{a});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Returns the multiplicative inverse of one or more square matrices.
//
// The input must be float32 with ndim >= 2; when the input has more than
// two dimensions, the inverse is computed for each matrix held in the last
// two axes. Throws std::invalid_argument for an unsupported dtype, for
// rank < 2, or when the trailing two dimensions are not square.
array inv(const array& a, StreamOrDevice s /* = {} */) {
  if (a.dtype() != float32) {
    std::ostringstream msg;
    // Fixed grammar: "must type" -> "must have type".
    msg << "[linalg::inv] Arrays must have type float32. Received array "
        << "with type " << a.dtype() << ".";
    throw std::invalid_argument(msg.str());
  }
  if (a.ndim() < 2) {
    std::ostringstream msg;
    msg << "[linalg::inv] Arrays must have >= 2 dimensions. Received array "
           "with "
        << a.ndim() << " dimensions.";
    throw std::invalid_argument(msg.str());
  }
  if (a.shape(-1) != a.shape(-2)) {
    throw std::invalid_argument(
        "[linalg::inv] Inverses are only defined for square matrices.");
  }

  return array(
      a.shape(), a.dtype(), std::make_unique<Inverse>(to_stream(s)), {a});
}
|
||||||
|
|
||||||
} // namespace mlx::core::linalg
|
} // namespace mlx::core::linalg
|
||||||
|
@ -64,4 +64,6 @@ std::pair<array, array> qr(const array& a, StreamOrDevice s = {});
|
|||||||
|
|
||||||
std::vector<array> svd(const array& a, StreamOrDevice s = {});
|
std::vector<array> svd(const array& a, StreamOrDevice s = {});
|
||||||
|
|
||||||
|
array inv(const array& a, StreamOrDevice s = {});
|
||||||
|
|
||||||
} // namespace mlx::core::linalg
|
} // namespace mlx::core::linalg
|
||||||
|
@ -1897,4 +1897,18 @@ class SVD : public Primitive {
|
|||||||
void eval(const std::vector<array>& inputs, std::vector<array>& outputs);
|
void eval(const std::vector<array>& inputs, std::vector<array>& outputs);
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/* Matrix inversion primitive. */
|
||||||
|
class Inverse : public UnaryPrimitive {
|
||||||
|
public:
|
||||||
|
explicit Inverse(Stream stream) : UnaryPrimitive(stream){};
|
||||||
|
|
||||||
|
void eval_cpu(const std::vector<array>& inputs, array& output) override;
|
||||||
|
void eval_gpu(const std::vector<array>& inputs, array& output) override;
|
||||||
|
|
||||||
|
DEFINE_PRINT(Inverse)
|
||||||
|
|
||||||
|
private:
|
||||||
|
void eval(const std::vector<array>& inputs, array& output);
|
||||||
|
};
|
||||||
|
|
||||||
} // namespace mlx::core
|
} // namespace mlx::core
|
||||||
|
@ -241,4 +241,27 @@ void init_linalg(py::module_& parent_module) {
|
|||||||
tuple(array, array, array): The ``U``, ``S``, and ``Vt`` matrices, such that
|
tuple(array, array, array): The ``U``, ``S``, and ``Vt`` matrices, such that
|
||||||
``A = U @ diag(S) @ Vt``
|
``A = U @ diag(S) @ Vt``
|
||||||
)pbdoc");
|
)pbdoc");
|
||||||
|
m.def(
|
||||||
|
"inv",
|
||||||
|
&inv,
|
||||||
|
"a"_a,
|
||||||
|
py::kw_only(),
|
||||||
|
"stream"_a = none,
|
||||||
|
R"pbdoc(
|
||||||
|
inv(a: array, *, stream: Union[None, Stream, Device] = None) -> array
|
||||||
|
|
||||||
|
Compute the inverse of a square matrix.
|
||||||
|
|
||||||
|
This function supports arrays with at least 2 dimensions. When the input
|
||||||
|
has more than two dimensions, the inverse is computed for each matrix
|
||||||
|
in the last two dimensions of ``a``.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
a (array): Input array.
|
||||||
|
stream (Stream, optional): Stream or device. Defaults to ``None``
|
||||||
|
in which case the default stream of the default device is used.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
array: ``ainv`` such that ``dot(a, ainv) = dot(ainv, a) = eye(a.shape[0])``
|
||||||
|
)pbdoc");
|
||||||
}
|
}
|
||||||
|
@ -136,6 +136,20 @@ class TestLinalg(mlx_tests.MLXTestCase):
|
|||||||
mx.allclose(U[:, : len(S)] @ mx.diag(S) @ Vt, M, rtol=1e-5, atol=1e-7)
|
mx.allclose(U[:, : len(S)] @ mx.diag(S) @ Vt, M, rtol=1e-5, atol=1e-7)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def test_inverse(self):
    # Single 3x3 matrix: A @ inv(A) should reproduce the identity.
    A = mx.array([[1, 2, 3], [6, -5, 4], [-9, 8, 7]], dtype=mx.float32)
    A_inv = mx.linalg.inv(A, stream=mx.cpu)
    self.assertTrue(mx.allclose(A @ A_inv, mx.eye(A.shape[0]), rtol=0, atol=1e-6))

    # Batched input: the inverse is computed per matrix in the stack.
    B = A - 100
    AB = mx.stack([A, B])
    invs = mx.linalg.inv(AB, stream=mx.cpu)
    for mat, mat_inv in zip(AB, invs):
        self.assertTrue(
            mx.allclose(mat @ mat_inv, mx.eye(mat.shape[0]), rtol=0, atol=1e-5)
        )
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
unittest.main()
|
unittest.main()
|
||||||
|
@ -300,3 +300,25 @@ TEST_CASE("test SVD factorization") {
|
|||||||
CHECK_EQ(S.dtype(), float32);
|
CHECK_EQ(S.dtype(), float32);
|
||||||
CHECK_EQ(Vt.dtype(), float32);
|
CHECK_EQ(Vt.dtype(), float32);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
TEST_CASE("test matrix inversion") {
  // Scalars (0D) and vectors (1D) are not invertible.
  CHECK_THROWS(linalg::inv(array(0.0), Device::cpu));
  CHECK_THROWS(linalg::inv(array({0.0, 1.0}), Device::cpu));

  // Only float32 is supported; integer input throws.
  CHECK_THROWS(linalg::inv(array({0, 1}, {1, 2}), Device::cpu));

  // Rectangular (non-square) matrices throw.
  CHECK_THROWS(linalg::inv(array({1, 2, 3, 4, 5, 6}, {2, 3}), Device::cpu));

  // A random 5x5 matrix times its inverse is the identity (both sides).
  const auto key = random::key(42);
  const auto mat = random::normal({5, 5}, key);
  const auto mat_inv = linalg::inv(mat, Device::cpu);
  const auto id_mat = eye(mat.shape(0));

  CHECK(
      allclose(matmul(mat, mat_inv), id_mat, /* rtol = */ 0, /* atol = */ 1e-6)
          .item<bool>());
  CHECK(
      allclose(matmul(mat_inv, mat), id_mat, /* rtol = */ 0, /* atol = */ 1e-6)
          .item<bool>());
}
||||||
|
Loading…
Reference in New Issue
Block a user