CPU mx.linalg.cholesky_inverse and mx.linalg.tri_inv (#1307)

* add cholesky inv + tri inv * always run tri_inv on cpu * consistent naming
2025-12-16 01:49:05 +08:00 · 2024-08-08 15:18:02 -07:00
parent 780c197f95
commit 32668a7317
7 changed files with 267 additions and 62 deletions
--- a/mlx/backend/common/inverse.cpp
+++ b/mlx/backend/common/inverse.cpp
@@ -10,9 +10,106 @@
 #include <lapack.h>
 #endif

+// Calls LAPACK strtri_ on an N x N matrix (lda == N) and returns its `info`.
+// Some LAPACK builds also expect the hidden Fortran lengths of uplo and diag.
+int strtri_wrapper(char uplo, char diag, float* matrix, int N) {
+  int info;
+
+#ifdef LAPACK_FORTRAN_STRLEN_END
+  strtri_(
+      /* uplo = */ &uplo,
+      /* diag = */ &diag,
+      /* N = */ &N,
+      /* a = */ matrix,
+      /* lda = */ &N,
+      /* info = */ &info,
+      /* uplo_len = */ static_cast<size_t>(1),
+      /* diag_len = */ static_cast<size_t>(1));
+#else
+  strtri_(
+      /* uplo = */ &uplo,
+      /* diag = */ &diag,
+      /* N = */ &N,
+      /* a = */ matrix,
+      /* lda = */ &N,
+      /* info = */ &info);
+#endif
+
+  return info;
+}
+
 namespace mlx::core {

-void inverse_impl(const array& a, array& inv) {
+void general_inv(array& inv, int N, int i) {
+  int info;
+  auto ipiv = array::Data{allocator::malloc_or_wait(sizeof(int) * N)};
+  // Inverts matrix i of inv via LU: sgetrf_ factorizes and fills ipiv.
+  sgetrf_(
+      /* m = */ &N,
+      /* n = */ &N,
+      /* a = */ inv.data<float>() + N * N * i,
+      /* lda = */ &N,
+      /* ipiv = */ static_cast<int*>(ipiv.buffer.raw_ptr()),
+      /* info = */ &info);
+
+  if (info != 0) {
+    std::stringstream ss;
+    ss << "inverse_impl: LU factorization failed with error code " << info;
+    throw std::runtime_error(ss.str());
+  }
+
+  static const int lwork_query = -1;
+  float workspace_size = 0;
+
+  // Workspace query: with lwork == -1, sgetri_ only reports the needed size.
+  sgetri_(
+      /* m = */ &N,
+      /* a = */ nullptr,
+      /* lda = */ &N,
+      /* ipiv = */ nullptr,
+      /* work = */ &workspace_size,
+      /* lwork = */ &lwork_query,
+      /* info = */ &info);
+
+  if (info != 0) {
+    std::stringstream ss;
+    ss << "inverse_impl: LU workspace calculation failed with error code "
+       << info;
+    throw std::runtime_error(ss.str());
+  }
+
+  const int lwork = workspace_size; // size is reported as a float
+  auto scratch = array::Data{allocator::malloc_or_wait(sizeof(float) * lwork)};
+
+  // Invert from the LU factors; the result overwrites matrix i in inv.
+  sgetri_(
+      /* m = */ &N,
+      /* a = */ inv.data<float>() + N * N * i,
+      /* lda = */ &N,
+      /* ipiv = */ static_cast<int*>(ipiv.buffer.raw_ptr()),
+      /* work = */ static_cast<float*>(scratch.buffer.raw_ptr()),
+      /* lwork = */ &lwork,
+      /* info = */ &info);
+
+  if (info != 0) {
+    std::stringstream ss;
+    ss << "inverse_impl: inversion failed with error code " << info;
+    throw std::runtime_error(ss.str());
+  }
+}
+
+void tri_inv(array& inv, int N, int i, bool upper) {
+  const char uplo = upper ? 'L' : 'U'; // swapped: LAPACK is column-major
+  const char diag = 'N'; // non-unit diagonal
+  int info = strtri_wrapper(uplo, diag, inv.data<float>() + N * N * i, N);
+  if (info != 0) {
+    std::stringstream ss;
+    ss << "inverse_impl: triangular inversion failed with error code " << info;
+    throw std::runtime_error(ss.str());
+  }
+}
+
+void inverse_impl(const array& a, array& inv, bool tri, bool upper) {
  // Lapack uses the column-major convention. We take advantage of the following
  // identity to avoid transposing (see
  // https://math.stackexchange.com/a/340234):
@@ -24,63 +121,11 @@ void inverse_impl(const array& a, array& inv) {
  const int N = a.shape(-1);
  const size_t num_matrices = a.size() / (N * N);

-  int info;
-  auto ipiv = array::Data{allocator::malloc_or_wait(sizeof(int) * N)};
-
  for (int i = 0; i < num_matrices; i++) {
-    // Compute LU factorization.
-    sgetrf_(
-        /* m = */ &N,
-        /* n = */ &N,
-        /* a = */ inv.data<float>() + N * N * i,
-        /* lda = */ &N,
-        /* ipiv = */ static_cast<int*>(ipiv.buffer.raw_ptr()),
-        /* info = */ &info);
-
-    if (info != 0) {
-      std::stringstream ss;
-      ss << "inverse_impl: LU factorization failed with error code " << info;
-      throw std::runtime_error(ss.str());
-    }
-
-    static const int lwork_query = -1;
-    float workspace_size = 0;
-
-    // Compute workspace size.
-    sgetri_(
-        /* m = */ &N,
-        /* a = */ nullptr,
-        /* lda = */ &N,
-        /* ipiv = */ nullptr,
-        /* work = */ &workspace_size,
-        /* lwork = */ &lwork_query,
-        /* info = */ &info);
-
-    if (info != 0) {
-      std::stringstream ss;
-      ss << "inverse_impl: LU workspace calculation failed with error code "
-         << info;
-      throw std::runtime_error(ss.str());
-    }
-
-    const int lwork = workspace_size;
-    auto scratch =
-        array::Data{allocator::malloc_or_wait(sizeof(float) * lwork)};
-
-    // Compute inverse.
-    sgetri_(
-        /* m = */ &N,
-        /* a = */ inv.data<float>() + N * N * i,
-        /* lda = */ &N,
-        /* ipiv = */ static_cast<int*>(ipiv.buffer.raw_ptr()),
-        /* work = */ static_cast<float*>(scratch.buffer.raw_ptr()),
-        /* lwork = */ &lwork,
-        /* info = */ &info);
-
-    if (info != 0) {
-      std::stringstream ss;
-      ss << "inverse_impl: inversion failed with error code " << info;
-      throw std::runtime_error(ss.str());
+    if (tri) {
+      tri_inv(inv, N, i, upper);
+    } else {
+      general_inv(inv, N, i);
    }
  }
 }
@@ -89,7 +134,7 @@ void Inverse::eval(const std::vector<array>& inputs, array& output) {
  if (inputs[0].dtype() != float32) {
    throw std::runtime_error("[Inverse::eval] only supports float32.");
  }
-  inverse_impl(inputs[0], output);
+  inverse_impl(inputs[0], output, tri_, upper_);
 }

 } // namespace mlx::core
--- a/mlx/linalg.cpp
+++ b/mlx/linalg.cpp
@@ -238,7 +238,7 @@ std::vector<array> svd(const array& a, StreamOrDevice s /* = {} */) {
      {a});
 }

-array inv(const array& a, StreamOrDevice s /* = {} */) {
+array inv_impl(const array& a, bool tri, bool upper, StreamOrDevice s) {
  if (a.dtype() != float32) {
    std::ostringstream msg;
    msg << "[linalg::inv] Arrays must type float32. Received array "
@@ -258,7 +258,21 @@ array inv(const array& a, StreamOrDevice s /* = {} */) {
  }

  return array(
-      a.shape(), a.dtype(), std::make_shared<Inverse>(to_stream(s)), {a});
+      a.shape(),
+      a.dtype(),
+      std::make_shared<Inverse>(to_stream(s), tri, upper),
+      {a});
+}
+
+array inv(const array& a, StreamOrDevice s /* = {} */) {
+  return inv_impl(a, /*tri=*/false, /*upper=*/true, s); // upper is unused
+}
+
+array tri_inv(
+    const array& a,
+    bool upper /* = false */,
+    StreamOrDevice s /* = {} */) {
+  return inv_impl(a, /*tri=*/true, upper, s);
+}

 array cholesky(
@@ -292,4 +306,37 @@ array cholesky(
      {a});
 }

+array cholesky_inv( // Computes A^-1 from a Cholesky factor of A.
+    const array& L, // square float32 triangular factor, ndim >= 2
+    bool upper /* = false */, // true: A = L^T L, false: A = L L^T
+    StreamOrDevice s /* = {} */) {
+  if (L.dtype() != float32) {
+    std::ostringstream msg;
+    msg << "[linalg::cholesky_inv] Arrays must type float32. Received array "
+        << "with type " << L.dtype() << ".";
+    throw std::invalid_argument(msg.str());
+  }
+
+  if (L.ndim() < 2) {
+    std::ostringstream msg;
+    msg << "[linalg::cholesky_inv] Arrays must have >= 2 dimensions. Received array "
+           "with "
+        << L.ndim() << " dimensions.";
+    throw std::invalid_argument(msg.str());
+  }
+
+  if (L.shape(-1) != L.shape(-2)) {
+    throw std::invalid_argument(
+        "[linalg::cholesky_inv] Cholesky inverse is only defined for square "
+        "matrices.");
+  }
+
+  array L_inv = tri_inv(L, upper, s); // invert the triangular factor first
+  if (upper) {
+    return matmul(L_inv, swapaxes(L_inv, -1, -2, s), s); // U^-1 U^-T
+  } else {
+    return matmul(swapaxes(L_inv, -1, -2, s), L_inv, s); // L^-T L^-1
+  }
+}
+
 } // namespace mlx::core::linalg
--- a/mlx/linalg.h
+++ b/mlx/linalg.h
@@ -66,6 +66,10 @@ std::vector<array> svd(const array& a, StreamOrDevice s = {});

 array inv(const array& a, StreamOrDevice s = {});

+array tri_inv(const array& a, bool upper = false, StreamOrDevice s = {});
+
 array cholesky(const array& a, bool upper = false, StreamOrDevice s = {});

+array cholesky_inv(const array& a, bool upper = false, StreamOrDevice s = {});
+
 } // namespace mlx::core::linalg
--- a/mlx/primitives.h
+++ b/mlx/primitives.h
@@ -2127,7 +2127,8 @@ class SVD : public Primitive {
 /* Matrix inversion primitive. */
 class Inverse : public UnaryPrimitive {
 public:
-  explicit Inverse(Stream stream) : UnaryPrimitive(stream) {}
+  explicit Inverse(Stream stream, bool tri, bool upper)
+      : UnaryPrimitive(stream), tri_(tri), upper_(upper) {}

  void eval_cpu(const std::vector<array>& inputs, array& output) override;
  void eval_gpu(const std::vector<array>& inputs, array& output) override;
@@ -2137,6 +2138,8 @@ class Inverse : public UnaryPrimitive {

 private:
  void eval(const std::vector<array>& inputs, array& output);
+  bool tri_;
+  bool upper_;
 };

 class Cholesky : public UnaryPrimitive {