CPU LU factorization and linear solvers (#1451)

* linalg solve backend

* nits

* more nits + fix

* luf primitive and lu, solve, and solve_triangular backends

* changes / nits

---------

Co-authored-by: Awni Hannun <awni@apple.com>
This commit is contained in:
Abe Leininger
2025-02-10 14:32:24 -06:00
committed by GitHub
parent 7df3f792a2
commit a5ededf1c3
12 changed files with 571 additions and 15 deletions

View File

@@ -14,13 +14,6 @@ namespace mx = mlx::core;
namespace nb = nanobind;
using namespace nb::literals;
namespace {
nb::tuple svd_helper(const mx::array& a, mx::StreamOrDevice s /* = {} */) {
const auto result = mx::linalg::svd(a, s);
return nb::make_tuple(result.at(0), result.at(1), result.at(2));
}
} // namespace
void init_linalg(nb::module_& parent_module) {
auto m = parent_module.def_submodule(
"linalg", "mlx.core.linalg: linear algebra routines.");
@@ -213,7 +206,10 @@ void init_linalg(nb::module_& parent_module) {
)pbdoc");
m.def(
"svd",
&svd_helper,
[](const mx::array& a, mx::StreamOrDevice s /* = {} */) {
const auto result = mx::linalg::svd(a, s);
return nb::make_tuple(result.at(0), result.at(1), result.at(2));
},
"a"_a,
nb::kw_only(),
"stream"_a = nb::none(),
@@ -262,7 +258,7 @@ void init_linalg(nb::module_& parent_module) {
"tri_inv",
&mx::linalg::tri_inv,
"a"_a,
"upper"_a,
"upper"_a = false,
nb::kw_only(),
"stream"_a = nb::none(),
nb::sig(
@@ -276,7 +272,7 @@ void init_linalg(nb::module_& parent_module) {
Args:
a (array): Input array.
upper (array): Whether the array is upper or lower triangular. Defaults to ``False``.
upper (bool, optional): Whether the array is upper or lower triangular. Defaults to ``False``.
stream (Stream, optional): Stream or device. Defaults to ``None``
in which case the default stream of the default device is used.
@@ -441,7 +437,6 @@ void init_linalg(nb::module_& parent_module) {
m.def(
"eigh",
[](const mx::array& a, const std::string UPLO, mx::StreamOrDevice s) {
// TODO avoid cast?
auto result = mx::linalg::eigh(a, UPLO, s);
return nb::make_tuple(result.first, result.second);
},
@@ -484,4 +479,102 @@ void init_linalg(nb::module_& parent_module) {
array([[ 0.707107, -0.707107],
[ 0.707107, 0.707107]], dtype=float32)
)pbdoc");
m.def(
"lu",
[](const mx::array& a, mx::StreamOrDevice s /* = {} */) {
auto result = mx::linalg::lu(a, s);
return nb::make_tuple(result.at(0), result.at(1), result.at(2));
},
"a"_a,
nb::kw_only(),
"stream"_a = nb::none(),
nb::sig(
"def lu(a: array, *, stream: Union[None, Stream, Device] = None) -> Tuple[array, array, array]"),
R"pbdoc(
Compute the LU factorization of the given matrix ``A``.
Note, unlike the default behavior of ``scipy.linalg.lu``, the pivots
are indices. To reconstruct the input use ``L[P, :] @ U`` for 2
dimensions or ``mx.take_along_axis(L, P[..., None], axis=-2) @ U``
for more than 2 dimensions.
To construct the full permutation matrix, do:
.. code-block::
P = mx.put_along_axis(mx.zeros_like(L), p[..., None], mx.array(1.0), axis=-1)
Args:
a (array): Input array.
stream (Stream, optional): Stream or device. Defaults to ``None``
in which case the default stream of the default device is used.
Returns:
tuple(array, array, array):
The ``P``, ``L``, and ``U`` arrays, such that ``A = L[P, :] @ U``
)pbdoc");
m.def(
"lu_factor",
&mx::linalg::lu_factor,
"a"_a,
nb::kw_only(),
"stream"_a = nb::none(),
nb::sig(
"def lu_factor(a: array, *, stream: Union[None, Stream, Device] = None) -> Tuple[array, array]"),
R"pbdoc(
Computes a compact representation of the LU factorization.
Args:
a (array): Input array.
stream (Stream, optional): Stream or device. Defaults to ``None``
in which case the default stream of the default device is used.
Returns:
tuple(array, array): The ``LU`` matrix and ``pivots`` array.
)pbdoc");
m.def(
"solve",
&mx::linalg::solve,
"a"_a,
"b"_a,
nb::kw_only(),
"stream"_a = nb::none(),
nb::sig(
"def solve(a: array, b: array, *, stream: Union[None, Stream, Device] = None) -> array"),
R"pbdoc(
Compute the solution to a system of linear equations ``AX = B``.
Args:
a (array): Input array.
b (array): Input array.
stream (Stream, optional): Stream or device. Defaults to ``None``
in which case the default stream of the default device is used.
Returns:
array: The unique solution to the system ``AX = B``.
)pbdoc");
m.def(
"solve_triangular",
&mx::linalg::solve_triangular,
"a"_a,
"b"_a,
nb::kw_only(),
"upper"_a = false,
"stream"_a = nb::none(),
nb::sig(
"def solve_triangular(a: array, b: array, *, upper: bool = False, stream: Union[None, Stream, Device] = None) -> array"),
R"pbdoc(
Computes the solution of a triangular system of linear equations ``AX = B``.
Args:
a (array): Input array.
b (array): Input array.
upper (bool, optional): Whether the array is upper or lower
triangular. Default: ``False``.
stream (Stream, optional): Stream or device. Defaults to ``None``
in which case the default stream of the default device is used.
Returns:
array: The unique solution to the system ``AX = B``.
)pbdoc");
}

View File

@@ -330,6 +330,123 @@ class TestLinalg(mlx_tests.MLXTestCase):
mx.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])
) # Non-square matrix
def test_lu(self):
    """LU factorization: invalid inputs raise, and P/L/U reconstruct the input."""
    # Scalars, 1-D arrays, and integer matrices are all rejected.
    for bad in (mx.array(0.0), mx.array([0.0, 1.0]), mx.array([[0, 1], [1, 0]])):
        with self.assertRaises(ValueError):
            mx.linalg.lu(bad, stream=mx.cpu)
    # Single 3x3 matrix: applying the pivots as row indices rebuilds the input.
    square = mx.array([[3.0, 1.0, 2.0], [1.0, 8.0, 6.0], [9.0, 2.0, 5.0]])
    P, L, U = mx.linalg.lu(square, stream=mx.cpu)
    self.assertTrue(mx.allclose(L[P, :] @ U, square))
    # Batched input: pivots are applied per-matrix via take_along_axis.
    batched = mx.broadcast_to(square, (5, 5, 3, 3))
    P, L, U = mx.linalg.lu(batched, stream=mx.cpu)
    permuted = mx.take_along_axis(L, P[..., None], axis=-2)
    self.assertTrue(mx.allclose(permuted @ U, batched))
def test_lu_factor(self):
    # Verifies the compact LU representation: ``LU`` packs the unit-lower
    # factor (below the diagonal) and ``U`` (on/above it); ``pivots`` lists
    # sequential row swaps, expanded here into an explicit permutation.
    mx.random.seed(7)
    # Test 5x5 matrix (random, seeded for determinism)
    a = mx.random.uniform(shape=(5, 5))
    LU, pivots = mx.linalg.lu_factor(a, stream=mx.cpu)
    n = a.shape[-1]
    pivots = pivots.tolist()
    # Apply each recorded swap in order to build the row permutation.
    perm = list(range(n))
    for i in range(len(pivots)):
        perm[i], perm[pivots[i]] = perm[pivots[i]], perm[i]
    # L carries an implicit unit diagonal, hence the added identity.
    L = mx.add(mx.tril(LU, k=-1), mx.eye(n))
    U = mx.triu(LU)
    self.assertTrue(mx.allclose(L @ U, a[perm, :]))
def test_solve(self):
    """mx.linalg.solve matches numpy.linalg.solve across shapes and sizes."""
    mx.random.seed(7)

    def check(a, b, **tol):
        # Solve on the CPU stream and compare against the NumPy reference.
        got = mx.linalg.solve(a, b, stream=mx.cpu)
        self.assertTrue(np.allclose(got, np.linalg.solve(a, b), **tol))

    # 3x3 system with a 1-D right-hand side.
    a = mx.array([[3.0, 1.0, 2.0], [1.0, 8.0, 6.0], [9.0, 2.0, 5.0]])
    check(a, mx.array([11.0, 35.0, 28.0]))
    # Symmetric positive-definite matrix (well conditioned by construction).
    N = 5
    a = mx.random.uniform(shape=(N, N))
    a = mx.matmul(a, a.T) + N * mx.eye(N)
    check(a, mx.random.uniform(shape=(N, 1)))
    # Batched systems.
    check(
        mx.random.uniform(shape=(5, 5, 4, 4)),
        mx.random.uniform(shape=(5, 5, 4, 1)),
        atol=1e-5,
    )
    # Large system — looser tolerance for accumulated rounding.
    N = 1000
    check(
        mx.random.uniform(shape=(N, N)),
        mx.random.uniform(shape=(N, 1)),
        atol=1e-3,
    )
    # Multi-column right-hand side.
    a = mx.random.uniform(shape=(5, 5))
    b = mx.random.uniform(shape=(5, 8))
    check(a, b)
    # Broadcast-batched multi-column right-hand side.
    check(
        mx.broadcast_to(a, (3, 2, 5, 5)),
        mx.broadcast_to(b, (3, 1, 5, 8)),
        rtol=1e-5,
        atol=1e-5,
    )
def test_solve_triangular(self):
    """solve_triangular agrees with a dense NumPy solve on triangular systems."""
    # Lower-triangular system.
    lower = mx.array([[4.0, 0.0, 0.0], [2.0, 3.0, 0.0], [1.0, -2.0, 5.0]])
    rhs = mx.array([8.0, 14.0, 3.0])
    out = mx.linalg.solve_triangular(lower, rhs, upper=False, stream=mx.cpu)
    self.assertTrue(np.allclose(out, np.linalg.solve(lower, rhs)))
    # Upper-triangular system.
    upper_mat = mx.array([[3.0, 2.0, 1.0], [0.0, 5.0, 4.0], [0.0, 0.0, 6.0]])
    rhs = mx.array([13.0, 33.0, 18.0])
    out = mx.linalg.solve_triangular(upper_mat, rhs, upper=True, stream=mx.cpu)
    self.assertTrue(np.allclose(out, np.linalg.solve(upper_mat, rhs)))
    # Broadcast batch with a multi-column right-hand side.
    batched_a = mx.broadcast_to(upper_mat, (3, 4, 3, 3))
    batched_b = mx.broadcast_to(mx.expand_dims(rhs, -1), (3, 4, 3, 8))
    out = mx.linalg.solve_triangular(batched_a, batched_b, upper=True, stream=mx.cpu)
    self.assertTrue(np.allclose(out, np.linalg.solve(batched_a, batched_b)))
# Allow running this test file directly: discover and run all TestCase methods.
if __name__ == "__main__":
    unittest.main()