CUDA backend: matmul (#2241)

2025-12-16 01:49:05 +08:00 · 2025-06-07 04:24:04 +09:00
parent c6a20b427a
commit 24f89173d1
7 changed files with 584 additions and 67 deletions
--- a/mlx/backend/metal/matmul.cpp
+++ b/mlx/backend/metal/matmul.cpp
@@ -6,7 +6,7 @@
 #include <sstream>

 #include "mlx/backend/common/broadcasting.h"
-#include "mlx/backend/common/utils.h"
+#include "mlx/backend/common/matmul.h"
 #include "mlx/backend/gpu/copy.h"
 #include "mlx/backend/metal/device.h"
 #include "mlx/backend/metal/kernels.h"
@@ -21,69 +21,6 @@ namespace mlx::core {

 namespace {

-inline auto collapse_batches(const array& a, const array& b) {
-  // Get and check the shape for the batched dims
-  Shape A_bshape{a.shape().begin(), a.shape().end() - 2};
-  Shape B_bshape{b.shape().begin(), b.shape().end() - 2};
-  if (A_bshape != B_bshape) {
-    std::ostringstream msg;
-    msg << "[matmul] Got matrices with incorrectly broadcasted shapes: " << "A "
-        << a.shape() << ", B " << b.shape() << ".";
-    throw std::runtime_error(msg.str());
-  }
-
-  Strides A_bstride{a.strides().begin(), a.strides().end() - 2};
-  Strides B_bstride{b.strides().begin(), b.strides().end() - 2};
-
-  auto [batch_shape, batch_strides] =
-      collapse_contiguous_dims(A_bshape, std::vector{A_bstride, B_bstride});
-
-  auto A_batch_stride = batch_strides[0];
-  auto B_batch_stride = batch_strides[1];
-
-  if (batch_shape.empty()) {
-    batch_shape.push_back(1);
-    A_batch_stride.push_back(0);
-    B_batch_stride.push_back(0);
-  }
-
-  return std::make_tuple(batch_shape, A_batch_stride, B_batch_stride);
-}
-
-inline auto collapse_batches(const array& a, const array& b, const array& c) {
-  // Get and check the shape for the batched dims
-  Shape A_bshape{a.shape().begin(), a.shape().end() - 2};
-  Shape B_bshape{b.shape().begin(), b.shape().end() - 2};
-  Shape C_bshape{c.shape().begin(), c.shape().end() - 2};
-  if (A_bshape != B_bshape || A_bshape != C_bshape) {
-    std::ostringstream msg;
-    msg << "[addmm] Got matrices with incorrectly broadcasted shapes: " << "A "
-        << a.shape() << ", B " << b.shape() << ", B " << c.shape() << ".";
-    throw std::runtime_error(msg.str());
-  }
-
-  Strides A_bstride{a.strides().begin(), a.strides().end() - 2};
-  Strides B_bstride{b.strides().begin(), b.strides().end() - 2};
-  Strides C_bstride{c.strides().begin(), c.strides().end() - 2};
-
-  auto [batch_shape, batch_strides] = collapse_contiguous_dims(
-      A_bshape, std::vector{A_bstride, B_bstride, C_bstride});
-
-  auto A_batch_stride = batch_strides[0];
-  auto B_batch_stride = batch_strides[1];
-  auto C_batch_stride = batch_strides[2];
-
-  if (batch_shape.empty()) {
-    batch_shape.push_back(1);
-    A_batch_stride.push_back(0);
-    B_batch_stride.push_back(0);
-    C_batch_stride.push_back(0);
-  }
-
-  return std::make_tuple(
-      batch_shape, A_batch_stride, B_batch_stride, C_batch_stride);
-}
-
 std::tuple<bool, int64_t, array> check_transpose(
    std::vector<array>& copies,
    const Stream& s,