Add cpp op and primitive for tunable matmul

2025-12-16 01:49:05 +08:00 · 2024-10-29 13:54:16 -07:00
parent 84e7c49f08
commit 2ed2e0e3da
3 changed files with 139 additions and 0 deletions
--- a/mlx/internal/tuner/ops.cpp
+++ b/mlx/internal/tuner/ops.cpp
@@ -0,0 +1,88 @@
 // Copyright © 2024 Apple Inc.
 #include "mlx/internal/tuner/ops.h"
 #include "mlx/internal/tuner/primitives.h"
 #include "mlx/ops.h"
 namespace mlx::core::internal {
 array tunable_matmul(
    const array& in_a,
    const array& in_b,
    const std::unordered_map<std::string, int>& tparams,
    StreamOrDevice s_ /*= {} */) {
  auto s = to_stream(s_);
  auto fallback = [s](const std::vector<array>& inputs) {
    return std::vector<array>{matmul(inputs[0], inputs[1], s)};
  };
  if (s.device == Device::cpu || in_a.ndim() < 2 || in_b.ndim() < 2) {
    return matmul(in_a, in_b, s);
  }
  auto a = in_a;
  auto b = in_b;
  if (a.shape(-1) != b.shape(-2)) {
    std::ostringstream msg;
    msg << "[matmul] Last dimension of first input with shape " << a.shape()
        << " must match second to last dimension of"
        << " second input with shape " << b.shape() << ".";
    throw std::invalid_argument(msg.str());
  }
  auto out_type = promote_types(a.dtype(), b.dtype());
  if (!issubdtype(out_type, floating)) {
    std::ostringstream msg;
    msg << "[matmul] Only real floating point types are supported but "
        << a.dtype() << " and " << b.dtype() << " were provided which results"
        << " in " << out_type << ", which is not a real floating point type.";
    throw std::invalid_argument(msg.str());
  }
  a = astype(a, out_type, s);
  b = astype(b, out_type, s);
  // We can batch the multiplication by reshaping a
  if (a.ndim() > 2 && b.ndim() == 2) {
    std::vector<int> out_shape = a.shape();
    a = reshape(a, {-1, out_shape.back()}, s);
    out_shape.back() = b.shape(-1);
    if (in_b.ndim() == 1) {
      out_shape.pop_back();
    }
    auto out = array(
        {a.shape(0), b.shape(1)},
        out_type,
        std::make_shared<TunableMatmul>(to_stream(s), fallback, tparams),
        {a, b});
    return reshape(out, out_shape, s);
  }
  if (a.ndim() > 2 || b.ndim() > 2) {
    std::vector<int> bsx_a(a.shape().begin(), a.shape().end() - 2);
    std::vector<int> bsx_b(b.shape().begin(), b.shape().end() - 2);
    auto inner_shape = broadcast_shapes(bsx_a, bsx_b);
    // Broadcast a
    inner_shape.push_back(a.shape(-2));
    inner_shape.push_back(a.shape(-1));
    a = broadcast_to(a, inner_shape, s);
    // Broadcast b
    *(inner_shape.end() - 2) = b.shape(-2);
    *(inner_shape.end() - 1) = b.shape(-1);
    b = broadcast_to(b, inner_shape, s);
  }
  auto out_shape = a.shape();
  out_shape.back() = b.shape(-1);
  return array(
      std::move(out_shape),
      out_type,
      std::make_shared<TunableMatmul>(to_stream(s), fallback, tparams),
      {a, b});
 }
 } // namespace mlx::core::internal
--- a/mlx/internal/tuner/ops.h
+++ b/mlx/internal/tuner/ops.h
@@ -0,0 +1,17 @@
 // Copyright © 2024 Apple Inc.
 #pragma once
 #include <unordered_map>
 #include "mlx/utils.h"
 namespace mlx::core::internal {
 array tunable_matmul(
    const array& a,
    const array& b,
    const std::unordered_map<std::string, int>& tparams,
    StreamOrDevice s = {});
 } // namespace mlx::core::internal
--- a/mlx/internal/tuner/primitives.h
+++ b/mlx/internal/tuner/primitives.h
@@ -0,0 +1,34 @@
 // Copyright © 2024 Apple Inc.
 #pragma once
 #include <unordered_map>
 #include "mlx/fast_primitives.h"
 #include "mlx/primitives.h"
 namespace mlx::core::internal {
 class TunableMatmul : public mlx::core::fast::Custom {
 public:
  TunableMatmul(
      Stream stream,
      std::function<std::vector<array>(std::vector<array>)> fallback,
      std::unordered_map<std::string, int> tparams)
      : mlx::core::fast::Custom(stream, fallback), tparams_(tparams) {}
  void eval_cpu(const std::vector<array>& inputs, std::vector<array>& outputs)
      override {
    throw std::runtime_error("NYI");
  }
  void eval_gpu(const std::vector<array>& inputs, std::vector<array>& outputs)
      override;
  DEFINE_PRINT(TunableMatmul)
 private:
  std::function<std::vector<array>(std::vector<array>)> fallback_;
  std::unordered_map<std::string, int> tparams_;
 };
 } // namespace mlx::core::internal