Add io device and cpu::make_task

Commit be36f136de (parent 9814a2ae12).
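In short, this commit (1) adds an io entry to Device::DeviceType, (2) gives Primitive a virtual eval_io hook that throws until a primitive overrides it, and (3) moves the CPU evaluation task out of eval_impl into a reusable cpu::make_task, with a matching cpu::make_synchronize_task. A minimal sketch of how the new CPU helper is consumed, mirroring the call sites changed below (the wrapper function and the fully qualified names are illustrative, not part of the commit):

// Illustrative sketch only: assumes an `arr` to evaluate on the CPU stream
// `stream` and a `signal` flag computed by the caller, as in eval_impl below.
#include "mlx/backend/common/cpu_impl.h"
#include "mlx/scheduler.h"

void enqueue_cpu_eval(mlx::core::array arr, mlx::core::Stream stream, bool signal) {
  // make_task packages the input-event waits, the eval_cpu call, the
  // detach/signal step, and the scheduler bookkeeping into one callable.
  mlx::core::scheduler::enqueue(
      stream, mlx::core::cpu::make_task(std::move(arr), signal));
}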
@@ -55,6 +55,7 @@ target_sources(
   ${CMAKE_CURRENT_SOURCE_DIR}/qrf.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/svd.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/inverse.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/cpu_impl.cpp
   ${CMAKE_CURRENT_BINARY_DIR}/compiled_preamble.cpp
 )
mlx/backend/common/cpu_impl.cpp  (new file, 48 lines)
@@ -0,0 +1,48 @@
+// Copyright © 2023-2024 Apple Inc.
+
+#include "mlx/backend/common/cpu_impl.h"
+#include "mlx/primitives.h"
+#include "mlx/scheduler.h"
+
+namespace mlx::core::cpu {
+
+std::function<void()> make_task(array arr, bool signal) {
+  return [arr = std::move(arr), signal]() mutable {
+    auto stream = arr.primitive().stream();
+
+    // Wait on inputs coming from different streams/devices.
+    for (auto& input : arr.inputs()) {
+      if (input.event().valid() &&
+          input.event().stream() != arr.primitive().stream()) {
+        input.event().wait();
+      }
+    }
+
+    // Task computation actually starting.
+    scheduler::notify_new_task(stream);
+
+    // Perform the computation
+    auto outputs = arr.outputs();
+    arr.primitive().eval_cpu(arr.inputs(), outputs);
+
+    // Check if we need to detach and signal other arrays waiting for the
+    // result to be ready.
+    if (!arr.is_tracer()) {
+      arr.detach();
+    }
+    if (signal) {
+      arr.event().signal();
+    }
+
+    // Task computation done.
+    scheduler::notify_task_completion(stream);
+  };
+}
+
+std::function<void()> make_synchronize_task(
+    Stream s,
+    std::shared_ptr<std::promise<void>> p) {
+  return [p = std::move(p)]() { p->set_value(); };
+}
+
+} // namespace mlx::core::cpu
mlx/backend/common/cpu_impl.h  (new file, 18 lines)
@@ -0,0 +1,18 @@
+// Copyright © 2024 Apple Inc.
+
+#pragma once
+
+#include <functional>
+#include <future>
+#include <memory>
+
+#include "mlx/array.h"
+
+namespace mlx::core::cpu {
+
+std::function<void()> make_task(array arr, bool signal);
+std::function<void()> make_synchronize_task(
+    Stream s,
+    std::shared_ptr<std::promise<void>> p);
+
+} // namespace mlx::core::cpu
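For the second helper, a sketch of the promise/future pattern it is meant for, mirroring the new synchronize() body shown later in this commit (the surrounding function and the CPU stream `s` are assumed):

// Block the caller until everything already queued on CPU stream `s` has run.
auto p = std::make_shared<std::promise<void>>();
std::future<void> f = p->get_future();
mlx::core::scheduler::enqueue(
    s, mlx::core::cpu::make_synchronize_task(s, std::move(p)));
f.wait();  // the promise is fulfilled once the task runs on the stream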
@@ -8,10 +8,12 @@ struct Device {
   enum class DeviceType {
     cpu,
     gpu,
+    io,
   };
 
   static constexpr DeviceType cpu = DeviceType::cpu;
   static constexpr DeviceType gpu = DeviceType::gpu;
+  static constexpr DeviceType io = DeviceType::io;
 
   Device(DeviceType type, int index = 0) : type(type), index(index) {};
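For illustration, the new device type is used like the existing shorthands; a small sketch (not part of the commit, and the header path is assumed):

#include "mlx/device.h"  // assumed location of Device; not shown in this diff

using mlx::core::Device;

bool is_io(const Device& d) {
  // Device compares against a DeviceType, just as the old synchronize()
  // code compared against Device::cpu.
  return d == Device::io;
}

Device default_io_device() {
  return Device(Device::io);  // index defaults to 0
}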
@@ -106,6 +106,16 @@ std::tuple<array, array, array, int> vmap_ternary_op(
 
 } // namespace
 
+void Primitive::eval_io(
+    const std::vector<array>& inputs,
+    std::vector<array>& outputs) {
+  std::ostringstream msg;
+  msg << "[Primitive::eval_io] Not implemented for ";
+  print(msg);
+  msg << ".";
+  throw std::invalid_argument(msg.str());
+}
+
 std::vector<array> Primitive::jvp(
     const std::vector<array>&,
     const std::vector<array>&,
@@ -73,6 +73,16 @@ class Primitive {
       const std::vector<array>& inputs,
       std::vector<array>& outputs) = 0;
 
+  /**
+   * Some primitives are computed by an IO device (disk, network, camera etc).
+   *
+   * Like in eval_cpu/gpu the eval_io function is responsible for allocating
+   * the space for the array.
+   */
+  virtual void eval_io(
+      const std::vector<array>& inputs,
+      std::vector<array>& outputs);
+
   /**
    * The Jacobian-vector product.
    */
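As a sketch of how a primitive could opt into the new hook (everything below is hypothetical: DiskLoad, fname_, and offset_ are invented for illustration, and the remaining pure-virtual members of Primitive are omitted), eval_io allocates the output's storage itself, just as eval_cpu/eval_gpu do:

#include <fstream>

class DiskLoad : public Primitive {
 public:
  DiskLoad(Stream stream, std::string fname, size_t offset)
      : Primitive(stream), fname_(std::move(fname)), offset_(offset) {}

  void eval_io(const std::vector<array>& inputs, std::vector<array>& outputs)
      override {
    auto& out = outputs[0];
    // Like eval_cpu/eval_gpu, eval_io is responsible for allocating the
    // output buffer before filling it.
    out.set_data(allocator::malloc_or_wait(out.nbytes()));
    std::ifstream f(fname_, std::ios::binary);
    f.seekg(offset_);
    f.read(out.data<char>(), out.nbytes());
  }

  // eval_cpu, eval_gpu, print, etc. omitted for brevity.

 private:
  std::string fname_;
  size_t offset_;
};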
@@ -1,6 +1,7 @@
 // Copyright © 2023 Apple Inc.
 
 #include "mlx/scheduler.h"
+#include "mlx/backend/common/cpu_impl.h"
 #include "mlx/backend/metal/metal.h"
 
 namespace mlx::core {
@@ -36,10 +37,13 @@ Stream new_stream() {
 void synchronize(Stream s) {
   auto p = std::make_shared<std::promise<void>>();
   std::future<void> f = p->get_future();
-  if (s.device == mlx::core::Device::cpu) {
-    scheduler::enqueue(s, [p = std::move(p)]() { p->set_value(); });
-  } else {
-    scheduler::enqueue(s, metal::make_synchronize_task(s, std::move(p)));
+  switch (s.device.type) {
+    case mlx::core::Device::cpu:
+      scheduler::enqueue(s, cpu::make_synchronize_task(s, std::move(p)));
+      break;
+    case mlx::core::Device::gpu:
+      scheduler::enqueue(s, metal::make_synchronize_task(s, std::move(p)));
+      break;
   }
   f.wait();
 }
@@ -8,6 +8,7 @@
 #include <unordered_map>
 #include <unordered_set>
 
+#include "mlx/backend/common/cpu_impl.h"
 #include "mlx/backend/metal/metal_impl.h"
 #include "mlx/ops.h"
 #include "mlx/primitives.h"
@@ -137,32 +138,17 @@ array eval_impl(std::vector<array> outputs, bool async) {
     std::vector<std::shared_future<void>> arr_deps;
     bool signal = needs_signal.find(arr.id()) != needs_signal.end();
 
-    if (arr.primitive().device() == Device::gpu) {
-      if (!metal::is_available()) {
-        throw std::runtime_error("Metal GPU is not available.");
-      }
-      scheduler::enqueue(stream, metal::make_task(std::move(arr), signal));
-    } else {
-      auto task = [arr = std::move(arr), stream, signal]() mutable {
-        for (auto& input : arr.inputs()) {
-          if (input.event().valid() &&
-              input.event().stream() != arr.primitive().stream()) {
-            input.event().wait();
-          }
-        }
-        scheduler::notify_new_task(stream);
-        auto outputs = arr.outputs();
-        arr.primitive().eval_cpu(arr.inputs(), outputs);
-        if (!arr.is_tracer()) {
-          arr.detach();
-        }
-        if (signal) {
-          arr.event().signal();
-        }
-
-        scheduler::notify_task_completion(stream);
-      };
-      scheduler::enqueue(stream, std::move(task));
+    switch (arr.primitive().device().type) {
+      case Device::gpu: {
+        if (!metal::is_available()) {
+          throw std::runtime_error("Metal GPU is not available.");
+        }
+        scheduler::enqueue(stream, metal::make_task(std::move(arr), signal));
+        break;
+      }
+      case Device::cpu:
+        scheduler::enqueue(stream, cpu::make_task(std::move(arr), signal));
+        break;
     }
   }
   return synchronizer;
@@ -133,6 +133,9 @@ std::ostream& operator<<(std::ostream& os, const Device& d) {
     case Device::gpu:
      os << "gpu";
      break;
+    case Device::io:
+      os << "io";
+      break;
   }
   os << ", " << d.index << ")";
   return os;