From aac2f9fb61937e2d2e1b11f436cd71c456cfceeb Mon Sep 17 00:00:00 2001
From: Awni Hannun <awni@apple.com>
Date: Sun, 7 Apr 2024 21:47:43 -0700
Subject: [PATCH] Improve profiling with gpu tracing (#969)

* improve profiling with gpu tracing

* fix for linux

* nit

* doc fix

* fix example
---
 docs/src/dev/metal_debugger.rst | 45 +++++++++++++++++++++++----------
 docs/src/python/metal.rst       |  4 ++-
 examples/cpp/metal_capture.cpp  |  7 ++---
 mlx/backend/metal/allocator.cpp |  1 +
 mlx/backend/metal/device.cpp    |  1 +
 mlx/backend/metal/metal.cpp     |  5 ----
 mlx/backend/metal/metal.h       | 14 +---------
 mlx/backend/metal/metal_impl.h  | 22 ++++++++++++++++
 mlx/backend/metal/utils.h       |  3 +++
 mlx/backend/no_metal/metal.cpp  |  1 +
 mlx/scheduler.h                 |  1 +
 mlx/transforms.cpp              |  2 +-
 python/src/metal.cpp            | 21 +++++++++++++++
 13 files changed, 90 insertions(+), 37 deletions(-)
 create mode 100644 mlx/backend/metal/metal_impl.h

diff --git a/docs/src/dev/metal_debugger.rst b/docs/src/dev/metal_debugger.rst
index b0d7db9d0..94d25258c 100644
--- a/docs/src/dev/metal_debugger.rst
+++ b/docs/src/dev/metal_debugger.rst
@@ -1,29 +1,46 @@
 Metal Debugger
 ==============
 
+.. currentmodule:: mlx.core
+
 Profiling is a key step for performance optimization. You can build MLX with
-the ``MLX_METAL_DEBUG`` option to improve the Metal debugging and optimization
-workflow. The ``MLX_METAL_DEBUG`` debug option:
+the ``MLX_METAL_DEBUG`` option to improve the Metal debugging and
+optimization workflow. The ``MLX_METAL_DEBUG`` debug option:
 
 * Records source during Metal compilation, for later inspection while
   debugging.
 * Labels Metal objects such as command queues, improving capture readability.
 
-The ``metal::start_capture`` function initiates a capture of all MLX GPU work.
+To build with debugging enabled in Python prepend
+``CMAKE_ARGS="-DMLX_METAL_DEBUG=ON"`` to the build call.
 
-.. code-block:: C++
+The :func:`metal.start_capture` function initiates a capture of all MLX GPU
+work.
 
-    int main() {
-        metal::start_capture("/Users/Jane/Developer/MLX.gputrace");
+.. note::
 
-        auto a = arange(10.f, 20.f, 1.f, float32);
-        auto b = arange(30.f, 40.f, 1.f, float32);
-        auto c = add(a, b);
+   To capture a GPU trace you must run the application with
+   ``MTL_CAPTURE_ENABLED=1``.
 
-        eval(c);
+.. code-block:: python
 
-        metal::stop_capture();
-    }
+    import mlx.core as mx
+
+    a = mx.random.uniform(shape=(512, 512))
+    b = mx.random.uniform(shape=(512, 512))
+    mx.eval(a, b)
+
+    trace_file = "mlx_trace.gputrace"
+
+    if not mx.metal.start_capture(trace_file):
+      print("Make sure to run with MTL_CAPTURE_ENABLED=1 and "
+            f"that the path {trace_file} does not already exist.")
+      exit(1)
+
+    for _ in range(10):
+      mx.eval(mx.add(a, b))
+
+    mx.metal.stop_capture()
 
 You can open and replay the GPU trace in Xcode. The ``Dependencies`` view
 has a great overview of all operations. Checkout the `Metal debugger
@@ -35,8 +52,8 @@ documentation`_ for more information.
 Xcode Workflow
 --------------
 
-You can skip saving to a path by running within Xcode. First, generate an Xcode
-project using CMake.
+You can skip saving to a path by running within Xcode. First, generate an
+Xcode project using CMake.
 
 .. code-block::
 
diff --git a/docs/src/python/metal.rst b/docs/src/python/metal.rst
index c11deb4fa..c92b18936 100644
--- a/docs/src/python/metal.rst
+++ b/docs/src/python/metal.rst
@@ -3,7 +3,7 @@ Metal
 
 .. currentmodule:: mlx.core.metal
 
-.. autosummary:: 
+.. autosummary::
   :toctree: _autosummary
 
   is_available
@@ -12,3 +12,5 @@ Metal
   get_cache_memory
   set_memory_limit
   set_cache_limit
+  start_capture
+  stop_capture
diff --git a/examples/cpp/metal_capture.cpp b/examples/cpp/metal_capture.cpp
index db5514786..1033b614b 100644
--- a/examples/cpp/metal_capture.cpp
+++ b/examples/cpp/metal_capture.cpp
@@ -8,9 +8,10 @@
 using namespace mlx::core;
 
 int main() {
-  // Enable the MLX_METAL_DEBUG CMake option to enhance the capture with groups,
-  // labels, etc.
-  assert(metal::start_capture());
+  // To use Metal debugging and profiling:
+  // 1. Build with the MLX_METAL_DEBUG CMake option (i.e. -DMLX_METAL_DEBUG=ON).
+  // 2. Run with MTL_CAPTURE_ENABLED=1.
+  assert(metal::start_capture("mlx_trace.gputrace"));
 
   // Start at index two because the default GPU and CPU streams have indices
   // zero and one, respectively. This naming matches the label assigned to each
diff --git a/mlx/backend/metal/allocator.cpp b/mlx/backend/metal/allocator.cpp
index 286388003..0e7502744 100644
--- a/mlx/backend/metal/allocator.cpp
+++ b/mlx/backend/metal/allocator.cpp
@@ -1,6 +1,7 @@
 // Copyright © 2023-2024 Apple Inc.
 #include "mlx/backend/metal/allocator.h"
 #include "mlx/backend/metal/metal.h"
+#include "mlx/backend/metal/metal_impl.h"
 
 #include <mach/vm_page_size.h>
 #include <unistd.h>
diff --git a/mlx/backend/metal/device.cpp b/mlx/backend/metal/device.cpp
index 21b2930bc..c814b70b9 100644
--- a/mlx/backend/metal/device.cpp
+++ b/mlx/backend/metal/device.cpp
@@ -11,6 +11,7 @@
 
 #include "mlx/backend/metal/device.h"
 #include "mlx/backend/metal/metal.h"
+#include "mlx/backend/metal/metal_impl.h"
 #include "mlx/backend/metal/mps/gemm.h"
 #include "mlx/backend/metal/utils.h"
 
diff --git a/mlx/backend/metal/metal.cpp b/mlx/backend/metal/metal.cpp
index 07cb4e900..f57498036 100644
--- a/mlx/backend/metal/metal.cpp
+++ b/mlx/backend/metal/metal.cpp
@@ -1,5 +1,4 @@
 // Copyright © 2023-2024 Apple Inc.
-
 #include <cstdlib>
 #include <future>
 #include <memory>
@@ -16,9 +15,6 @@ bool is_available() {
 }
 
 int max_ops_per_buffer() {
-#ifdef MLX_METAL_DEBUG
-  return 1;
-#else
   auto get_val = []() {
     if (const char* buff_str = std::getenv("MLX_MAX_OPS_PER_BUFFER")) {
       return atoi(buff_str);
@@ -28,7 +24,6 @@ int max_ops_per_buffer() {
   };
   static int max_ops_per_buffer_ = get_val();
   return max_ops_per_buffer_;
-#endif
 }
 
 #define MAX_OPS_PER_BUFFER max_ops_per_buffer()
diff --git a/mlx/backend/metal/metal.h b/mlx/backend/metal/metal.h
index ffbfe0ed0..fd417b3d7 100644
--- a/mlx/backend/metal/metal.h
+++ b/mlx/backend/metal/metal.h
@@ -2,15 +2,11 @@
 
 #pragma once
 
-#include <future>
-#include <memory>
-#include <vector>
-
 #include "mlx/array.h"
-#include "mlx/stream.h"
 
 namespace mlx::core::metal {
 
+/* Check if the Metal backend is available. */
 bool is_available();
 
 /* Get the actively used memory in bytes.
@@ -58,14 +54,6 @@ size_t set_memory_limit(size_t limit, bool relaxed = true);
  * */
 size_t set_cache_limit(size_t limit);
 
-void new_stream(Stream stream);
-std::shared_ptr<void> new_scoped_memory_pool();
-
-std::function<void()> make_task(
-    array& arr,
-    std::vector<std::shared_future<void>> deps,
-    std::shared_ptr<std::promise<void>> p);
-
 /** Capture a GPU trace, saving it to an absolute file `path` */
 bool start_capture(std::string path = "");
 void stop_capture();
diff --git a/mlx/backend/metal/metal_impl.h b/mlx/backend/metal/metal_impl.h
new file mode 100644
index 000000000..3487558b8
--- /dev/null
+++ b/mlx/backend/metal/metal_impl.h
@@ -0,0 +1,22 @@
+// Copyright © 2023-2024 Apple Inc.
+
+#pragma once
+
+#include <future>
+#include <memory>
+#include <vector>
+
+#include "mlx/array.h"
+#include "mlx/stream.h"
+
+namespace mlx::core::metal {
+
+void new_stream(Stream stream);
+std::shared_ptr<void> new_scoped_memory_pool();
+
+std::function<void()> make_task(
+    array& arr,
+    std::vector<std::shared_future<void>> deps,
+    std::shared_ptr<std::promise<void>> p);
+
+} // namespace mlx::core::metal
diff --git a/mlx/backend/metal/utils.h b/mlx/backend/metal/utils.h
index a73571914..1b90aa6c8 100644
--- a/mlx/backend/metal/utils.h
+++ b/mlx/backend/metal/utils.h
@@ -142,6 +142,9 @@ inline void debug_set_primitive_buffer_label(
     Primitive& primitive) {
 #ifdef MLX_METAL_DEBUG
   std::ostringstream label;
+  if (auto cbuf_label = command_buffer->label(); cbuf_label) {
+    label << cbuf_label->utf8String();
+  }
   primitive.print(label);
   command_buffer->setLabel(make_string(label));
 #endif
diff --git a/mlx/backend/no_metal/metal.cpp b/mlx/backend/no_metal/metal.cpp
index 01def113a..fe177a467 100644
--- a/mlx/backend/no_metal/metal.cpp
+++ b/mlx/backend/no_metal/metal.cpp
@@ -3,6 +3,7 @@
 #include <stdexcept>
 
 #include "mlx/backend/metal/metal.h"
+#include "mlx/backend/metal/metal_impl.h"
 
 namespace mlx::core::metal {
 
diff --git a/mlx/scheduler.h b/mlx/scheduler.h
index 755efeaf1..f50a8c310 100644
--- a/mlx/scheduler.h
+++ b/mlx/scheduler.h
@@ -9,6 +9,7 @@
 #include <unordered_map>
 
 #include "mlx/backend/metal/metal.h"
+#include "mlx/backend/metal/metal_impl.h"
 #include "mlx/device.h"
 #include "mlx/stream.h"
 
diff --git a/mlx/transforms.cpp b/mlx/transforms.cpp
index e66310ee8..8a07bfd6a 100644
--- a/mlx/transforms.cpp
+++ b/mlx/transforms.cpp
@@ -7,7 +7,7 @@
 #include <unordered_map>
 #include <unordered_set>
 
-#include "mlx/backend/metal/metal.h"
+#include "mlx/backend/metal/metal_impl.h"
 #include "mlx/ops.h"
 #include "mlx/primitives.h"
 #include "mlx/scheduler.h"
diff --git a/python/src/metal.cpp b/python/src/metal.cpp
index bec29d3aa..53e14a228 100644
--- a/python/src/metal.cpp
+++ b/python/src/metal.cpp
@@ -2,6 +2,7 @@
 
 #include "mlx/backend/metal/metal.h"
 #include <nanobind/nanobind.h>
+#include <nanobind/stl/string.h>
 
 namespace nb = nanobind;
 using namespace nb::literals;
@@ -88,4 +89,24 @@ void init_metal(nb::module_& m) {
       Returns:
         int: The previous cache limit in bytes.
       )pbdoc");
+  metal.def(
+      "start_capture",
+      &metal::start_capture,
+      "path"_a,
+      R"pbdoc(
+      Start a Metal capture.
+
+      Args:
+        path (str): The path to save the capture which should have
+          the extension ``.gputrace``.
+
+      Returns:
+        bool: Whether the capture was successfully started.
+      )pbdoc");
+  metal.def(
+      "stop_capture",
+      &metal::stop_capture,
+      R"pbdoc(
+      Stop a Metal capture.
+      )pbdoc");
 }