From aac2f9fb61937e2d2e1b11f436cd71c456cfceeb Mon Sep 17 00:00:00 2001 From: Awni Hannun Date: Sun, 7 Apr 2024 21:47:43 -0700 Subject: [PATCH] Improve profiling with gpu tracing (#969) * improve profiling with gpu tracing * fix for linux * nit * doc fix * fix example --- docs/src/dev/metal_debugger.rst | 45 +++++++++++++++++++++++---------- docs/src/python/metal.rst | 4 ++- examples/cpp/metal_capture.cpp | 7 ++--- mlx/backend/metal/allocator.cpp | 1 + mlx/backend/metal/device.cpp | 1 + mlx/backend/metal/metal.cpp | 5 ---- mlx/backend/metal/metal.h | 14 +--------- mlx/backend/metal/metal_impl.h | 22 ++++++++++++++++ mlx/backend/metal/utils.h | 3 +++ mlx/backend/no_metal/metal.cpp | 1 + mlx/scheduler.h | 1 + mlx/transforms.cpp | 2 +- python/src/metal.cpp | 21 +++++++++++++++ 13 files changed, 90 insertions(+), 37 deletions(-) create mode 100644 mlx/backend/metal/metal_impl.h diff --git a/docs/src/dev/metal_debugger.rst b/docs/src/dev/metal_debugger.rst index b0d7db9d0..94d25258c 100644 --- a/docs/src/dev/metal_debugger.rst +++ b/docs/src/dev/metal_debugger.rst @@ -1,29 +1,46 @@ Metal Debugger ============== +.. currentmodule:: mlx.core + Profiling is a key step for performance optimization. You can build MLX with -the ``MLX_METAL_DEBUG`` option to improve the Metal debugging and optimization -workflow. The ``MLX_METAL_DEBUG`` debug option: +the ``MLX_METAL_DEBUG`` option to improve the Metal debugging and +optimization workflow. The ``MLX_METAL_DEBUG`` debug option: * Records source during Metal compilation, for later inspection while debugging. * Labels Metal objects such as command queues, improving capture readability. -The ``metal::start_capture`` function initiates a capture of all MLX GPU work. +To build with debugging enabled in Python prepend +``CMAKE_ARGS="-DMLX_METAL_DEBUG=ON"`` to the build call. -.. code-block:: C++ +The :func:`metal.start_capture` function initiates a capture of all MLX GPU +work. - int main() { - metal::start_capture("/Users/Jane/Developer/MLX.gputrace"); +.. note:: - auto a = arange(10.f, 20.f, 1.f, float32); - auto b = arange(30.f, 40.f, 1.f, float32); - auto c = add(a, b); + To capture a GPU trace you must run the application with + ``MTL_CAPTURE_ENABLED=1``. - eval(c); +.. code-block:: python - metal::stop_capture(); - } + import mlx.core as mx + + a = mx.random.uniform(shape=(512, 512)) + b = mx.random.uniform(shape=(512, 512)) + mx.eval(a, b) + + trace_file = "mlx_trace.gputrace" + + if not mx.metal.start_capture(trace_file): + print("Make sure to run with MTL_CAPTURE_ENABLED=1 and " + f"that the path {trace_file} does not already exist.") + exit(1) + + for _ in range(10): + mx.eval(mx.add(a, b)) + + mx.metal.stop_capture() You can open and replay the GPU trace in Xcode. The ``Dependencies`` view has a great overview of all operations. Checkout the `Metal debugger @@ -35,8 +52,8 @@ documentation`_ for more information. Xcode Workflow -------------- -You can skip saving to a path by running within Xcode. First, generate an Xcode -project using CMake. +You can skip saving to a path by running within Xcode. First, generate an +Xcode project using CMake. .. code-block:: diff --git a/docs/src/python/metal.rst b/docs/src/python/metal.rst index c11deb4fa..c92b18936 100644 --- a/docs/src/python/metal.rst +++ b/docs/src/python/metal.rst @@ -3,7 +3,7 @@ Metal .. currentmodule:: mlx.core.metal -.. autosummary:: +.. autosummary:: :toctree: _autosummary is_available @@ -12,3 +12,5 @@ Metal get_cache_memory set_memory_limit set_cache_limit + start_capture + stop_capture diff --git a/examples/cpp/metal_capture.cpp b/examples/cpp/metal_capture.cpp index db5514786..1033b614b 100644 --- a/examples/cpp/metal_capture.cpp +++ b/examples/cpp/metal_capture.cpp @@ -8,9 +8,10 @@ using namespace mlx::core; int main() { - // Enable the MLX_METAL_DEBUG CMake option to enhance the capture with groups, - // labels, etc. - assert(metal::start_capture()); + // To use Metal debugging and profiling: + // 1. Build with the MLX_METAL_DEBUG CMake option (i.e. -DMLX_METAL_DEBUG=ON). + // 2. Run with MTL_CAPTURE_ENABLED=1. + assert(metal::start_capture("mlx_trace.gputrace")); // Start at index two because the default GPU and CPU streams have indices // zero and one, respectively. This naming matches the label assigned to each diff --git a/mlx/backend/metal/allocator.cpp b/mlx/backend/metal/allocator.cpp index 286388003..0e7502744 100644 --- a/mlx/backend/metal/allocator.cpp +++ b/mlx/backend/metal/allocator.cpp @@ -1,6 +1,7 @@ // Copyright © 2023-2024 Apple Inc. #include "mlx/backend/metal/allocator.h" #include "mlx/backend/metal/metal.h" +#include "mlx/backend/metal/metal_impl.h" #include #include diff --git a/mlx/backend/metal/device.cpp b/mlx/backend/metal/device.cpp index 21b2930bc..c814b70b9 100644 --- a/mlx/backend/metal/device.cpp +++ b/mlx/backend/metal/device.cpp @@ -11,6 +11,7 @@ #include "mlx/backend/metal/device.h" #include "mlx/backend/metal/metal.h" +#include "mlx/backend/metal/metal_impl.h" #include "mlx/backend/metal/mps/gemm.h" #include "mlx/backend/metal/utils.h" diff --git a/mlx/backend/metal/metal.cpp b/mlx/backend/metal/metal.cpp index 07cb4e900..f57498036 100644 --- a/mlx/backend/metal/metal.cpp +++ b/mlx/backend/metal/metal.cpp @@ -1,5 +1,4 @@ // Copyright © 2023-2024 Apple Inc. - #include #include #include @@ -16,9 +15,6 @@ bool is_available() { } int max_ops_per_buffer() { -#ifdef MLX_METAL_DEBUG - return 1; -#else auto get_val = []() { if (const char* buff_str = std::getenv("MLX_MAX_OPS_PER_BUFFER")) { return atoi(buff_str); @@ -28,7 +24,6 @@ int max_ops_per_buffer() { }; static int max_ops_per_buffer_ = get_val(); return max_ops_per_buffer_; -#endif } #define MAX_OPS_PER_BUFFER max_ops_per_buffer() diff --git a/mlx/backend/metal/metal.h b/mlx/backend/metal/metal.h index ffbfe0ed0..fd417b3d7 100644 --- a/mlx/backend/metal/metal.h +++ b/mlx/backend/metal/metal.h @@ -2,15 +2,11 @@ #pragma once -#include -#include -#include - #include "mlx/array.h" -#include "mlx/stream.h" namespace mlx::core::metal { +/* Check if the Metal backend is available. */ bool is_available(); /* Get the actively used memory in bytes. @@ -58,14 +54,6 @@ size_t set_memory_limit(size_t limit, bool relaxed = true); * */ size_t set_cache_limit(size_t limit); -void new_stream(Stream stream); -std::shared_ptr new_scoped_memory_pool(); - -std::function make_task( - array& arr, - std::vector> deps, - std::shared_ptr> p); - /** Capture a GPU trace, saving it to an absolute file `path` */ bool start_capture(std::string path = ""); void stop_capture(); diff --git a/mlx/backend/metal/metal_impl.h b/mlx/backend/metal/metal_impl.h new file mode 100644 index 000000000..3487558b8 --- /dev/null +++ b/mlx/backend/metal/metal_impl.h @@ -0,0 +1,22 @@ +// Copyright © 2023-2024 Apple Inc. + +#pragma once + +#include +#include +#include + +#include "mlx/array.h" +#include "mlx/stream.h" + +namespace mlx::core::metal { + +void new_stream(Stream stream); +std::shared_ptr new_scoped_memory_pool(); + +std::function make_task( + array& arr, + std::vector> deps, + std::shared_ptr> p); + +} // namespace mlx::core::metal diff --git a/mlx/backend/metal/utils.h b/mlx/backend/metal/utils.h index a73571914..1b90aa6c8 100644 --- a/mlx/backend/metal/utils.h +++ b/mlx/backend/metal/utils.h @@ -142,6 +142,9 @@ inline void debug_set_primitive_buffer_label( Primitive& primitive) { #ifdef MLX_METAL_DEBUG std::ostringstream label; + if (auto cbuf_label = command_buffer->label(); cbuf_label) { + label << cbuf_label->utf8String(); + } primitive.print(label); command_buffer->setLabel(make_string(label)); #endif diff --git a/mlx/backend/no_metal/metal.cpp b/mlx/backend/no_metal/metal.cpp index 01def113a..fe177a467 100644 --- a/mlx/backend/no_metal/metal.cpp +++ b/mlx/backend/no_metal/metal.cpp @@ -3,6 +3,7 @@ #include #include "mlx/backend/metal/metal.h" +#include "mlx/backend/metal/metal_impl.h" namespace mlx::core::metal { diff --git a/mlx/scheduler.h b/mlx/scheduler.h index 755efeaf1..f50a8c310 100644 --- a/mlx/scheduler.h +++ b/mlx/scheduler.h @@ -9,6 +9,7 @@ #include #include "mlx/backend/metal/metal.h" +#include "mlx/backend/metal/metal_impl.h" #include "mlx/device.h" #include "mlx/stream.h" diff --git a/mlx/transforms.cpp b/mlx/transforms.cpp index e66310ee8..8a07bfd6a 100644 --- a/mlx/transforms.cpp +++ b/mlx/transforms.cpp @@ -7,7 +7,7 @@ #include #include -#include "mlx/backend/metal/metal.h" +#include "mlx/backend/metal/metal_impl.h" #include "mlx/ops.h" #include "mlx/primitives.h" #include "mlx/scheduler.h" diff --git a/python/src/metal.cpp b/python/src/metal.cpp index bec29d3aa..53e14a228 100644 --- a/python/src/metal.cpp +++ b/python/src/metal.cpp @@ -2,6 +2,7 @@ #include "mlx/backend/metal/metal.h" #include +#include namespace nb = nanobind; using namespace nb::literals; @@ -88,4 +89,24 @@ void init_metal(nb::module_& m) { Returns: int: The previous cache limit in bytes. )pbdoc"); + metal.def( + "start_capture", + &metal::start_capture, + "path"_a, + R"pbdoc( + Start a Metal capture. + + Args: + path (str): The path to save the capture which should have + the extension ``.gputrace``. + + Returns: + bool: Whether the capture was successfully started. + )pbdoc"); + metal.def( + "stop_capture", + &metal::stop_capture, + R"pbdoc( + Stop a Metal capture. + )pbdoc"); }