Improve profiling with gpu tracing (#969)

* improve profiling with gpu tracing

* fix for linux

* nit

* doc fix

* fix example
This commit is contained in:
Awni Hannun 2024-04-07 21:47:43 -07:00 committed by GitHub
parent bddf23f175
commit aac2f9fb61
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
13 changed files with 90 additions and 37 deletions

View File

@ -1,29 +1,46 @@
Metal Debugger Metal Debugger
============== ==============
.. currentmodule:: mlx.core
Profiling is a key step for performance optimization. You can build MLX with Profiling is a key step for performance optimization. You can build MLX with
the ``MLX_METAL_DEBUG`` option to improve the Metal debugging and optimization the ``MLX_METAL_DEBUG`` option to improve the Metal debugging and
workflow. The ``MLX_METAL_DEBUG`` debug option: optimization workflow. The ``MLX_METAL_DEBUG`` debug option:
* Records source during Metal compilation, for later inspection while * Records source during Metal compilation, for later inspection while
debugging. debugging.
* Labels Metal objects such as command queues, improving capture readability. * Labels Metal objects such as command queues, improving capture readability.
The ``metal::start_capture`` function initiates a capture of all MLX GPU work. To build with debugging enabled in Python prepend
``CMAKE_ARGS="-DMLX_METAL_DEBUG=ON"`` to the build call.
.. code-block:: C++ The :func:`metal.start_capture` function initiates a capture of all MLX GPU
work.
int main() { .. note::
metal::start_capture("/Users/Jane/Developer/MLX.gputrace");
auto a = arange(10.f, 20.f, 1.f, float32); To capture a GPU trace you must run the application with
auto b = arange(30.f, 40.f, 1.f, float32); ``MTL_CAPTURE_ENABLED=1``.
auto c = add(a, b);
eval(c); .. code-block:: python
metal::stop_capture(); import mlx.core as mx
}
a = mx.random.uniform(shape=(512, 512))
b = mx.random.uniform(shape=(512, 512))
mx.eval(a, b)
trace_file = "mlx_trace.gputrace"
if not mx.metal.start_capture(trace_file):
print("Make sure to run with MTL_CAPTURE_ENABLED=1 and "
f"that the path {trace_file} does not already exist.")
exit(1)
for _ in range(10):
mx.eval(mx.add(a, b))
mx.metal.stop_capture()
You can open and replay the GPU trace in Xcode. The ``Dependencies`` view You can open and replay the GPU trace in Xcode. The ``Dependencies`` view
has a great overview of all operations. Checkout the `Metal debugger has a great overview of all operations. Checkout the `Metal debugger
@ -35,8 +52,8 @@ documentation`_ for more information.
Xcode Workflow Xcode Workflow
-------------- --------------
You can skip saving to a path by running within Xcode. First, generate an Xcode You can skip saving to a path by running within Xcode. First, generate an
project using CMake. Xcode project using CMake.
.. code-block:: .. code-block::

View File

@ -12,3 +12,5 @@ Metal
get_cache_memory get_cache_memory
set_memory_limit set_memory_limit
set_cache_limit set_cache_limit
start_capture
stop_capture

View File

@ -8,9 +8,10 @@
using namespace mlx::core; using namespace mlx::core;
int main() { int main() {
// Enable the MLX_METAL_DEBUG CMake option to enhance the capture with groups, // To use Metal debugging and profiling:
// labels, etc. // 1. Build with the MLX_METAL_DEBUG CMake option (i.e. -DMLX_METAL_DEBUG=ON).
assert(metal::start_capture()); // 2. Run with MTL_CAPTURE_ENABLED=1.
assert(metal::start_capture("mlx_trace.gputrace"));
// Start at index two because the default GPU and CPU streams have indices // Start at index two because the default GPU and CPU streams have indices
// zero and one, respectively. This naming matches the label assigned to each // zero and one, respectively. This naming matches the label assigned to each

View File

@ -1,6 +1,7 @@
// Copyright © 2023-2024 Apple Inc. // Copyright © 2023-2024 Apple Inc.
#include "mlx/backend/metal/allocator.h" #include "mlx/backend/metal/allocator.h"
#include "mlx/backend/metal/metal.h" #include "mlx/backend/metal/metal.h"
#include "mlx/backend/metal/metal_impl.h"
#include <mach/vm_page_size.h> #include <mach/vm_page_size.h>
#include <unistd.h> #include <unistd.h>

View File

@ -11,6 +11,7 @@
#include "mlx/backend/metal/device.h" #include "mlx/backend/metal/device.h"
#include "mlx/backend/metal/metal.h" #include "mlx/backend/metal/metal.h"
#include "mlx/backend/metal/metal_impl.h"
#include "mlx/backend/metal/mps/gemm.h" #include "mlx/backend/metal/mps/gemm.h"
#include "mlx/backend/metal/utils.h" #include "mlx/backend/metal/utils.h"

View File

@ -1,5 +1,4 @@
// Copyright © 2023-2024 Apple Inc. // Copyright © 2023-2024 Apple Inc.
#include <cstdlib> #include <cstdlib>
#include <future> #include <future>
#include <memory> #include <memory>
@ -16,9 +15,6 @@ bool is_available() {
} }
int max_ops_per_buffer() { int max_ops_per_buffer() {
#ifdef MLX_METAL_DEBUG
return 1;
#else
auto get_val = []() { auto get_val = []() {
if (const char* buff_str = std::getenv("MLX_MAX_OPS_PER_BUFFER")) { if (const char* buff_str = std::getenv("MLX_MAX_OPS_PER_BUFFER")) {
return atoi(buff_str); return atoi(buff_str);
@ -28,7 +24,6 @@ int max_ops_per_buffer() {
}; };
static int max_ops_per_buffer_ = get_val(); static int max_ops_per_buffer_ = get_val();
return max_ops_per_buffer_; return max_ops_per_buffer_;
#endif
} }
#define MAX_OPS_PER_BUFFER max_ops_per_buffer() #define MAX_OPS_PER_BUFFER max_ops_per_buffer()

View File

@ -2,15 +2,11 @@
#pragma once #pragma once
#include <future>
#include <memory>
#include <vector>
#include "mlx/array.h" #include "mlx/array.h"
#include "mlx/stream.h"
namespace mlx::core::metal { namespace mlx::core::metal {
/* Check if the Metal backend is available. */
bool is_available(); bool is_available();
/* Get the actively used memory in bytes. /* Get the actively used memory in bytes.
@ -58,14 +54,6 @@ size_t set_memory_limit(size_t limit, bool relaxed = true);
* */ * */
size_t set_cache_limit(size_t limit); size_t set_cache_limit(size_t limit);
void new_stream(Stream stream);
std::shared_ptr<void> new_scoped_memory_pool();
std::function<void()> make_task(
array& arr,
std::vector<std::shared_future<void>> deps,
std::shared_ptr<std::promise<void>> p);
/** Capture a GPU trace, saving it to an absolute file `path` */ /** Capture a GPU trace, saving it to an absolute file `path` */
bool start_capture(std::string path = ""); bool start_capture(std::string path = "");
void stop_capture(); void stop_capture();

View File

@ -0,0 +1,22 @@
// Copyright © 2023-2024 Apple Inc.
#pragma once
#include <future>
#include <memory>
#include <vector>
#include "mlx/array.h"
#include "mlx/stream.h"
namespace mlx::core::metal {
void new_stream(Stream stream);
std::shared_ptr<void> new_scoped_memory_pool();
std::function<void()> make_task(
array& arr,
std::vector<std::shared_future<void>> deps,
std::shared_ptr<std::promise<void>> p);
} // namespace mlx::core::metal

View File

@ -142,6 +142,9 @@ inline void debug_set_primitive_buffer_label(
Primitive& primitive) { Primitive& primitive) {
#ifdef MLX_METAL_DEBUG #ifdef MLX_METAL_DEBUG
std::ostringstream label; std::ostringstream label;
if (auto cbuf_label = command_buffer->label(); cbuf_label) {
label << cbuf_label->utf8String();
}
primitive.print(label); primitive.print(label);
command_buffer->setLabel(make_string(label)); command_buffer->setLabel(make_string(label));
#endif #endif

View File

@ -3,6 +3,7 @@
#include <stdexcept> #include <stdexcept>
#include "mlx/backend/metal/metal.h" #include "mlx/backend/metal/metal.h"
#include "mlx/backend/metal/metal_impl.h"
namespace mlx::core::metal { namespace mlx::core::metal {

View File

@ -9,6 +9,7 @@
#include <unordered_map> #include <unordered_map>
#include "mlx/backend/metal/metal.h" #include "mlx/backend/metal/metal.h"
#include "mlx/backend/metal/metal_impl.h"
#include "mlx/device.h" #include "mlx/device.h"
#include "mlx/stream.h" #include "mlx/stream.h"

View File

@ -7,7 +7,7 @@
#include <unordered_map> #include <unordered_map>
#include <unordered_set> #include <unordered_set>
#include "mlx/backend/metal/metal.h" #include "mlx/backend/metal/metal_impl.h"
#include "mlx/ops.h" #include "mlx/ops.h"
#include "mlx/primitives.h" #include "mlx/primitives.h"
#include "mlx/scheduler.h" #include "mlx/scheduler.h"

View File

@ -2,6 +2,7 @@
#include "mlx/backend/metal/metal.h" #include "mlx/backend/metal/metal.h"
#include <nanobind/nanobind.h> #include <nanobind/nanobind.h>
#include <nanobind/stl/string.h>
namespace nb = nanobind; namespace nb = nanobind;
using namespace nb::literals; using namespace nb::literals;
@ -88,4 +89,24 @@ void init_metal(nb::module_& m) {
Returns: Returns:
int: The previous cache limit in bytes. int: The previous cache limit in bytes.
)pbdoc"); )pbdoc");
metal.def(
"start_capture",
&metal::start_capture,
"path"_a,
R"pbdoc(
Start a Metal capture.
Args:
path (str): The path to save the capture which should have
the extension ``.gputrace``.
Returns:
bool: Whether the capture was successfully started.
)pbdoc");
metal.def(
"stop_capture",
&metal::stop_capture,
R"pbdoc(
Stop a Metal capture.
)pbdoc");
} }