mirror of
https://github.com/ml-explore/mlx.git
synced 2025-06-24 17:31:16 +08:00
Improve profiling with gpu tracing (#969)
* improve profiling with gpu tracing * fix for linux * nit * doc fix * fix example
This commit is contained in:
parent
bddf23f175
commit
aac2f9fb61
@ -1,29 +1,46 @@
|
|||||||
Metal Debugger
|
Metal Debugger
|
||||||
==============
|
==============
|
||||||
|
|
||||||
|
.. currentmodule:: mlx.core
|
||||||
|
|
||||||
Profiling is a key step for performance optimization. You can build MLX with
|
Profiling is a key step for performance optimization. You can build MLX with
|
||||||
the ``MLX_METAL_DEBUG`` option to improve the Metal debugging and optimization
|
the ``MLX_METAL_DEBUG`` option to improve the Metal debugging and
|
||||||
workflow. The ``MLX_METAL_DEBUG`` debug option:
|
optimization workflow. The ``MLX_METAL_DEBUG`` debug option:
|
||||||
|
|
||||||
* Records source during Metal compilation, for later inspection while
|
* Records source during Metal compilation, for later inspection while
|
||||||
debugging.
|
debugging.
|
||||||
* Labels Metal objects such as command queues, improving capture readability.
|
* Labels Metal objects such as command queues, improving capture readability.
|
||||||
|
|
||||||
The ``metal::start_capture`` function initiates a capture of all MLX GPU work.
|
To build with debugging enabled in Python prepend
|
||||||
|
``CMAKE_ARGS="-DMLX_METAL_DEBUG=ON"`` to the build call.
|
||||||
|
|
||||||
.. code-block:: C++
|
The :func:`metal.start_capture` function initiates a capture of all MLX GPU
|
||||||
|
work.
|
||||||
|
|
||||||
int main() {
|
.. note::
|
||||||
metal::start_capture("/Users/Jane/Developer/MLX.gputrace");
|
|
||||||
|
|
||||||
auto a = arange(10.f, 20.f, 1.f, float32);
|
To capture a GPU trace you must run the application with
|
||||||
auto b = arange(30.f, 40.f, 1.f, float32);
|
``MTL_CAPTURE_ENABLED=1``.
|
||||||
auto c = add(a, b);
|
|
||||||
|
|
||||||
eval(c);
|
.. code-block:: python
|
||||||
|
|
||||||
metal::stop_capture();
|
import mlx.core as mx
|
||||||
}
|
|
||||||
|
a = mx.random.uniform(shape=(512, 512))
|
||||||
|
b = mx.random.uniform(shape=(512, 512))
|
||||||
|
mx.eval(a, b)
|
||||||
|
|
||||||
|
trace_file = "mlx_trace.gputrace"
|
||||||
|
|
||||||
|
if not mx.metal.start_capture(trace_file):
|
||||||
|
print("Make sure to run with MTL_CAPTURE_ENABLED=1 and "
|
||||||
|
f"that the path {trace_file} does not already exist.")
|
||||||
|
exit(1)
|
||||||
|
|
||||||
|
for _ in range(10):
|
||||||
|
mx.eval(mx.add(a, b))
|
||||||
|
|
||||||
|
mx.metal.stop_capture()
|
||||||
|
|
||||||
You can open and replay the GPU trace in Xcode. The ``Dependencies`` view
|
You can open and replay the GPU trace in Xcode. The ``Dependencies`` view
|
||||||
has a great overview of all operations. Checkout the `Metal debugger
|
has a great overview of all operations. Checkout the `Metal debugger
|
||||||
@ -35,8 +52,8 @@ documentation`_ for more information.
|
|||||||
Xcode Workflow
|
Xcode Workflow
|
||||||
--------------
|
--------------
|
||||||
|
|
||||||
You can skip saving to a path by running within Xcode. First, generate an Xcode
|
You can skip saving to a path by running within Xcode. First, generate an
|
||||||
project using CMake.
|
Xcode project using CMake.
|
||||||
|
|
||||||
.. code-block::
|
.. code-block::
|
||||||
|
|
||||||
|
@ -3,7 +3,7 @@ Metal
|
|||||||
|
|
||||||
.. currentmodule:: mlx.core.metal
|
.. currentmodule:: mlx.core.metal
|
||||||
|
|
||||||
.. autosummary::
|
.. autosummary::
|
||||||
:toctree: _autosummary
|
:toctree: _autosummary
|
||||||
|
|
||||||
is_available
|
is_available
|
||||||
@ -12,3 +12,5 @@ Metal
|
|||||||
get_cache_memory
|
get_cache_memory
|
||||||
set_memory_limit
|
set_memory_limit
|
||||||
set_cache_limit
|
set_cache_limit
|
||||||
|
start_capture
|
||||||
|
stop_capture
|
||||||
|
@ -8,9 +8,10 @@
|
|||||||
using namespace mlx::core;
|
using namespace mlx::core;
|
||||||
|
|
||||||
int main() {
|
int main() {
|
||||||
// Enable the MLX_METAL_DEBUG CMake option to enhance the capture with groups,
|
// To use Metal debugging and profiling:
|
||||||
// labels, etc.
|
// 1. Build with the MLX_METAL_DEBUG CMake option (i.e. -DMLX_METAL_DEBUG=ON).
|
||||||
assert(metal::start_capture());
|
// 2. Run with MTL_CAPTURE_ENABLED=1.
|
||||||
|
assert(metal::start_capture("mlx_trace.gputrace"));
|
||||||
|
|
||||||
// Start at index two because the default GPU and CPU streams have indices
|
// Start at index two because the default GPU and CPU streams have indices
|
||||||
// zero and one, respectively. This naming matches the label assigned to each
|
// zero and one, respectively. This naming matches the label assigned to each
|
||||||
|
@ -1,6 +1,7 @@
|
|||||||
// Copyright © 2023-2024 Apple Inc.
|
// Copyright © 2023-2024 Apple Inc.
|
||||||
#include "mlx/backend/metal/allocator.h"
|
#include "mlx/backend/metal/allocator.h"
|
||||||
#include "mlx/backend/metal/metal.h"
|
#include "mlx/backend/metal/metal.h"
|
||||||
|
#include "mlx/backend/metal/metal_impl.h"
|
||||||
|
|
||||||
#include <mach/vm_page_size.h>
|
#include <mach/vm_page_size.h>
|
||||||
#include <unistd.h>
|
#include <unistd.h>
|
||||||
|
@ -11,6 +11,7 @@
|
|||||||
|
|
||||||
#include "mlx/backend/metal/device.h"
|
#include "mlx/backend/metal/device.h"
|
||||||
#include "mlx/backend/metal/metal.h"
|
#include "mlx/backend/metal/metal.h"
|
||||||
|
#include "mlx/backend/metal/metal_impl.h"
|
||||||
#include "mlx/backend/metal/mps/gemm.h"
|
#include "mlx/backend/metal/mps/gemm.h"
|
||||||
#include "mlx/backend/metal/utils.h"
|
#include "mlx/backend/metal/utils.h"
|
||||||
|
|
||||||
|
@ -1,5 +1,4 @@
|
|||||||
// Copyright © 2023-2024 Apple Inc.
|
// Copyright © 2023-2024 Apple Inc.
|
||||||
|
|
||||||
#include <cstdlib>
|
#include <cstdlib>
|
||||||
#include <future>
|
#include <future>
|
||||||
#include <memory>
|
#include <memory>
|
||||||
@ -16,9 +15,6 @@ bool is_available() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
int max_ops_per_buffer() {
|
int max_ops_per_buffer() {
|
||||||
#ifdef MLX_METAL_DEBUG
|
|
||||||
return 1;
|
|
||||||
#else
|
|
||||||
auto get_val = []() {
|
auto get_val = []() {
|
||||||
if (const char* buff_str = std::getenv("MLX_MAX_OPS_PER_BUFFER")) {
|
if (const char* buff_str = std::getenv("MLX_MAX_OPS_PER_BUFFER")) {
|
||||||
return atoi(buff_str);
|
return atoi(buff_str);
|
||||||
@ -28,7 +24,6 @@ int max_ops_per_buffer() {
|
|||||||
};
|
};
|
||||||
static int max_ops_per_buffer_ = get_val();
|
static int max_ops_per_buffer_ = get_val();
|
||||||
return max_ops_per_buffer_;
|
return max_ops_per_buffer_;
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#define MAX_OPS_PER_BUFFER max_ops_per_buffer()
|
#define MAX_OPS_PER_BUFFER max_ops_per_buffer()
|
||||||
|
@ -2,15 +2,11 @@
|
|||||||
|
|
||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
#include <future>
|
|
||||||
#include <memory>
|
|
||||||
#include <vector>
|
|
||||||
|
|
||||||
#include "mlx/array.h"
|
#include "mlx/array.h"
|
||||||
#include "mlx/stream.h"
|
|
||||||
|
|
||||||
namespace mlx::core::metal {
|
namespace mlx::core::metal {
|
||||||
|
|
||||||
|
/* Check if the Metal backend is available. */
|
||||||
bool is_available();
|
bool is_available();
|
||||||
|
|
||||||
/* Get the actively used memory in bytes.
|
/* Get the actively used memory in bytes.
|
||||||
@ -58,14 +54,6 @@ size_t set_memory_limit(size_t limit, bool relaxed = true);
|
|||||||
* */
|
* */
|
||||||
size_t set_cache_limit(size_t limit);
|
size_t set_cache_limit(size_t limit);
|
||||||
|
|
||||||
void new_stream(Stream stream);
|
|
||||||
std::shared_ptr<void> new_scoped_memory_pool();
|
|
||||||
|
|
||||||
std::function<void()> make_task(
|
|
||||||
array& arr,
|
|
||||||
std::vector<std::shared_future<void>> deps,
|
|
||||||
std::shared_ptr<std::promise<void>> p);
|
|
||||||
|
|
||||||
/** Capture a GPU trace, saving it to an absolute file `path` */
|
/** Capture a GPU trace, saving it to an absolute file `path` */
|
||||||
bool start_capture(std::string path = "");
|
bool start_capture(std::string path = "");
|
||||||
void stop_capture();
|
void stop_capture();
|
||||||
|
22
mlx/backend/metal/metal_impl.h
Normal file
22
mlx/backend/metal/metal_impl.h
Normal file
@ -0,0 +1,22 @@
|
|||||||
|
// Copyright © 2023-2024 Apple Inc.
|
||||||
|
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <future>
|
||||||
|
#include <memory>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
#include "mlx/array.h"
|
||||||
|
#include "mlx/stream.h"
|
||||||
|
|
||||||
|
namespace mlx::core::metal {
|
||||||
|
|
||||||
|
void new_stream(Stream stream);
|
||||||
|
std::shared_ptr<void> new_scoped_memory_pool();
|
||||||
|
|
||||||
|
std::function<void()> make_task(
|
||||||
|
array& arr,
|
||||||
|
std::vector<std::shared_future<void>> deps,
|
||||||
|
std::shared_ptr<std::promise<void>> p);
|
||||||
|
|
||||||
|
} // namespace mlx::core::metal
|
@ -142,6 +142,9 @@ inline void debug_set_primitive_buffer_label(
|
|||||||
Primitive& primitive) {
|
Primitive& primitive) {
|
||||||
#ifdef MLX_METAL_DEBUG
|
#ifdef MLX_METAL_DEBUG
|
||||||
std::ostringstream label;
|
std::ostringstream label;
|
||||||
|
if (auto cbuf_label = command_buffer->label(); cbuf_label) {
|
||||||
|
label << cbuf_label->utf8String();
|
||||||
|
}
|
||||||
primitive.print(label);
|
primitive.print(label);
|
||||||
command_buffer->setLabel(make_string(label));
|
command_buffer->setLabel(make_string(label));
|
||||||
#endif
|
#endif
|
||||||
|
@ -3,6 +3,7 @@
|
|||||||
#include <stdexcept>
|
#include <stdexcept>
|
||||||
|
|
||||||
#include "mlx/backend/metal/metal.h"
|
#include "mlx/backend/metal/metal.h"
|
||||||
|
#include "mlx/backend/metal/metal_impl.h"
|
||||||
|
|
||||||
namespace mlx::core::metal {
|
namespace mlx::core::metal {
|
||||||
|
|
||||||
|
@ -9,6 +9,7 @@
|
|||||||
#include <unordered_map>
|
#include <unordered_map>
|
||||||
|
|
||||||
#include "mlx/backend/metal/metal.h"
|
#include "mlx/backend/metal/metal.h"
|
||||||
|
#include "mlx/backend/metal/metal_impl.h"
|
||||||
#include "mlx/device.h"
|
#include "mlx/device.h"
|
||||||
#include "mlx/stream.h"
|
#include "mlx/stream.h"
|
||||||
|
|
||||||
|
@ -7,7 +7,7 @@
|
|||||||
#include <unordered_map>
|
#include <unordered_map>
|
||||||
#include <unordered_set>
|
#include <unordered_set>
|
||||||
|
|
||||||
#include "mlx/backend/metal/metal.h"
|
#include "mlx/backend/metal/metal_impl.h"
|
||||||
#include "mlx/ops.h"
|
#include "mlx/ops.h"
|
||||||
#include "mlx/primitives.h"
|
#include "mlx/primitives.h"
|
||||||
#include "mlx/scheduler.h"
|
#include "mlx/scheduler.h"
|
||||||
|
@ -2,6 +2,7 @@
|
|||||||
|
|
||||||
#include "mlx/backend/metal/metal.h"
|
#include "mlx/backend/metal/metal.h"
|
||||||
#include <nanobind/nanobind.h>
|
#include <nanobind/nanobind.h>
|
||||||
|
#include <nanobind/stl/string.h>
|
||||||
|
|
||||||
namespace nb = nanobind;
|
namespace nb = nanobind;
|
||||||
using namespace nb::literals;
|
using namespace nb::literals;
|
||||||
@ -88,4 +89,24 @@ void init_metal(nb::module_& m) {
|
|||||||
Returns:
|
Returns:
|
||||||
int: The previous cache limit in bytes.
|
int: The previous cache limit in bytes.
|
||||||
)pbdoc");
|
)pbdoc");
|
||||||
|
metal.def(
|
||||||
|
"start_capture",
|
||||||
|
&metal::start_capture,
|
||||||
|
"path"_a,
|
||||||
|
R"pbdoc(
|
||||||
|
Start a Metal capture.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
path (str): The path to save the capture which should have
|
||||||
|
the extension ``.gputrace``.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
bool: Whether the capture was successfully started.
|
||||||
|
)pbdoc");
|
||||||
|
metal.def(
|
||||||
|
"stop_capture",
|
||||||
|
&metal::stop_capture,
|
||||||
|
R"pbdoc(
|
||||||
|
Stop a Metal capture.
|
||||||
|
)pbdoc");
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user