Add more NVTX logging

This commit is contained in:
Cheng
2025-07-31 03:35:41 -07:00
parent d32519c8ee
commit 5140cbe4ac
4 changed files with 6 additions and 1 deletion

View File

@@ -269,6 +269,7 @@ void CommandEncoder::add_graph_node(cudaGraph_t child) {
} }
void CommandEncoder::commit() { void CommandEncoder::commit() {
nvtx3::scoped_range r("CommandEncoder::commit");
if (!temporaries_.empty()) { if (!temporaries_.empty()) {
add_completed_handler([temporaries = std::move(temporaries_)]() {}); add_completed_handler([temporaries = std::move(temporaries_)]() {});
} }

View File

@@ -5,6 +5,8 @@
#include "mlx/backend/gpu/copy.h" #include "mlx/backend/gpu/copy.h"
#include "mlx/fast_primitives.h" #include "mlx/fast_primitives.h"
#include <nvtx3/nvtx3.hpp>
namespace mlx::core { namespace mlx::core {
namespace { namespace {
@@ -42,6 +44,7 @@ inline array ensure_row_contiguous_matrix(
void fast::AffineQuantize::eval_gpu( void fast::AffineQuantize::eval_gpu(
const std::vector<array>& inputs, const std::vector<array>& inputs,
std::vector<array>& outputs) { std::vector<array>& outputs) {
nvtx3::scoped_range r("AffineQuantize::eval_gpu");
auto& s = stream(); auto& s = stream();
auto& d = cu::device(s.device); auto& d = cu::device(s.device);
auto& enc = d.get_command_encoder(s); auto& enc = d.get_command_encoder(s);

View File

@@ -192,7 +192,7 @@ void ternary_op_gpu(
} }
void Select::eval_gpu(const std::vector<array>& inputs, array& out) { void Select::eval_gpu(const std::vector<array>& inputs, array& out) {
nvtx3::scoped_range r("select::eval_gpu"); nvtx3::scoped_range r("Select::eval_gpu");
auto& s = out.primitive().stream(); auto& s = out.primitive().stream();
ternary_op_gpu<cu::Select>(inputs, out, s); ternary_op_gpu<cu::Select>(inputs, out, s);
} }

View File

@@ -133,6 +133,7 @@ void NumberOfElements::eval_gpu(const std::vector<array>& inputs, array& out) {
} }
void Pad::eval_gpu(const std::vector<array>& inputs, array& out) { void Pad::eval_gpu(const std::vector<array>& inputs, array& out) {
MLX_PROFILER_RANGE("Pad::eval_gpu");
// Inputs must be base input array and scalar val array // Inputs must be base input array and scalar val array
assert(inputs.size() == 2); assert(inputs.size() == 2);
auto& in = inputs[0]; auto& in = inputs[0];