Add more NVTX ranges for debugging

2025-10-31 07:58:14 +08:00 · 2025-08-01 12:49:24 -07:00
parent e74bcdc5e3
commit c28249b81a
1 changed file with 8 additions and 0 deletions
--- a/mlx/backend/cuda/scaled_dot_product_attention.cu
+++ b/mlx/backend/cuda/scaled_dot_product_attention.cu
@@ -67,6 +67,8 @@ std::shared_ptr<fe::graph::Graph> get_sdpa_forward_graph(
    return it->second;
  }

+  nvtx3::scoped_range r("get_sdpa_forward_graph");
+
  // Set up new graph
  auto graph = std::make_shared<fe::graph::Graph>();

@@ -141,6 +143,8 @@ std::shared_ptr<fe::graph::Graph> get_sdpa_forward_graph(

  // cuDNN only supports native CUDA graphs for sdpa in 9.6 or above.
  if (cudnnGetVersion() < 90600) {
+    nvtx3::scoped_range r("get_sdpa_forward_graph::graph_building");
+
    auto build_status = graph->build(handle, {fe::HeurMode_t::A});
    if (!build_status.is_good()) {
      throw std::runtime_error(
@@ -437,6 +441,10 @@ void ScaledDotProductAttention::eval_gpu(
    const auto& k = copy_unless(is_matrix_contiguous, k_pre);
    const auto& v = copy_unless(is_matrix_contiguous, v_pre);

+    for (const auto& cp : copies) {
+      encoder.add_temporary(cp);
+    }
+
    int64_t str_oD = 1;
    int64_t str_oH = o.shape(3);
    int64_t str_oL = o.shape(1) * str_oH;