[CUDA] Save primitive inputs faster (#2449)

* Add more nvtx logging
* [CUDA] Saving primitive inputs faster
* Remove unneeded check
2025-12-16 01:49:05 +08:00 · 2025-08-01 10:16:06 +09:00
parent 86c6a15571
commit b26d88591c
5 changed files with 11 additions and 9 deletions
--- a/mlx/backend/cuda/ternary.cu
+++ b/mlx/backend/cuda/ternary.cu
@@ -192,7 +192,7 @@ void ternary_op_gpu(
 }

 void Select::eval_gpu(const std::vector<array>& inputs, array& out) {
-  nvtx3::scoped_range r("select::eval_gpu");
+  nvtx3::scoped_range r("Select::eval_gpu");
  auto& s = out.primitive().stream();
  ternary_op_gpu<cu::Select>(inputs, out, s);
 }