Use async CUDA managed-memory allocation with CUDA 13

2025-12-16 01:49:05 +08:00 · 2025-10-26 16:17:27 -07:00
parent 74c1ed25bb
commit 764b4b7ce8
19 changed files with 110 additions and 54 deletions
--- a/mlx/backend/cuda/arange.cu
+++ b/mlx/backend/cuda/arange.cu
@@ -41,9 +41,8 @@ void Arange::eval_gpu(const std::vector<array>& inputs, array& out) {
  if (out.size() == 0) {
    return;
  }
-  out.set_data(allocator::malloc(out.nbytes()));
-
  auto& encoder = cu::get_command_encoder(stream());
+  out.set_data(cu::malloc_async(out.nbytes(), encoder.stream()));
  encoder.set_output_array(out);

  dispatch_int_float_types(out.dtype(), "Arange", [&](auto type_tag) {