[CUDA] Reduce use of managed memory (#2725)

* Use async cuda malloc managed with cuda 13
* add pool threshold
* refactor for regular cuda malloc
* load eval gpu for cuda
* remove use of cuda pool, use cuda free async
* fix
* fix
* fix
* fix
* fix + comment
2025-12-16 01:49:05 +08:00 · 2025-11-05 16:05:23 -08:00
parent 27778156dc
commit df58b4133a
79 changed files with 795 additions and 515 deletions
--- a/mlx/backend/cuda/rms_norm.cu
+++ b/mlx/backend/cuda/rms_norm.cu
@@ -176,9 +176,10 @@ void RMSNorm::eval_gpu(
  nvtx3::scoped_range r("RMSNorm::eval_gpu");
  auto& s = stream();
  auto& out = outputs[0];
+  auto& encoder = cu::get_command_encoder(s);

  // Make sure that the last dimension is contiguous.
-  auto set_output = [&s, &out](const array& x) {
+  auto set_output = [&s, &out, &encoder](const array& x) {
    bool no_copy = x.flags().contiguous && x.strides()[x.ndim() - 1] == 1;
    if (no_copy && x.ndim() > 1) {
      auto s = x.strides()[x.ndim() - 2];
@@ -189,7 +190,7 @@ void RMSNorm::eval_gpu(
        out.copy_shared_buffer(x);
      } else {
        out.set_data(
-            allocator::malloc(x.data_size() * x.itemsize()),
+            cu::malloc_async(x.data_size() * x.itemsize(), encoder.stream()),
            x.data_size(),
            x.strides(),
            x.flags());
@@ -209,7 +210,6 @@ void RMSNorm::eval_gpu(
  int32_t n_rows = x.data_size() / axis_size;
  int64_t w_stride = (w.ndim() == 1) ? w.strides()[0] : 0;

-  auto& encoder = cu::get_command_encoder(s);
  encoder.set_input_array(x);
  encoder.set_input_array(w);
  encoder.set_output_array(out);
@@ -223,9 +223,9 @@ void RMSNorm::eval_gpu(
          n_rows,
          block_dim(),
          0,
-          x.data<DataType>(),
-          w.data<DataType>(),
-          out.data<DataType>(),
+          gpu_ptr<DataType>(x),
+          gpu_ptr<DataType>(w),
+          gpu_ptr<DataType>(out),
          eps_,
          axis_size,
          w_stride);
@@ -274,7 +274,7 @@ void RMSNormVJP::eval_gpu(
    gx.copy_shared_buffer(g);
    g_in_gx = true;
  } else {
-    gx.set_data(allocator::malloc(gx.nbytes()));
+    gx.set_data(cu::malloc_async(gx.nbytes(), encoder.stream()));
  }
  if (g_copied && !g_in_gx) {
    encoder.add_temporary(g);
@@ -292,7 +292,7 @@ void RMSNormVJP::eval_gpu(
    if (!g_in_gx && donate_g) {
      gw_temp.copy_shared_buffer(g);
    } else {
-      gw_temp.set_data(allocator::malloc(gw_temp.nbytes()));
+      gw_temp.set_data(cu::malloc_async(gw_temp.nbytes(), encoder.stream()));
      encoder.add_temporary(gw_temp);
    }
  }
@@ -318,11 +318,11 @@ void RMSNormVJP::eval_gpu(
                n_rows,
                block_dim(),
                0,
-                x.data<DataType>(),
-                w.data<DataType>(),
-                g.data<DataType>(),
-                gx.data<DataType>(),
-                gw_temp.data<DataType>(),
+                gpu_ptr<DataType>(x),
+                gpu_ptr<DataType>(w),
+                gpu_ptr<DataType>(g),
+                gpu_ptr<DataType>(gx),
+                gpu_ptr<DataType>(gw_temp),
                eps_,
                axis_size,
                w_stride);