redesign for faster cpu/gpu synch (#1869)

* redesign for faster cpu/gpu synch * load + more async CPU * use command encoder API and move more ops to use it * make fence back-end generic + CPU only fence * faster build * fix async eval * fixes + handle temporaries * fix / improve cpu conv * remove unused status, fix siblings * fix extensions * fix * fix no cpu build * format * comments * fix perf regression, remove unnecessary abort * fix events, task limit cpu * fix waiting * fix donation / temporaries in normalization
2025-12-16 01:49:05 +08:00 · 2025-03-06 19:23:38 -08:00
parent 5245f12a46
commit c4230747a1
103 changed files with 5013 additions and 3873 deletions
--- a/mlx/backend/metal/scaled_dot_product_attention.cpp
+++ b/mlx/backend/metal/scaled_dot_product_attention.cpp
@@ -152,7 +152,7 @@ void sdpa_vector(
  compute_encoder.set_compute_pipeline_state(kernel);

  // Set its arguments
-  compute_encoder.set_input_array(q.data_shared_ptr() == nullptr ? out : q, 0);
+  compute_encoder.set_input_array(q, 0);
  compute_encoder.set_input_array(k, 1);
  compute_encoder.set_input_array(v, 2);
  compute_encoder.set_output_array(out, 3);
@@ -242,7 +242,7 @@ void sdpa_vector_2pass(
  compute_encoder.set_compute_pipeline_state(kernel);

  // Set its arguments
-  compute_encoder.set_input_array(q.data_shared_ptr() == nullptr ? out : q, 0);
+  compute_encoder.set_input_array(q, 0);
  compute_encoder.set_input_array(k, 1);
  compute_encoder.set_input_array(v, 2);
  compute_encoder.set_output_array(intermediate, 3);
@@ -357,7 +357,7 @@ void ScaledDotProductAttention::eval_gpu(
    // Donate the query if possible
    if (q.is_donatable() && (q.shape(2) == 1 || !q.flags().row_contiguous) &&
        q.size() == o.size()) {
-      o.move_shared_buffer(q);
+      o.copy_shared_buffer(q);
    } else {
      if (o.shape(2) == 1) {
        o.set_data(allocator::malloc_or_wait(o.nbytes()));