Fast RMS Norm (#862)

* fast rmsnorm * no rms gpu * kernel * fix shared mem * looped rms and donation in softmax * Make the squaring in float32 to avoid underflow * Fix the default StreamOrDevice for rope and rms_norm in fast * nits --------- Co-authored-by: Angelos Katharopoulos <a_katharopoulos@apple.com>
2025-12-16 01:49:05 +08:00 · 2024-03-21 07:20:54 -07:00
parent 4650d94d98
commit a54f06b16f
17 changed files with 493 additions and 41 deletions
--- a/mlx/backend/metal/softmax.cpp
+++ b/mlx/backend/metal/softmax.cpp
@@ -37,11 +37,15 @@ void Softmax::eval_gpu(const std::vector<array>& inputs, array& out) {
    }
  };
  const array& in = check_input(inputs[0]);
-  out.set_data(
-      allocator::malloc_or_wait(in.data_size() * in.itemsize()),
-      in.data_size(),
-      in.strides(),
-      in.flags());
+  if (in.is_donatable()) {
+    out.move_shared_buffer(in);
+  } else {
+    out.set_data(
+        allocator::malloc_or_wait(in.data_size() * in.itemsize()),
+        in.data_size(),
+        in.strides(),
+        in.flags());
+  }

  int axis_size = in.shape().back();
  int n_rows = in.data_size() / axis_size;
@@ -75,6 +79,8 @@ void Softmax::eval_gpu(const std::vector<array>& inputs, array& out) {
    }

    compute_encoder->setComputePipelineState(kernel);
+    set_array_buffer(
+        compute_encoder, in.data_shared_ptr() == nullptr ? out : in, 0);
    set_array_buffer(compute_encoder, in, 0);
    set_array_buffer(compute_encoder, out, 1);
    compute_encoder->setBytes(&axis_size, sizeof(int), 2);