Add memory_efficient_threshold kwarg to sdpa kernel (#1319)

Allows opting in to the memory-efficient GPU shader at a prescribed sequence length. Otherwise, uses aggregate MLX primitives for best latency.
2025-12-11 23:14:50 +08:00 · 2024-08-12 15:57:09 -04:00
parent 9231617eb3
commit 19fb69e2ed
4 changed files with 13 additions and 4 deletions
--- a/python/src/fast.cpp
+++ b/python/src/fast.cpp
@@ -112,6 +112,7 @@ void init_fast(nb::module_& parent_module) {
      nb::kw_only(),
      "scale"_a,
      "mask"_a = nb::none(),
+      "memory_efficient_threshold"_a = nb::none(),
      "stream"_a = nb::none(),
      nb::sig(
          "def scaled_dot_product_attention(q: array, k: array, v: array, *, scale: float,  mask: Union[None, array] = None, stream: Union[None, Stream, Device] = None) -> array"),
--- a/python/tests/test_fast_sdpa.py
+++ b/python/tests/test_fast_sdpa.py
@@ -86,7 +86,7 @@ class TestFastSelfAttentionSDPA(mlx_tests.MLXTestCase):

                reference = mlx_primitives_sdpa_with_gqa(q_mlx, k_mlx, v_mlx, scale)
                o_mlx = mx.fast.scaled_dot_product_attention(
-                    q_mlx, k_mlx, v_mlx, scale=scale
+                    q_mlx, k_mlx, v_mlx, scale=scale, memory_efficient_threshold=2
                )

                self.assertListEqual(list(reference.shape), list(o_mlx.shape))