cpu fallback: route quantized SDPA to the fallback path when the stream device is not the GPU

2025-12-16 01:49:05 +08:00 · 2024-12-06 01:22:50 -08:00
parent c89ddf62b4
commit 769704653a
2 changed files with 2 additions and 2 deletions
--- a/mlx/fast.cpp
+++ b/mlx/fast.cpp
@@ -792,7 +792,8 @@ array quantized_scaled_dot_product_attention(

  int query_head_dim = queries.shape(-1);
  int L = queries.shape(2);
-  if (L > 1 && query_head_dim != 64 && query_head_dim != 128) {
+  bool compatible_head_dim = query_head_dim == 64 || query_head_dim == 128;
+  if (L > 1 || !compatible_head_dim || stream.device != Device::gpu) {
    if (needs_mask) {
      return fallback(
          {queries,
--- a/python/tests/test_fast_sdpa.py
+++ b/python/tests/test_fast_sdpa.py
@@ -168,7 +168,6 @@ class TestFastSDPA(mlx_tests.MLXTestCase):
                if dtype == mx.float16:
                    rtol = 1e-2

-                # np.testing.assert_allclose(o_q, reference, rtol=rtol, atol=atol)
                self.assertTrue(mx.allclose(o_q, reference, rtol=rtol, atol=atol))
                self.assertTrue(mx.allclose(o, reference, rtol=rtol, atol=atol))