Refactor reductions and fix scatter atomics for large sizes (#1300)

Co-authored-by: Angelos Katharopoulos <a_katharopoulos@apple.com>
2025-12-16 01:49:05 +08:00 · 2024-08-22 16:03:31 -07:00
parent f9e00efe31
commit 98b6ce3460
18 changed files with 1584 additions and 1235 deletions
--- a/mlx/backend/metal/kernels/reduction/ops.h
+++ b/mlx/backend/metal/kernels/reduction/ops.h
@@ -5,6 +5,20 @@
 #include <metal_atomic>
 #include <metal_simdgroup>

+#define DEFINE_SIMD_REDUCE()                                             \
+  template <typename T, metal::enable_if_t<sizeof(T) < 8, bool> = true>  \
+  T simd_reduce(T val) {                                                 \
+    return simd_reduce_impl(val);                                        \
+  }                                                                      \
+                                                                         \
+  template <typename T, metal::enable_if_t<sizeof(T) == 8, bool> = true> \
+  T simd_reduce(T val) {                                                 \
+    for (short i = simd_size / 2; i > 0; i /= 2) {                       \
+      val = operator()(val, simd_shuffle_down(val, i));                  \
+    }                                                                    \
+    return val;                                                          \
+  }
+
 static constant constexpr const uint8_t simd_size = 32;

 union bool4_or_uint {
@@ -14,14 +28,16 @@ union bool4_or_uint {

 struct None {
  template <typename T>
-  void atomic_update(device mlx_atomic<T>* out, T val, uint offset = 0) {
+  void atomic_update(device mlx_atomic<T>* out, T val, size_t offset = 0) {
    mlx_atomic_store_explicit(out, val, offset);
  }
 };

 template <typename U = bool>
 struct And {
-  bool simd_reduce(bool val) {
+  DEFINE_SIMD_REDUCE()
+
+  bool simd_reduce_impl(bool val) {
    return simd_all(val);
  }

@@ -31,7 +47,7 @@ struct And {
      device mlx_atomic<unsigned int>* out,
      bool val,
      int elem_idx,
-      int offset = 0) {
+      size_t offset = 0) {
    if (!val) {
      bool4_or_uint update;
      update.b = {true, true, true, true};
@@ -40,7 +56,8 @@ struct And {
    }
  }

-  void atomic_update(device mlx_atomic<bool>* out, bool val, uint offset = 0) {
+  void
+  atomic_update(device mlx_atomic<bool>* out, bool val, size_t offset = 0) {
    if (!val) {
      mlx_atomic_store_explicit(out, val, offset);
    }
@@ -59,7 +76,9 @@ struct And {

 template <typename U = bool>
 struct Or {
-  bool simd_reduce(bool val) {
+  DEFINE_SIMD_REDUCE()
+
+  bool simd_reduce_impl(bool val) {
    return simd_any(val);
  }

@@ -68,8 +87,8 @@ struct Or {
  void atomic_update(
      device mlx_atomic<unsigned int>* out,
      bool val,
-      uint elem_idx,
-      uint offset = 0) {
+      int elem_idx,
+      size_t offset = 0) {
    if (val) {
      bool4_or_uint update;
      update.b = {false, false, false, false};
@@ -78,7 +97,8 @@ struct Or {
    }
  }

-  void atomic_update(device mlx_atomic<bool>* out, bool val, uint offset = 0) {
+  void
+  atomic_update(device mlx_atomic<bool>* out, bool val, size_t offset = 0) {
    if (val) {
      mlx_atomic_store_explicit(out, val, offset);
    }
@@ -97,15 +117,17 @@ struct Or {

 template <typename U>
 struct Sum {
+  DEFINE_SIMD_REDUCE()
+
  template <typename T>
-  T simd_reduce(T val) {
+  T simd_reduce_impl(T val) {
    return simd_sum(val);
  }

  static constexpr constant U init = U(0);

  template <typename T>
-  void atomic_update(device mlx_atomic<T>* out, T val, uint offset = 0) {
+  void atomic_update(device mlx_atomic<T>* out, T val, size_t offset = 0) {
    mlx_atomic_fetch_add_explicit(out, val, offset);
  }

@@ -117,15 +139,17 @@ struct Sum {

 template <typename U>
 struct Prod {
+  DEFINE_SIMD_REDUCE()
+
  template <typename T>
-  T simd_reduce(T val) {
+  T simd_reduce_impl(T val) {
    return simd_product(val);
  }

  static constexpr constant U init = U(1);

  template <typename T>
-  void atomic_update(device mlx_atomic<T>* out, T val, uint offset = 0) {
+  void atomic_update(device mlx_atomic<T>* out, T val, size_t offset = 0) {
    mlx_atomic_fetch_mul_explicit(out, val, offset);
  }

@@ -137,15 +161,17 @@ struct Prod {

 template <typename U>
 struct Min {
+  DEFINE_SIMD_REDUCE()
+
  template <typename T>
-  T simd_reduce(T val) {
+  T simd_reduce_impl(T val) {
    return simd_min(val);
  }

  static constexpr constant U init = Limits<U>::max;

  template <typename T>
-  void atomic_update(device mlx_atomic<T>* out, T val, uint offset = 0) {
+  void atomic_update(device mlx_atomic<T>* out, T val, size_t offset = 0) {
    mlx_atomic_fetch_min_explicit(out, val, offset);
  }

@@ -157,15 +183,17 @@ struct Min {

 template <typename U>
 struct Max {
+  DEFINE_SIMD_REDUCE()
+
  template <typename T>
-  T simd_reduce(T val) {
+  T simd_reduce_impl(T val) {
    return simd_max(val);
  }

  static constexpr constant U init = Limits<U>::min;

  template <typename T>
-  void atomic_update(device mlx_atomic<T>* out, T val, uint offset = 0) {
+  void atomic_update(device mlx_atomic<T>* out, T val, size_t offset = 0) {
    mlx_atomic_fetch_max_explicit(out, val, offset);
  }