mirror of
https://github.com/ml-explore/mlx.git
synced 2025-12-16 01:49:05 +08:00
Refactor reductions and fix scatter atomics for large sizes (#1300)
Co-authored-by: Angelos Katharopoulos <a_katharopoulos@apple.com>
This commit is contained in:
@@ -5,6 +5,20 @@
|
||||
#include <metal_atomic>
|
||||
#include <metal_simdgroup>
|
||||
|
||||
// DEFINE_SIMD_REDUCE(): expands to two overloads of a member template
// `simd_reduce(T val)`, selected on sizeof(T) through metal::enable_if_t:
//   * sizeof(T) < 8  -> forwards to `simd_reduce_impl(val)`, which the struct
//     expanding this macro has to provide.
//   * sizeof(T) == 8 -> reduces by hand: for i = simd_size / 2, halving until
//     it reaches 0, combines `val` with simd_shuffle_down(val, i) through the
//     enclosing struct's own `operator()`, then returns `val`.
// NOTE(review): presumably the 8-byte path exists because the built-in simd
// reductions are not available for 64-bit types - confirm against the Metal
// Shading Language specification.
#define DEFINE_SIMD_REDUCE() \
|
||||
template <typename T, metal::enable_if_t<sizeof(T) < 8, bool> = true> \
|
||||
T simd_reduce(T val) { \
|
||||
return simd_reduce_impl(val); \
|
||||
} \
|
||||
\
|
||||
template <typename T, metal::enable_if_t<sizeof(T) == 8, bool> = true> \
|
||||
T simd_reduce(T val) { \
|
||||
for (short i = simd_size / 2; i > 0; i /= 2) { \
|
||||
val = operator()(val, simd_shuffle_down(val, i)); \
|
||||
} \
|
||||
return val; \
|
||||
}
|
||||
|
||||
// Compile-time constant 32. DEFINE_SIMD_REDUCE starts its shuffle-down loop
// at simd_size / 2.
// NOTE(review): presumably the number of threads per SIMD-group - confirm.
static constant constexpr const uint8_t simd_size = 32;
|
||||
|
||||
union bool4_or_uint {
|
||||
@@ -14,14 +28,16 @@ union bool4_or_uint {
|
||||
|
||||
// None: its only member here, atomic_update, forwards (out, val, offset) to
// mlx_atomic_store_explicit (defined outside this view).
// NOTE(review): this is a rendered diff, so two signature lines appear back to
// back. The `uint offset` line looks like the removed side and the
// `size_t offset` line the added side - only one belongs in the real file.
struct None {
|
||||
template <typename T>
|
||||
void atomic_update(device mlx_atomic<T>* out, T val, uint offset = 0) {
|
||||
void atomic_update(device mlx_atomic<T>* out, T val, size_t offset = 0) {
|
||||
mlx_atomic_store_explicit(out, val, offset);
|
||||
}
|
||||
};
|
||||
|
||||
template <typename U = bool>
|
||||
struct And {
|
||||
// And: reduction across the SIMD-group. simd_reduce_impl returns
// simd_all(val); DEFINE_SIMD_REDUCE() supplies the public simd_reduce
// overloads that call it.
// NOTE(review): rendered diff - the `bool simd_reduce(bool val) {` line looks
// like the removed side, replaced by the macro invocation plus
// `simd_reduce_impl`. Confirm against the upstream file.
bool simd_reduce(bool val) {
|
||||
DEFINE_SIMD_REDUCE()
|
||||
|
||||
bool simd_reduce_impl(bool val) {
|
||||
return simd_all(val);
|
||||
}
|
||||
|
||||
@@ -31,7 +47,7 @@ struct And {
|
||||
device mlx_atomic<unsigned int>* out,
|
||||
bool val,
|
||||
int elem_idx,
|
||||
int offset = 0) {
|
||||
size_t offset = 0) {
|
||||
if (!val) {
|
||||
bool4_or_uint update;
|
||||
update.b = {true, true, true, true};
|
||||
@@ -40,7 +56,8 @@ struct And {
|
||||
}
|
||||
}
|
||||
|
||||
void atomic_update(device mlx_atomic<bool>* out, bool val, uint offset = 0) {
|
||||
void
|
||||
atomic_update(device mlx_atomic<bool>* out, bool val, size_t offset = 0) {
|
||||
if (!val) {
|
||||
mlx_atomic_store_explicit(out, val, offset);
|
||||
}
|
||||
@@ -59,7 +76,9 @@ struct And {
|
||||
|
||||
template <typename U = bool>
|
||||
struct Or {
|
||||
// Or: reduction across the SIMD-group. simd_reduce_impl returns
// simd_any(val); DEFINE_SIMD_REDUCE() supplies the public simd_reduce
// overloads that call it.
// NOTE(review): rendered diff - the `bool simd_reduce(bool val) {` line looks
// like the removed side, replaced by the macro invocation plus
// `simd_reduce_impl`. Confirm against the upstream file.
bool simd_reduce(bool val) {
|
||||
DEFINE_SIMD_REDUCE()
|
||||
|
||||
bool simd_reduce_impl(bool val) {
|
||||
return simd_any(val);
|
||||
}
|
||||
|
||||
@@ -68,8 +87,8 @@ struct Or {
|
||||
void atomic_update(
|
||||
device mlx_atomic<unsigned int>* out,
|
||||
bool val,
|
||||
uint elem_idx,
|
||||
uint offset = 0) {
|
||||
int elem_idx,
|
||||
size_t offset = 0) {
|
||||
if (val) {
|
||||
bool4_or_uint update;
|
||||
update.b = {false, false, false, false};
|
||||
@@ -78,7 +97,8 @@ struct Or {
|
||||
}
|
||||
}
|
||||
|
||||
void atomic_update(device mlx_atomic<bool>* out, bool val, uint offset = 0) {
|
||||
void
|
||||
atomic_update(device mlx_atomic<bool>* out, bool val, size_t offset = 0) {
|
||||
if (val) {
|
||||
mlx_atomic_store_explicit(out, val, offset);
|
||||
}
|
||||
@@ -97,15 +117,17 @@ struct Or {
|
||||
|
||||
template <typename U>
|
||||
struct Sum {
|
||||
DEFINE_SIMD_REDUCE()
|
||||
|
||||
// Sum: SIMD-group reduction helper. Returns simd_sum(val).
// NOTE(review): rendered diff - the `T simd_reduce(T val) {` line looks like
// the removed side and `T simd_reduce_impl(T val) {` its renamed replacement.
// Only one signature belongs in the real file.
template <typename T>
|
||||
T simd_reduce(T val) {
|
||||
T simd_reduce_impl(T val) {
|
||||
return simd_sum(val);
|
||||
}
|
||||
|
||||
static constexpr constant U init = U(0);
|
||||
|
||||
// Sum: forwards (out, val, offset) to mlx_atomic_fetch_add_explicit (defined
// outside this view).
// NOTE(review): rendered diff - the `uint offset` signature looks like the
// removed side and the `size_t offset` signature the added side. Only one
// belongs in the real file.
template <typename T>
|
||||
void atomic_update(device mlx_atomic<T>* out, T val, uint offset = 0) {
|
||||
void atomic_update(device mlx_atomic<T>* out, T val, size_t offset = 0) {
|
||||
mlx_atomic_fetch_add_explicit(out, val, offset);
|
||||
}
|
||||
|
||||
@@ -117,15 +139,17 @@ struct Sum {
|
||||
|
||||
template <typename U>
|
||||
struct Prod {
|
||||
DEFINE_SIMD_REDUCE()
|
||||
|
||||
// Prod: SIMD-group reduction helper. Returns simd_product(val).
// NOTE(review): rendered diff - the `T simd_reduce(T val) {` line looks like
// the removed side and `T simd_reduce_impl(T val) {` its renamed replacement.
// Only one signature belongs in the real file.
template <typename T>
|
||||
T simd_reduce(T val) {
|
||||
T simd_reduce_impl(T val) {
|
||||
return simd_product(val);
|
||||
}
|
||||
|
||||
static constexpr constant U init = U(1);
|
||||
|
||||
// Prod: forwards (out, val, offset) to mlx_atomic_fetch_mul_explicit (defined
// outside this view).
// NOTE(review): rendered diff - the `uint offset` signature looks like the
// removed side and the `size_t offset` signature the added side. Only one
// belongs in the real file.
template <typename T>
|
||||
void atomic_update(device mlx_atomic<T>* out, T val, uint offset = 0) {
|
||||
void atomic_update(device mlx_atomic<T>* out, T val, size_t offset = 0) {
|
||||
mlx_atomic_fetch_mul_explicit(out, val, offset);
|
||||
}
|
||||
|
||||
@@ -137,15 +161,17 @@ struct Prod {
|
||||
|
||||
template <typename U>
|
||||
struct Min {
|
||||
DEFINE_SIMD_REDUCE()
|
||||
|
||||
// Min: SIMD-group reduction helper. Returns simd_min(val).
// NOTE(review): rendered diff - the `T simd_reduce(T val) {` line looks like
// the removed side and `T simd_reduce_impl(T val) {` its renamed replacement.
// Only one signature belongs in the real file.
template <typename T>
|
||||
T simd_reduce(T val) {
|
||||
T simd_reduce_impl(T val) {
|
||||
return simd_min(val);
|
||||
}
|
||||
|
||||
static constexpr constant U init = Limits<U>::max;
|
||||
|
||||
// Min: forwards (out, val, offset) to mlx_atomic_fetch_min_explicit (defined
// outside this view).
// NOTE(review): rendered diff - the `uint offset` signature looks like the
// removed side and the `size_t offset` signature the added side. Only one
// belongs in the real file.
template <typename T>
|
||||
void atomic_update(device mlx_atomic<T>* out, T val, uint offset = 0) {
|
||||
void atomic_update(device mlx_atomic<T>* out, T val, size_t offset = 0) {
|
||||
mlx_atomic_fetch_min_explicit(out, val, offset);
|
||||
}
|
||||
|
||||
@@ -157,15 +183,17 @@ struct Min {
|
||||
|
||||
template <typename U>
|
||||
struct Max {
|
||||
DEFINE_SIMD_REDUCE()
|
||||
|
||||
// Max: SIMD-group reduction helper. Returns simd_max(val).
// NOTE(review): rendered diff - the `T simd_reduce(T val) {` line looks like
// the removed side and `T simd_reduce_impl(T val) {` its renamed replacement.
// Only one signature belongs in the real file.
template <typename T>
|
||||
T simd_reduce(T val) {
|
||||
T simd_reduce_impl(T val) {
|
||||
return simd_max(val);
|
||||
}
|
||||
|
||||
static constexpr constant U init = Limits<U>::min;
|
||||
|
||||
// Max: forwards (out, val, offset) to mlx_atomic_fetch_max_explicit (defined
// outside this view).
// NOTE(review): rendered diff - the `uint offset` signature looks like the
// removed side and the `size_t offset` signature the added side. Only one
// belongs in the real file.
template <typename T>
|
||||
void atomic_update(device mlx_atomic<T>* out, T val, uint offset = 0) {
|
||||
void atomic_update(device mlx_atomic<T>* out, T val, size_t offset = 0) {
|
||||
mlx_atomic_fetch_max_explicit(out, val, offset);
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user