Fix Metal scan (#2591)

This commit is contained in:
Awni Hannun
2025-09-15 11:01:57 -07:00
committed by GitHub
parent 36cad99a11
commit 6ccfa603cd
2 changed files with 18 additions and 11 deletions

View File

@@ -306,6 +306,7 @@ template <
U prev_thread = op.simd_exclusive_scan(values[N_READS - 1]);
// Write simdgroup_sums to SM
threadgroup_barrier(mem_flags::mem_threadgroup);
if (simd_lane_id == simd_size - 1) {
simdgroup_sums[simd_group_id] = op(prev_thread, values[N_READS - 1]);
}
@@ -440,6 +441,7 @@ template <
}
// Read in SM
threadgroup_barrier(mem_flags::mem_threadgroup);
if (check_index_y < axis_size && (read_offset_x + N_READS) < stride_limit) {
for (int i = 0; i < N_READS; i++) {
read_into[i] = in[index_y * stride + i];

View File

@@ -36,14 +36,6 @@ void Scan::eval_gpu(const std::vector<array>& inputs, array& out) {
bool contiguous = in.strides()[axis_] == 1;
std::ostringstream kname;
kname << (contiguous ? "contig_" : "strided_");
kname << "scan_";
if (reverse_) {
kname << "reverse_";
}
kname << ((inclusive_) ? "inclusive_" : "exclusive_");
std::string reduce_type;
switch (reduce_type_) {
case Scan::Sum:
@@ -62,9 +54,22 @@ void Scan::eval_gpu(const std::vector<array>& inputs, array& out) {
reduce_type = "logaddexp";
break;
}
kname << reduce_type << "_" << type_to_name(in) << "_" << type_to_name(out);
auto kernel = get_scan_kernel(
d, kname.str(), reverse_, inclusive_, reduce_type, in, out);
std::string kname;
concatenate(
kname,
contiguous ? "contig_" : "strided_",
"scan_",
reverse_ ? "reverse_" : "",
(inclusive_) ? "inclusive_" : "exclusive_",
reduce_type,
"_",
type_to_name(in),
"_",
type_to_name(out));
auto kernel =
get_scan_kernel(d, kname, reverse_, inclusive_, reduce_type, in, out);
if (contiguous) {
auto& compute_encoder = d.get_command_encoder(s.index);