mirror of
https://github.com/ml-explore/mlx.git
synced 2025-09-27 08:18:30 +08:00
fix metal scan (#2591)
This commit is contained in:
@@ -306,6 +306,7 @@ template <
|
|||||||
U prev_thread = op.simd_exclusive_scan(values[N_READS - 1]);
|
U prev_thread = op.simd_exclusive_scan(values[N_READS - 1]);
|
||||||
|
|
||||||
// Write simdgroup_sums to SM
|
// Write simdgroup_sums to SM
|
||||||
|
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||||
if (simd_lane_id == simd_size - 1) {
|
if (simd_lane_id == simd_size - 1) {
|
||||||
simdgroup_sums[simd_group_id] = op(prev_thread, values[N_READS - 1]);
|
simdgroup_sums[simd_group_id] = op(prev_thread, values[N_READS - 1]);
|
||||||
}
|
}
|
||||||
@@ -440,6 +441,7 @@ template <
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Read in SM
|
// Read in SM
|
||||||
|
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||||
if (check_index_y < axis_size && (read_offset_x + N_READS) < stride_limit) {
|
if (check_index_y < axis_size && (read_offset_x + N_READS) < stride_limit) {
|
||||||
for (int i = 0; i < N_READS; i++) {
|
for (int i = 0; i < N_READS; i++) {
|
||||||
read_into[i] = in[index_y * stride + i];
|
read_into[i] = in[index_y * stride + i];
|
||||||
|
@@ -36,14 +36,6 @@ void Scan::eval_gpu(const std::vector<array>& inputs, array& out) {
|
|||||||
|
|
||||||
bool contiguous = in.strides()[axis_] == 1;
|
bool contiguous = in.strides()[axis_] == 1;
|
||||||
|
|
||||||
std::ostringstream kname;
|
|
||||||
kname << (contiguous ? "contig_" : "strided_");
|
|
||||||
kname << "scan_";
|
|
||||||
if (reverse_) {
|
|
||||||
kname << "reverse_";
|
|
||||||
}
|
|
||||||
kname << ((inclusive_) ? "inclusive_" : "exclusive_");
|
|
||||||
|
|
||||||
std::string reduce_type;
|
std::string reduce_type;
|
||||||
switch (reduce_type_) {
|
switch (reduce_type_) {
|
||||||
case Scan::Sum:
|
case Scan::Sum:
|
||||||
@@ -62,9 +54,22 @@ void Scan::eval_gpu(const std::vector<array>& inputs, array& out) {
|
|||||||
reduce_type = "logaddexp";
|
reduce_type = "logaddexp";
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
kname << reduce_type << "_" << type_to_name(in) << "_" << type_to_name(out);
|
|
||||||
auto kernel = get_scan_kernel(
|
std::string kname;
|
||||||
d, kname.str(), reverse_, inclusive_, reduce_type, in, out);
|
concatenate(
|
||||||
|
kname,
|
||||||
|
contiguous ? "contig_" : "strided_",
|
||||||
|
"scan_",
|
||||||
|
reverse_ ? "reverse_" : "",
|
||||||
|
(inclusive_) ? "inclusive_" : "exclusive_",
|
||||||
|
reduce_type,
|
||||||
|
"_",
|
||||||
|
type_to_name(in),
|
||||||
|
"_",
|
||||||
|
type_to_name(out));
|
||||||
|
|
||||||
|
auto kernel =
|
||||||
|
get_scan_kernel(d, kname, reverse_, inclusive_, reduce_type, in, out);
|
||||||
|
|
||||||
if (contiguous) {
|
if (contiguous) {
|
||||||
auto& compute_encoder = d.get_command_encoder(s.index);
|
auto& compute_encoder = d.get_command_encoder(s.index);
|
||||||
|
Reference in New Issue
Block a user