|
template<typename T , typename U , typename Op , int N_READS = REDUCE_N_READS, int N_WRITES = REDUCE_N_WRITES> |
METAL_FUNC void | per_thread_row_reduce (thread U totals[N_WRITES], const device T *inputs[N_WRITES], int blocks, int extra, uint lsize_x, uint lid_x) |
| The threadgroup collaboratively reduces across the rows with bounds checking.
|
|
template<typename T , typename U , typename Op , int N_READS = REDUCE_N_READS, int N_WRITES = REDUCE_N_WRITES> |
METAL_FUNC void | per_thread_row_reduce (thread U totals[N_WRITES], const device T *in, const constant size_t &reduction_size, int blocks, int extra, uint lsize_x, uint lid_x) |
| Per-thread row reduction over consecutive rows in a contiguous array.
|
|
template<typename T , typename U , typename Op , int N_READS = REDUCE_N_READS, int N_WRITES = REDUCE_N_WRITES> |
METAL_FUNC void | per_thread_row_reduce (thread U totals[N_WRITES], const device T *in, const size_t row_idx, int blocks, int extra, const constant int *shape, const constant size_t *strides, const constant int &ndim, uint lsize_x, uint lid_x) |
| Per-thread row reduction over consecutive rows in an arbitrarily ordered array.
|
|
template<typename T , typename U , typename Op , int N_READS = REDUCE_N_READS, int N_WRITES = REDUCE_N_WRITES> |
METAL_FUNC void | threadgroup_reduce (thread U totals[N_WRITES], threadgroup U *shared_vals, uint3 lid, uint simd_lane_id, uint simd_per_group, uint simd_group_id) |
| Reduce within the threadgroup.
|
|
template<typename T , typename U , typename Op , int N_READS = REDUCE_N_READS> |
METAL_FUNC void | thread_reduce (thread U &total, const device T *row, int blocks, int extra) |
|
template<typename T , typename U , typename Op , int NDIMS = 0, int N_READS = REDUCE_N_READS> |
void | row_reduce_small (const device T *in, device U *out, const constant size_t &row_size, const constant size_t &non_row_reductions, const constant int *shape, const constant size_t *strides, const constant int &ndim, const constant int *reduce_shape, const constant size_t *reduce_strides, const constant int &reduce_ndim, uint simd_lane_id, uint3 gid, uint3 gsize, uint3 tid, uint3 tsize) |
|
template<typename T , typename U , typename Op , int N_READS = REDUCE_N_READS, int N_WRITES = REDUCE_N_WRITES> |
void | row_reduce_simple (const device T *in, device U *out, const constant size_t &reduction_size, const constant size_t &out_size, uint3 gid, uint3 gsize, uint3 lid, uint3 lsize, uint simd_lane_id, uint simd_per_group, uint simd_group_id) |
|
template<typename T , typename U , typename Op , int NDIMS = 0, int N_READS = REDUCE_N_READS> |
void | row_reduce_looped (const device T *in, device U *out, const constant size_t &row_size, const constant size_t &non_row_reductions, const constant int *shape, const constant size_t *strides, const constant int &ndim, const constant int *reduce_shape, const constant size_t *reduce_strides, const constant int &reduce_ndim, uint3 gid, uint3 gsize, uint3 lid, uint3 lsize, uint simd_lane_id, uint simd_per_group, uint simd_group_id) |
|