
template <typename T>
METAL_FUNC void thread_swap(thread T &a, thread T &b);

template <typename T, typename U, bool ARG_SORT, short BLOCK_THREADS, short N_PER_THREAD>
void block_sort(
    const device T *inp,
    device U *out,
    const constant int &size_sorted_axis,
    const constant int &stride_sorted_axis,
    const constant int &stride_segment_axis,
    uint3 tid,
    uint3 lid);

template <typename T, typename U, bool ARG_SORT, short BLOCK_THREADS, short N_PER_THREAD>
void block_sort_nc(
    const device T *inp,
    device U *out,
    const constant int &size_sorted_axis,
    const constant int &stride_sorted_axis,
    const constant int &nc_dim,
    const device int *nc_shape,
    const device size_t *nc_strides,
    uint3 tid,
    uint3 lid);

template <typename val_t, typename idx_t, bool ARG_SORT, short BLOCK_THREADS, short N_PER_THREAD>
void mb_block_sort(
    const device val_t *inp,
    device val_t *out_vals,
    device idx_t *out_idxs,
    const constant int &size_sorted_axis,
    const constant int &stride_sorted_axis,
    const constant int &nc_dim,
    const device int *nc_shape,
    const device size_t *nc_strides,
    uint3 tid,
    uint3 lid);

template <typename val_t, typename idx_t, bool ARG_SORT, short BLOCK_THREADS, short N_PER_THREAD>
void mb_block_partition(
    device idx_t *block_partitions,
    const device val_t *dev_vals,
    const device idx_t *dev_idxs,
    const constant int &size_sorted_axis,
    const constant int &merge_tiles,
    uint3 tid,
    uint3 lid,
    uint3 tgp_dims);

template <typename val_t, typename idx_t, bool ARG_SORT, short BLOCK_THREADS, short N_PER_THREAD, typename CompareOp = LessThan<val_t>>
void mb_block_merge(
    const device idx_t *block_partitions,
    const device val_t *dev_vals_in,
    const device idx_t *dev_idxs_in,
    device val_t *dev_vals_out,
    device idx_t *dev_idxs_out,
    const constant int &size_sorted_axis,
    const constant int &merge_tiles,
    const constant int &num_tiles,
    uint3 tid,
    uint3 lid);
