Go to the source code of this file.
 | 
| template<typename T , typename U , typename Op , int NDIMS, int N_READS = REDUCE_N_READS>  | 
| void  | col_reduce_small (const device T *in, device U *out, const constant size_t &reduction_size, const constant size_t &reduction_stride, const constant int *shape, const constant size_t *strides, const constant int &ndim, const constant int *reduce_shape, const constant size_t *reduce_strides, const constant int &reduce_ndim, const constant size_t &non_col_reductions, uint3 gid, uint3 gsize, uint simd_lane_id, uint simd_group_id, uint3 tid, uint3 tsize) | 
|   | 
| template<typename T , typename U , typename Op , int NDIMS, int BM, int BN>  | 
| void  | col_reduce_looped (const device T *in, device U *out, const constant size_t &reduction_size, const constant size_t &reduction_stride, const constant int *shape, const constant size_t *strides, const constant int &ndim, const constant int *reduce_shape, const constant size_t *reduce_strides, const constant int &reduce_ndim, const constant size_t &non_col_reductions, uint3 gid, uint3 gsize, uint simd_lane_id, uint simd_group_id) | 
|   | Column reduction implemented with a simple looped approach; the steps are listed in the detailed description of col_reduce_looped().
  | 
|   | 
◆ col_reduce_looped()
template<typename T, typename U, typename Op, int NDIMS, int BM, int BN>
void col_reduce_looped(
    const device T *in,
    device U *out,
    const constant size_t &reduction_size,
    const constant size_t &reduction_stride,
    const constant int *shape,
    const constant size_t *strides,
    const constant int &ndim,
    const constant int *reduce_shape,
    const constant size_t *reduce_strides,
    const constant int &reduce_ndim,
    const constant size_t &non_col_reductions,
    uint3 gid,
    uint3 gsize,
    uint simd_lane_id,
    uint simd_group_id)
        
      
 
The reduction uses the following simple looped approach:
- Each thread keeps running totals for BN / n_simdgroups outputs.
 
- Load a (BM, BN) tile into registers and accumulate it into the running totals.
 
- Move ahead by BM steps until the column axis and the non column reductions are exhausted.
 
- If BM == 32, transpose the running totals in shared memory (SM) and reduce them with a SIMD reduction. Otherwise, write them to shared memory and have BN threads accumulate the running totals with a loop.
 
- Write the reduced totals to the output.
 
 
 
◆ col_reduce_small()
template<typename T, typename U, typename Op, int NDIMS, int N_READS = REDUCE_N_READS>
void col_reduce_small(
    const device T *in,
    device U *out,
    const constant size_t &reduction_size,
    const constant size_t &reduction_stride,
    const constant int *shape,
    const constant size_t *strides,
    const constant int &ndim,
    const constant int *reduce_shape,
    const constant size_t *reduce_strides,
    const constant int &reduce_ndim,
    const constant size_t &non_col_reductions,
    uint3 gid,
    uint3 gsize,
    uint simd_lane_id,
    uint simd_group_id,
    uint3 tid,
    uint3 tsize)