MLX: loader.h
// Copyright © 2024 Apple Inc.

#pragma once

#include "mlx/backend/metal/kernels/steel/defines.h"

///////////////////////////////////////////////////////////////////////////////
// Loading helper
///////////////////////////////////////////////////////////////////////////////

namespace mlx {
namespace steel {

template <
    typename T,
    short BROWS,
    short BCOLS,
    short dst_ld,
    short reduction_dim,
    short tgp_size,
    short alignment = 1,
    short n_reads = (BCOLS * BROWS) / (tgp_size),
    short TCOLS = BCOLS / n_reads,
    short TROWS = tgp_size / TCOLS>
struct BlockLoader {
  STEEL_CONST short n_rows = (BROWS + TROWS - 1) / TROWS;
  STEEL_CONST short vec_size = n_reads;

  // Leading dimension for src
  const int src_ld;
  const int tile_stride;

  // Thread location indices
  const short thread_idx;
  const short bi;
  const short bj;

  // threadgroup and device memory
  threadgroup T* dst;
  const device T* src;

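  // ReadVector packs one thread's vec_size contiguous elements so that
  // load_unsafe() below can copy them as a single chunk; the
  // alignas(alignment * sizeof(T)) qualifier reflects the alignment the
  // caller promises via the template parameter, so the copy can be
  // vectorized.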
  struct alignas(alignment * sizeof(T)) ReadVector {
    uint8_t v[sizeof(T) * vec_size];
  };

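  // Each thread owns one vec_size-wide segment of the tile: thread_idx is its
  // flat index in the threadgroup, bi the tile row it starts on, and bj the
  // first column of its segment; the load loops then cover BROWS rows in
  // steps of TROWS.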
  /* Constructor */
  METAL_FUNC BlockLoader(
      const device T* src_,
      const int src_ld_,
      threadgroup T* dst_,
      ushort simd_group_id [[simdgroup_index_in_threadgroup]],
      ushort simd_lane_id [[thread_index_in_simdgroup]])
      : src_ld(src_ld_),
        tile_stride(reduction_dim ? BCOLS : BROWS * src_ld),
        thread_idx(simd_group_id * 32 + simd_lane_id),
        bi(thread_idx / TCOLS),
        bj(vec_size * (thread_idx % TCOLS)),
        dst(dst_ + bi * dst_ld + bj),
        src(src_ + bi * src_ld + bj) {}

  /* Apply operation to threadgroup without bound checking */
  template <typename UnaryOp>
  METAL_FUNC void apply_inplace_op(thread const UnaryOp& op) const {
    STEEL_PRAGMA_UNROLL
    for (short i = 0; i < BROWS; i += TROWS) {
      STEEL_PRAGMA_UNROLL
      for (short j = 0; j < vec_size; j++) {
        dst[i * dst_ld + j] = op.apply(dst[i * dst_ld + j]);
      }
    }
  }

  /* Load from device memory into threadgroup memory - without bound checking */
  METAL_FUNC void load_unsafe() const {
    STEEL_PRAGMA_UNROLL
    for (short i = 0; i < BROWS; i += TROWS) {
      *((threadgroup ReadVector*)(&dst[i * dst_ld])) =
          *((const device ReadVector*)(&src[i * src_ld]));
    }
  }

  /* Load from device memory into threadgroup memory - with bound checking */
  METAL_FUNC void load_safe(short2 src_tile_dim) const {
    src_tile_dim = src_tile_dim - short2(bj, bi);

    // Skip loading if thread has no valid reads
    if (src_tile_dim.x <= 0 || src_tile_dim.y <= 0) {
      STEEL_PRAGMA_UNROLL
      for (short i = 0; i < BROWS; i += TROWS) {
        STEEL_PRAGMA_UNROLL
        for (short j = 0; j < vec_size; j++) {
          dst[i * dst_ld + j] = T(0);
        }
      }
      return;
    }

    // Use fast thread memory for bound checks: out-of-bounds lanes are
    // redirected to index 0 and zeroed afterwards, avoiding divergent branches
    bool tmp_idx[vec_size];
    T tmp_val[vec_size];

    STEEL_PRAGMA_UNROLL
    for (short i = 0; i < BROWS; i += TROWS) {
      // Make sure tmp_idx only contains valid indices
      STEEL_PRAGMA_UNROLL
      for (short j = 0; j < vec_size; j++) {
        tmp_idx[j] = (i < src_tile_dim.y) && (j < src_tile_dim.x);
      }

      // Read valid indices into tmp_val
      STEEL_PRAGMA_UNROLL
      for (short j = 0; j < vec_size; j++) {
        tmp_val[j] = src[(tmp_idx[j] ? i * src_ld + j : 0)];
      }

      // Zero out unneeded values
      STEEL_PRAGMA_UNROLL
      for (short j = 0; j < vec_size; j++) {
        tmp_val[j] = tmp_idx[j] ? tmp_val[j] : T(0);
      }

      // Copy values to threadgroup memory
      STEEL_PRAGMA_UNROLL
      for (short j = 0; j < vec_size; j++) {
        dst[i * dst_ld + j] = tmp_val[j];
      }
    }
  }

  /* Iteration helper */
  METAL_FUNC void next() {
    src += tile_stride;
  }
};

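// Usage sketch (illustrative only, not part of this header): the tile sizes,
// buffer names, and index names below are assumptions chosen to satisfy the
// template constraints. A GEMM-style kernel would typically give each operand
// a BlockLoader over a threadgroup tile and step it along the reduction
// dimension:
//
//   using loader_a_t = BlockLoader<
//       float,
//       /* BROWS */ 32,
//       /* BCOLS */ 32,
//       /* dst_ld */ 32,
//       /* reduction_dim */ 1, // advance along columns (K)
//       /* tgp_size */ 128>;   // => n_reads = 8, TCOLS = 4, TROWS = 32
//
//   threadgroup float As[32 * 32];
//   thread loader_a_t loader_a(A, lda, As, simd_gid, simd_lid);
//   for (int k = 0; k < K; k += 32) {
//     loader_a.load_unsafe();
//     threadgroup_barrier(mem_flags::mem_threadgroup);
//     // ... accumulate from As ...
//     threadgroup_barrier(mem_flags::mem_threadgroup);
//     loader_a.next();
//   }
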
template <int R, int C>
struct CShape {
  STEEL_CONST int kRows = R;
  STEEL_CONST int kCols = C;
};

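// BlockLoaderT generalizes BlockLoader's destination addressing: the
// threadgroup tile is indexed with explicit row/column strides (kDstStrRow,
// kDstStrCol) rather than a single leading dimension, letting callers choose
// the staged layout (e.g. a transposed tile). The trade-off is that loads are
// element-by-element instead of the vectorized ReadVector copies above.
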
template <
    typename T,
    short BROWS,
    short BCOLS,
    short kDstStrRow,
    short kDstStrCol,
    short reduction_dim,
    short tgp_size,
    short n_reads = (BCOLS * BROWS) / (tgp_size),
    short TCOLS = BCOLS / n_reads,
    short TROWS = tgp_size / TCOLS>
struct BlockLoaderT {
  STEEL_CONST short n_rows = (BROWS + TROWS - 1) / TROWS;
  STEEL_CONST short vec_size = n_reads;

  // Leading dimension for src
  const int src_ld;
  const int tile_stride;

  // Thread location indices
  const short thread_idx;
  const short bi;
  const short bj;

  // threadgroup and device memory
  threadgroup T* dst;
  const device T* src;

  /* Constructor */
  METAL_FUNC BlockLoaderT(
      const device T* src_,
      const int src_ld_,
      threadgroup T* dst_,
      ushort simd_group_id [[simdgroup_index_in_threadgroup]],
      ushort simd_lane_id [[thread_index_in_simdgroup]])
      : src_ld(src_ld_),
        tile_stride(reduction_dim ? BCOLS : BROWS * src_ld),
        thread_idx(simd_group_id * 32 + simd_lane_id),
        bi(thread_idx / TCOLS),
        bj(vec_size * (thread_idx % TCOLS)),
        dst(dst_ + bi * kDstStrRow + bj * kDstStrCol),
        src(src_ + bi * src_ld + bj) {}

  /* Apply operation to threadgroup without bound checking */
  template <typename UnaryOp>
  METAL_FUNC void apply_inplace_op(thread const UnaryOp& op) const {
    STEEL_PRAGMA_UNROLL
    for (short i = 0; i < BROWS; i += TROWS) {
      STEEL_PRAGMA_UNROLL
      for (short j = 0; j < vec_size; j++) {
        dst[i * kDstStrRow + j * kDstStrCol] =
            op.apply(dst[i * kDstStrRow + j * kDstStrCol]);
      }
    }
  }

  /* Load from device memory into threadgroup memory - without bound checking */
  METAL_FUNC void load_unsafe() const {
    STEEL_PRAGMA_UNROLL
    for (short i = 0; i < BROWS; i += TROWS) {
      STEEL_PRAGMA_UNROLL
      for (short j = 0; j < vec_size; j++) {
        dst[i * kDstStrRow + j * kDstStrCol] = src[i * src_ld + j];
      }
    }
  }

  /* Load from device memory into threadgroup memory - with bound checking */
  METAL_FUNC void load_safe(short2 src_tile_dim) const {
    src_tile_dim = src_tile_dim - short2(bj, bi);

    // Skip loading if thread has no valid reads
    if (src_tile_dim.x <= 0 || src_tile_dim.y <= 0) {
      STEEL_PRAGMA_UNROLL
      for (short i = 0; i < BROWS; i += TROWS) {
        STEEL_PRAGMA_UNROLL
        for (short j = 0; j < vec_size; j++) {
          dst[i * kDstStrRow + j * kDstStrCol] = T(0);
        }
      }
      return;
    }

    // Use fast thread memory for bound checks: out-of-bounds lanes are
    // redirected to index 0 and zeroed afterwards, avoiding divergent branches
    bool tmp_idx[vec_size];
    T tmp_val[vec_size];

    STEEL_PRAGMA_UNROLL
    for (short i = 0; i < BROWS; i += TROWS) {
      // Make sure tmp_idx only contains valid indices
      STEEL_PRAGMA_UNROLL
      for (short j = 0; j < vec_size; j++) {
        tmp_idx[j] = (i < src_tile_dim.y) && (j < src_tile_dim.x);
      }

      // Read valid indices into tmp_val
      STEEL_PRAGMA_UNROLL
      for (short j = 0; j < vec_size; j++) {
        tmp_val[j] = src[(tmp_idx[j] ? i * src_ld + j : 0)];
      }

      // Zero out unneeded values
      STEEL_PRAGMA_UNROLL
      for (short j = 0; j < vec_size; j++) {
        tmp_val[j] = tmp_idx[j] ? tmp_val[j] : T(0);
      }

      // Copy values to threadgroup memory
      STEEL_PRAGMA_UNROLL
      for (short j = 0; j < vec_size; j++) {
        dst[i * kDstStrRow + j * kDstStrCol] = tmp_val[j];
      }
    }
  }

  /* Iteration helper */
  METAL_FUNC void next() {
    src += tile_stride;
  }
};

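// Usage sketch for BlockLoaderT (illustrative only): the sizes, strides, and
// names below are assumptions, not requirements. Swapping the destination
// strides stages a tile transposed in threadgroup memory, since element
// (i, j) of the source tile lands at Bs[j * 32 + i]:
//
//   using loader_bt_t = BlockLoaderT<
//       float,
//       /* BROWS */ 32,
//       /* BCOLS */ 32,
//       /* kDstStrRow */ 1,  // consecutive src rows are adjacent in a column
//       /* kDstStrCol */ 32, // consecutive src cols step by the tile leading dim
//       /* reduction_dim */ 1,
//       /* tgp_size */ 128>;
//
//   threadgroup float Bs[32 * 32];
//   thread loader_bt_t loader_b(B, ldb, Bs, simd_gid, simd_lid);
//   loader_b.load_safe(short2(cols_remaining, rows_remaining));
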
} // namespace steel
} // namespace mlx