build/html/backend_2metal_2kernels_2utils_8h_source.html

// Copyright © 2023-2024 Apple Inc.


#pragma once


#include <metal_math>


// The correct bf16.h is included based on the metal version

// by giving the correct path to -I during compilation

// e.g. mlx/backend/metal/kernels/metal_3_0/ for Metal 3.0

#include "bf16.h"


#include "mlx/backend/metal/kernels/bf16_math.h"

#include "mlx/backend/metal/kernels/complex.h"

#include "mlx/backend/metal/kernels/defines.h"


// float16_t is an alias for Metal's built-in `half` type.
using float16_t = half;


// Type limits utils


// Generic bounds for element type U, taken directly from
// metal::numeric_limits<U>. In this primary template the "finite" bounds are
// the same values as max/min; specializations may define them differently.
template <typename U>
struct Limits {
  // Upper bounds.
  static const constant U max = metal::numeric_limits<U>::max();
  static const constant U finite_max = metal::numeric_limits<U>::max();

  // Lower bounds.
  static const constant U min = metal::numeric_limits<U>::min();
  static const constant U finite_min = metal::numeric_limits<U>::min();
};


#define instantiate_default_limit(type)                                      \

  template <>                                                                \

  struct Limits<type> {                                                      \

    static constexpr constant type max = metal::numeric_limits<type>::max(); \

    static constexpr constant type min = metal::numeric_limits<type>::min(); \

    static constexpr constant type finite_max =                              \

        metal::numeric_limits<type>::max();                                  \

    static constexpr constant type finite_min =                              \

        metal::numeric_limits<type>::min();                                  \

  };


instantiate_default_limit(uint8_t);

instantiate_default_limit(uint16_t);

instantiate_default_limit(uint32_t);

instantiate_default_limit(uint64_t);

instantiate_default_limit(int8_t);

instantiate_default_limit(int16_t);

instantiate_default_limit(int32_t);

instantiate_default_limit(int64_t);


#define instantiate_float_limit(type)             \

  template <>                                     \

  struct Limits<type> {                           \

    static constexpr constant type max =          \

        metal::numeric_limits<type>::infinity();  \

    static constexpr constant type min =          \

        -metal::numeric_limits<type>::infinity(); \

    static constexpr constant type finite_max =   \

        metal::numeric_limits<type>::max();       \

    static constexpr constant type finite_min =   \

        -metal::numeric_limits<type>::max();      \

  };


instantiate_float_limit(half);

instantiate_float_limit(float);

instantiate_float_limit(bfloat16_t);


// Limits for bool: `true` is the upper bound and `false` the lower bound.
// finite_max / finite_min are provided (equal to max / min) so that this
// specialization exposes the same four members as the primary template and
// the integer / floating-point specializations.
template <>
struct Limits<bool> {
  static constexpr constant bool max = true;
  static constexpr constant bool min = false;
  static constexpr constant bool finite_max = true;
  static constexpr constant bool finite_min = false;
};


// Limits for complex64_t. max / min use +infinity / -infinity in both the real
// and imaginary parts. finite_max / finite_min use +/- the largest finite
// float in both parts, mirroring the floating-point specializations, so this
// specialization exposes the same four members as the other Limits variants.
template <>
struct Limits<complex64_t> {
  static constexpr constant complex64_t max = complex64_t(
      metal::numeric_limits<float>::infinity(),
      metal::numeric_limits<float>::infinity());
  static constexpr constant complex64_t min = complex64_t(
      -metal::numeric_limits<float>::infinity(),
      -metal::numeric_limits<float>::infinity());
  static constexpr constant complex64_t finite_max = complex64_t(
      metal::numeric_limits<float>::max(),
      metal::numeric_limits<float>::max());
  static constexpr constant complex64_t finite_min = complex64_t(
      -metal::numeric_limits<float>::max(),
      -metal::numeric_limits<float>::max());
};


// Indexing utils


// Expands to the clang loop pragma requesting full unrolling; place it
// immediately before the loop it should apply to.
#define MLX_MTL_PRAGMA_UNROLL _Pragma("clang loop unroll(full)")


// Single Array with generic dims


// Maps a flat element index to a strided offset.
//
// Coordinates are peeled off `elem` from the last dimension to the first
// (elem % shape[i], then elem /= shape[i]) and each is multiplied by the
// matching stride. The walk stops early once the remaining index is zero.
//
//   elem    - flat element index (uint overload)
//   shape   - extent of each of the `ndim` dimensions
//   strides - stride of each dimension
//   ndim    - number of dimensions
template <typename StrideT, typename IdxT = StrideT>
METAL_FUNC IdxT elem_to_loc(
    uint elem,
    constant const int* shape,
    constant const StrideT* strides,
    int ndim) {
  IdxT offset = 0;
  int axis = ndim;
  while (axis > 0 && elem > 0) {
    --axis;
    const int extent = shape[axis];
    offset += (elem % extent) * IdxT(strides[axis]);
    elem /= extent;
  }
  return offset;
}


// Same mapping as the uint overload, but the flat element index is carried in
// StrideT so it can use the stride type's width.
//
// Coordinates are extracted from the last dimension to the first and the walk
// stops as soon as the remaining index reaches zero.
template <typename StrideT, typename IdxT = StrideT>
METAL_FUNC IdxT elem_to_loc(
    StrideT elem,
    constant const int* shape,
    constant const StrideT* strides,
    int ndim) {
  IdxT offset = 0;
  int axis = ndim;
  while (axis > 0 && elem > 0) {
    --axis;
    const int extent = shape[axis];
    offset += (elem % extent) * IdxT(strides[axis]);
    elem /= extent;
  }
  return offset;
}


// Templated overload taking a 3-component index for arrays of arbitrary rank:
// elem.x addresses the last dimension, elem.y the second-to-last, and elem.z
// is a flat index over the remaining leading dimensions (ndim - 3 down to 0).
// Reads strides[ndim - 1] and strides[ndim - 2] unconditionally.
template <typename StrideT, typename IdxT = StrideT>
METAL_FUNC IdxT elem_to_loc(
    uint3 elem,
    constant const int* shape,
    constant const StrideT* strides,
    int ndim) {
  const IdxT last_stride = IdxT(strides[ndim - 1]);
  const IdxT prev_stride = IdxT(strides[ndim - 2]);
  IdxT offset = elem.x * last_stride + elem.y * prev_stride;

  // Unravel elem.z across the leading dimensions, innermost first.
  int axis = ndim - 3;
  while (axis >= 0) {
    const int extent = shape[axis];
    offset += (elem.z % extent) * IdxT(strides[axis]);
    elem.z /= extent;
    --axis;
  }
  return offset;
}


// Single Array with fixed N dims


// 1-D case: the offset is the element index scaled by the single stride.
template <typename StrideT, typename IdxT = StrideT>
METAL_FUNC IdxT elem_to_loc_1(uint elem, constant const StrideT& stride) {
  const IdxT step = IdxT(stride);
  return elem * step;
}


// 2-D case: elem.x is paired with strides[1] and elem.y with strides[0].
template <typename StrideT, typename IdxT = StrideT>
METAL_FUNC IdxT elem_to_loc_2(uint2 elem, constant const StrideT strides[2]) {
  const IdxT x_stride = IdxT(strides[1]);
  const IdxT y_stride = IdxT(strides[0]);
  return elem.x * x_stride + elem.y * y_stride;
}


// 3-D case: elem.x is paired with strides[2], elem.y with strides[1] and
// elem.z with strides[0].
template <typename StrideT, typename IdxT = StrideT>
METAL_FUNC IdxT elem_to_loc_3(uint3 elem, constant const StrideT strides[3]) {
  const IdxT x_stride = IdxT(strides[2]);
  const IdxT y_stride = IdxT(strides[1]);
  const IdxT z_stride = IdxT(strides[0]);
  return elem.x * x_stride + elem.y * y_stride + elem.z * z_stride;
}


// Multiple Arrays with generic dims


// Computes strided offsets into two arrays that share one shape.
//
// elem.x addresses the last dimension, elem.y the second-to-last, and elem.z
// is a flat index over the remaining leading dimensions. The result holds the
// offset for `a_strides` in .x and for `b_strides` in .y.
template <typename StrideT, typename IdxT = StrideT>
METAL_FUNC vec<IdxT, 2> elem_to_loc_2_nd(
    uint3 elem,
    constant const int* shape,
    constant const StrideT* a_strides,
    constant const StrideT* b_strides,
    int ndim) {
  // Contribution of the two innermost dimensions.
  const IdxT a_inner = IdxT(
      elem.x * IdxT(a_strides[ndim - 1]) +
      IdxT(elem.y) * IdxT(a_strides[ndim - 2]));
  const IdxT b_inner = IdxT(
      elem.x * IdxT(b_strides[ndim - 1]) +
      elem.y * IdxT(b_strides[ndim - 2]));
  vec<IdxT, 2> loc = {a_inner, b_inner};

  // Unravel elem.z across the leading dimensions, innermost first.
  int axis = ndim - 3;
  while (axis >= 0) {
    const int extent = shape[axis];
    uint coord = elem.z % extent;
    loc.x += coord * IdxT(a_strides[axis]);
    loc.y += coord * IdxT(b_strides[axis]);
    elem.z /= extent;
    --axis;
  }
  return loc;
}


// Computes strided offsets into three arrays that share one shape.
//
// elem.x addresses the last dimension, elem.y the second-to-last, and elem.z
// is a flat index over the remaining leading dimensions. The result holds the
// offsets for `a_strides`, `b_strides` and `c_strides` in .x, .y and .z.
//
// Each initial component is explicitly converted to IdxT (as
// elem_to_loc_2_nd does) so the brace-initialization does not contain an
// implicit narrowing conversion when IdxT is narrower than the type of
// `uint * IdxT`.
template <typename IdxT = size_t>
METAL_FUNC vec<IdxT, 3> elem_to_loc_3_nd(
    uint3 elem,
    constant const int* shape,
    constant const size_t* a_strides,
    constant const size_t* b_strides,
    constant const size_t* c_strides,
    int ndim) {
  vec<IdxT, 3> loc = {
      IdxT(
          elem.x * IdxT(a_strides[ndim - 1]) +
          elem.y * IdxT(a_strides[ndim - 2])),
      IdxT(
          elem.x * IdxT(b_strides[ndim - 1]) +
          elem.y * IdxT(b_strides[ndim - 2])),
      IdxT(
          elem.x * IdxT(c_strides[ndim - 1]) +
          elem.y * IdxT(c_strides[ndim - 2]))};

  // Unravel elem.z across the leading dimensions, innermost first.
  for (int d = ndim - 3; d >= 0; --d) {
    uint l = elem.z % shape[d];
    loc.x += l * IdxT(a_strides[d]);
    loc.y += l * IdxT(b_strides[d]);
    loc.z += l * IdxT(c_strides[d]);
    elem.z /= shape[d];
  }
  return loc;
}


// Elem to loc in a loop utils


// Incrementally tracks the strided offset of a position that is advanced in a
// loop, avoiding a full index-to-offset conversion at every step.
//
// This level owns dimension `dim - 1`; `inner_looper` (constructed with
// dim - 1) owns the dimensions before it. `index` is the current coordinate
// along this level's dimension and `offset` the current total offset.
template <int DIM, typename OffsetT = size_t, bool General = true>
struct LoopedElemToLoc {
  int dim;
  LoopedElemToLoc<DIM - 1, OffsetT, General> inner_looper;
  OffsetT offset{0};
  int index{0};

  LoopedElemToLoc(int dim) : dim(dim), inner_looper(dim - 1) {}

  // Advances by one element. When the coordinate reaches shape[dim - 1] it
  // resets to 0, the inner looper is advanced, and its offset is adopted.
  // Does nothing when dim == 0.
  void next(const constant int* shape, const constant size_t* strides) {
    if (dim != 0) {
      const int axis = dim - 1;
      ++index;
      offset += OffsetT(strides[axis]);
      if (index >= shape[axis]) {
        index = 0;
        inner_looper.next(shape, strides);
        offset = inner_looper.offset;
      }
    }
  }

  // Advances by n elements. Does nothing when dim == 0.
  void next(int n, const constant int* shape, const constant size_t* strides) {
    if (dim == 0) {
      return;
    }
    const int axis = dim - 1;
    const int extent = shape[axis];

    index += n;
    offset += n * OffsetT(strides[axis]);
    if (index < extent) {
      return;
    }

    // The coordinate ran past this dimension: carry into the inner looper,
    // restart from its offset, then re-apply whatever is left over.
    int leftover = index - extent;
    if (leftover < extent) {
      inner_looper.next(shape, strides);
    } else {
      inner_looper.next(1 + leftover / extent, shape, strides);
      leftover = leftover % extent;
    }
    index = 0;
    offset = inner_looper.offset;
    if (leftover > 0) {
      next(leftover, shape, strides);
    }
  }

  // Current offset.
  OffsetT location() {
    return offset;
  }
};


// DIM == 1, General == true: terminal level of the looper chain.
//
// It keeps a flat `index`. When it covers a single dimension (dim <= 1) the
// offset is derived from strides[0] alone; otherwise the offset is recomputed
// from `index` with elem_to_loc over all `dim` dimensions.
template <typename OffsetT>
struct LoopedElemToLoc<1, OffsetT, true> {
  int dim;
  OffsetT offset{0};
  uint index{0};

  LoopedElemToLoc(int dim) : dim(dim) {}

  // Advances by one element.
  void next(const constant int* shape, const constant size_t* strides) {
    ++index;
    if (dim <= 1) {
      offset += OffsetT(strides[0]);
    } else {
      offset = elem_to_loc<size_t, OffsetT>(index, shape, strides, dim);
    }
  }

  // Advances by n elements.
  void next(int n, const constant int* shape, const constant size_t* strides) {
    index += n;
    if (dim <= 1) {
      offset = index * OffsetT(strides[0]);
    } else {
      offset = elem_to_loc<size_t, OffsetT>(index, shape, strides, dim);
    }
  }

  // Current offset.
  OffsetT location() {
    return offset;
  }
};


// DIM == 1, General == false: only strides[0] is ever used. The constructor
// argument and the shape pointer are accepted for interface compatibility and
// ignored.
template <typename OffsetT>
struct LoopedElemToLoc<1, OffsetT, false> {
  OffsetT offset{0};

  LoopedElemToLoc(int) {}

  // Advances by one element.
  void next(const constant int*, const constant size_t* strides) {
    const OffsetT step = OffsetT(strides[0]);
    offset += step;
  }

  // Advances by n elements.
  void next(int n, const constant int*, const constant size_t* strides) {
    const OffsetT step = OffsetT(strides[0]);
    offset += n * step;
  }

  // Current offset.
  OffsetT location() {
    return offset;
  }
};


// Calculation utils


// Integer ceiling division: the smallest value q such that q * M >= N,
// for non-negative N and positive M.
//
// Computed as the truncated quotient plus one when there is a remainder,
// rather than (N + M - 1) / M, so the intermediate value cannot wrap around
// when N is close to the maximum of its type.
template <typename T, typename U>
inline T ceildiv(T N, U M) {
  return N / M + (N % M != 0 ? 1 : 0);
}


// https://docs.oracle.com/cd/E19957-01/806-3568/ncg_goldberg.html#1202


// log(1 + x) for float.
//
// Let u = 1 + x as rounded by the hardware:
//   - u equal to Limits<float>::max returns Limits<float>::max;
//   - u equal to 1 (x too small to change the sum) returns x itself;
//   - otherwise returns x * log(u) / (u - 1), where the (u - 1) factor
//     compensates for the rounding in u.
inline float log1p(float x) {
  const float u = 1.0f + x;
  if (u == Limits<float>::max) {
    return Limits<float>::max;
  } else if (u == 1.0f) {
    return x;
  }

  const float correction = metal::log(u) / (u - 1.0f);
  return x * correction;
}


// log(1 + x) for bfloat16_t; the sum and the logarithm are evaluated in float.
//
// Let u = 1 + float(x):
//   - u equal to Limits<float>::max returns Limits<bfloat16_t>::max;
//   - u equal to 1 returns x unchanged;
//   - otherwise returns x * log(u) / (u - 1) converted back to bfloat16_t.
inline bfloat16_t log1p(bfloat16_t x) {
  const float u = 1.0f + static_cast<float>(x);
  if (u == Limits<float>::max) {
    return Limits<bfloat16_t>::max;
  } else if (u == 1.0f) {
    return x;
  }

  const float correction = metal::log(u) / (u - 1.0f);
  return bfloat16_t(x * correction);
}


// SIMD shuffle ops


// uint64_t overload: the value is reinterpreted as a uint2, passed through
// metal::simd_shuffle_down, and reinterpreted back.
inline uint64_t simd_shuffle_down(uint64_t data, uint16_t delta) {
  const uint2 halves = as_type<uint2>(data);
  const uint2 shuffled = metal::simd_shuffle_down(halves, delta);
  return as_type<uint64_t>(shuffled);
}


// int64_t overload: the value is reinterpreted as a uint2, passed through
// metal::simd_shuffle_down, and reinterpreted back.
inline int64_t simd_shuffle_down(int64_t data, uint16_t delta) {
  const uint2 halves = as_type<uint2>(data);
  const uint2 shuffled = metal::simd_shuffle_down(halves, delta);
  return as_type<int64_t>(shuffled);
}


inline bool simd_shuffle_down(bool data, uint16_t delta) {

  return simd_shuffle_down(static_cast<uint32_t>(data), delta);

}


// complex64_t overload: the real and imaginary parts are shuffled
// independently and recombined.
inline complex64_t simd_shuffle_down(complex64_t data, uint16_t delta) {
  const float re = simd_shuffle_down(data.real, delta);
  const float im = simd_shuffle_down(data.imag, delta);
  return complex64_t(re, im);
}


// uint64_t overload: the value is reinterpreted as a uint2, passed through
// metal::simd_shuffle_up, and reinterpreted back.
inline uint64_t simd_shuffle_up(uint64_t data, uint16_t delta) {
  const uint2 halves = as_type<uint2>(data);
  const uint2 shuffled = metal::simd_shuffle_up(halves, delta);
  return as_type<uint64_t>(shuffled);
}


// int64_t overload: the value is reinterpreted as a uint2, passed through
// metal::simd_shuffle_up, and reinterpreted back.
inline int64_t simd_shuffle_up(int64_t data, uint16_t delta) {
  const uint2 halves = as_type<uint2>(data);
  const uint2 shuffled = metal::simd_shuffle_up(halves, delta);
  return as_type<int64_t>(shuffled);
}


inline bool simd_shuffle_up(bool data, uint16_t delta) {

  return simd_shuffle_up(static_cast<uint32_t>(data), delta);

}


// complex64_t overload: the real and imaginary parts are shuffled
// independently and recombined.
inline complex64_t simd_shuffle_up(complex64_t data, uint16_t delta) {
  const float re = simd_shuffle_up(data.real, delta);
  const float im = simd_shuffle_up(data.imag, delta);
  return complex64_t(re, im);
}


// uint64_t overload: both the data and the filling value are reinterpreted as
// uint2, passed through metal::simd_shuffle_and_fill_up, and the result is
// reinterpreted back.
inline uint64_t
simd_shuffle_and_fill_up(uint64_t data, uint64_t filling, uint16_t delta) {
  const uint2 data_halves = as_type<uint2>(data);
  const uint2 fill_halves = as_type<uint2>(filling);
  const uint2 shuffled =
      metal::simd_shuffle_and_fill_up(data_halves, fill_halves, delta);
  return as_type<uint64_t>(shuffled);
}


// int64_t overload: both the data and the filling value are reinterpreted as
// uint2, passed through metal::simd_shuffle_and_fill_up, and the result is
// reinterpreted back.
inline int64_t
simd_shuffle_and_fill_up(int64_t data, int64_t filling, uint16_t delta) {
  const uint2 data_halves = as_type<uint2>(data);
  const uint2 fill_halves = as_type<uint2>(filling);
  const uint2 shuffled =
      metal::simd_shuffle_and_fill_up(data_halves, fill_halves, delta);
  return as_type<int64_t>(shuffled);
}


inline bool simd_shuffle_and_fill_up(bool data, bool filling, uint16_t delta) {

  return simd_shuffle_and_fill_up(

      static_cast<uint32_t>(data), static_cast<uint32_t>(filling), delta);

}


// complex64_t overload: the real and imaginary parts are shuffled
// independently, each with the matching part of `filling`, and recombined.
inline complex64_t simd_shuffle_and_fill_up(
    complex64_t data,
    complex64_t filling,
    uint16_t delta) {
  const float re = simd_shuffle_and_fill_up(data.real, filling.real, delta);
  const float im = simd_shuffle_and_fill_up(data.imag, filling.imag, delta);
  return complex64_t(re, im);
}


// uint64_t overload: the value is reinterpreted as a uint2, passed through
// metal::simd_shuffle, and reinterpreted back.
inline uint64_t simd_shuffle(uint64_t data, uint16_t lane) {
  const uint2 halves = as_type<uint2>(data);
  const uint2 shuffled = metal::simd_shuffle(halves, lane);
  return as_type<uint64_t>(shuffled);
}


// int64_t overload: the value is reinterpreted as a uint2, passed through
// metal::simd_shuffle, and reinterpreted back.
inline int64_t simd_shuffle(int64_t data, uint16_t lane) {
  const uint2 halves = as_type<uint2>(data);
  const uint2 shuffled = metal::simd_shuffle(halves, lane);
  return as_type<int64_t>(shuffled);
}


inline bool simd_shuffle(bool data, uint16_t lane) {

  return simd_shuffle(static_cast<uint32_t>(data), lane);

}


// complex64_t overload: the real and imaginary parts are shuffled
// independently and recombined.
inline complex64_t simd_shuffle(complex64_t data, uint16_t lane) {
  const float re = simd_shuffle(data.real, lane);
  const float im = simd_shuffle(data.imag, lane);
  return complex64_t(re, im);
}


// std::conditional is not included with Metal

// Compile-time type selection: ConditionalType<condition, T, U>::type is T
// when `condition` is true and U when it is false.
template <bool condition, typename T, typename U>
struct ConditionalType {
  using type = T;
};

// `condition` is false: select the second type.
template <typename T, typename U>
struct ConditionalType<false, T, U> {
  using type = U;
};


next
BufferHolder * next
Definition allocator.h:38

complex.h

bfloat16_t
struct _MLX_BFloat16 bfloat16_t
Definition bf16.h:251

instantiate_float_limit
#define instantiate_float_limit(type)
Definition utils.h:50

elem_to_loc
METAL_FUNC IdxT elem_to_loc(uint elem, constant const int *shape, constant const StrideT *strides, int ndim)
Definition utils.h:93

log1p
float log1p(float x)
Definition utils.h:318

elem_to_loc_2
METAL_FUNC IdxT elem_to_loc_2(uint2 elem, constant const StrideT strides[2])
Definition utils.h:145

elem_to_loc_3
METAL_FUNC IdxT elem_to_loc_3(uint3 elem, constant const StrideT strides[3])
Definition utils.h:150

elem_to_loc_3_nd
METAL_FUNC vec< IdxT, 3 > elem_to_loc_3_nd(uint3 elem, constant const int *shape, constant const size_t *a_strides, constant const size_t *b_strides, constant const size_t *c_strides, int ndim)
Definition utils.h:182

elem_to_loc_2_nd
METAL_FUNC vec< IdxT, 2 > elem_to_loc_2_nd(uint3 elem, constant const int *shape, constant const StrideT *a_strides, constant const StrideT *b_strides, int ndim)
Definition utils.h:159

ceildiv
T ceildiv(T N, U M)
Compute ceil((float)N/(float)M)
Definition utils.h:313

instantiate_default_limit
#define instantiate_default_limit(type)
Definition utils.h:30

elem_to_loc_1
METAL_FUNC IdxT elem_to_loc_1(uint elem, constant const StrideT &stride)
Definition utils.h:140

float16_t
half float16_t
Definition utils.h:16

bf16_math.h

defines.h

metal::simd_shuffle_and_fill_up
METAL_FUNC bfloat16_t simd_shuffle_and_fill_up(bfloat16_t data, bfloat16_t filling_data, ushort delta, ushort modulo)
Definition bf16_math.h:377

metal::simd_shuffle
METAL_FUNC bfloat16_t simd_shuffle(bfloat16_t data, ushort simd_lane_id)
Definition bf16_math.h:377

metal::log
METAL_FUNC bfloat16_t log(bfloat16_t x)
Definition bf16_math.h:232

metal::simd_shuffle_down
METAL_FUNC bfloat16_t simd_shuffle_down(bfloat16_t data, ushort delta)
Definition bf16_math.h:377

metal::simd_shuffle_up
METAL_FUNC bfloat16_t simd_shuffle_up(bfloat16_t data, ushort delta)
Definition bf16_math.h:377

_MLX_BFloat16
Definition bf16.h:48

ConditionalType< true, T, U >::type
T type
Definition utils.h:433

ConditionalType
Definition utils.h:427

ConditionalType::type
U type
Definition utils.h:428

Limits
Definition utils.h:23

Limits::max
static const constant U max
Definition utils.h:24

Limits::finite_max
static const constant U finite_max
Definition utils.h:26

Limits::min
static const constant U min
Definition utils.h:25

Limits::finite_min
static const constant U finite_min
Definition utils.h:27

LoopedElemToLoc< 1, OffsetT, false >::next
void next(const constant int *, const constant size_t *strides)
Definition utils.h:294

LoopedElemToLoc< 1, OffsetT, false >::LoopedElemToLoc
LoopedElemToLoc(int)
Definition utils.h:292

LoopedElemToLoc< 1, OffsetT, false >::location
OffsetT location()
Definition utils.h:302

LoopedElemToLoc< 1, OffsetT, false >::next
void next(int n, const constant int *, const constant size_t *strides)
Definition utils.h:298

LoopedElemToLoc< 1, OffsetT, true >::location
OffsetT location()
Definition utils.h:283

LoopedElemToLoc< 1, OffsetT, true >::dim
int dim
Definition utils.h:259

LoopedElemToLoc< 1, OffsetT, true >::next
void next(int n, const constant int *shape, const constant size_t *strides)
Definition utils.h:274

LoopedElemToLoc< 1, OffsetT, true >::LoopedElemToLoc
LoopedElemToLoc(int dim)
Definition utils.h:263

LoopedElemToLoc< 1, OffsetT, true >::next
void next(const constant int *shape, const constant size_t *strides)
Definition utils.h:265

LoopedElemToLoc
Definition utils.h:208

LoopedElemToLoc::next
void next(const constant int *shape, const constant size_t *strides)
Definition utils.h:216

LoopedElemToLoc::LoopedElemToLoc
LoopedElemToLoc(int dim)
Definition utils.h:214

LoopedElemToLoc::next
void next(int n, const constant int *shape, const constant size_t *strides)
Definition utils.h:229

LoopedElemToLoc::inner_looper
LoopedElemToLoc< DIM - 1, OffsetT, General > inner_looper
Definition utils.h:210

LoopedElemToLoc::location
OffsetT location()
Definition utils.h:252

LoopedElemToLoc::index
int index
Definition utils.h:212

LoopedElemToLoc::offset
OffsetT offset
Definition utils.h:211

LoopedElemToLoc::dim
int dim
Definition utils.h:209

complex64_t
Definition complex.h:20

complex64_t::imag
float imag
Definition complex.h:22

complex64_t::real
float real
Definition complex.h:21