build/html/sort_8h_source.html

// Copyright © 2023-2024 Apple Inc.


// Qualifier bundle for compile-time constants: `static`, the Metal `constant`
// address space, and `constexpr const`.
#define MLX_MTL_CONST static constant constexpr const

// Expands to a clang pragma that requests full unrolling of the loop that
// immediately follows it.
#define MLX_MTL_LOOP_UNROLL _Pragma("clang loop unroll(full)")


using namespace metal;


// Based on GPU merge sort algorithm at

// https://github.com/NVIDIA/cccl/tree/main/cub/cub


// Thread-level sort


/// Exchanges the contents of two values that live in the `thread` address
/// space.
template <typename T>
METAL_FUNC void thread_swap(thread T& a, thread T& b) {
  const T previous_b = b;
  b = a;
  a = previous_b;
}


/// Strict "less than" comparison functor; the default ordering of the sort
/// kernels in this file.
///
/// `init` is `Limits<T>::max`. The kernels in this file store it into slots
/// that lie past the end of the sorted axis.
template <typename T>
struct LessThan {
  static constexpr constant T init = Limits<T>::max;

  /// Returns true when `a` compares strictly less than `b`.
  METAL_FUNC bool operator()(T a, T b) {
    const bool a_orders_first = a < b;
    return a_orders_first;
  }
};


/// Sorts the N_PER_THREAD values owned by a single thread, in place.
///
/// The loop nest is an odd-even transposition network: N_PER_THREAD passes,
/// each comparing adjacent pairs starting at offset `i & 1`. Both loops are
/// marked for full unrolling.
///
/// Template parameters:
///   ValT / IdxT   element and index types
///   ARG_SORT      when true, `idxs` is permuted alongside `vals`
///   N_PER_THREAD  number of elements per thread
///   CompareOp     functor; `op(x, y)` true means x must come before y
template <
    typename ValT,
    typename IdxT,
    bool ARG_SORT,
    short N_PER_THREAD,
    typename CompareOp>
struct ThreadSort {
  /// @param vals values to sort (reordered in place)
  /// @param idxs indices paired with `vals`; only touched when ARG_SORT is
  ///             true, so callers that do not need indices may pass an
  ///             uninitialised array
  static METAL_FUNC void sort(
      thread ValT (&vals)[N_PER_THREAD],
      thread IdxT (&idxs)[N_PER_THREAD]) {
    CompareOp op;

    MLX_MTL_LOOP_UNROLL
    for (short i = 0; i < N_PER_THREAD; ++i) {
      MLX_MTL_LOOP_UNROLL
      for (short j = i & 1; j < N_PER_THREAD - 1; j += 2) {
        if (op(vals[j + 1], vals[j])) {
          thread_swap(vals[j + 1], vals[j]);
          // BlockMergeSort::sort only loads the index array when ARG_SORT is
          // set; skipping the swap otherwise avoids reading uninitialised
          // registers.
          if (ARG_SORT) {
            thread_swap(idxs[j + 1], idxs[j]);
          }
        }
      }
    }
  }
};


// Threadgroup-level sort


/// Threadgroup-level merge sort over BLOCK_THREADS * N_PER_THREAD elements
/// staged in threadgroup memory.
///
/// Template parameters:
///   ValT / IdxT    element and index types
///   ARG_SORT       when true, an index array is carried along with the values
///   BLOCK_THREADS  threads per threadgroup (the merge rounds double the group
///                  width each time, so presumably a power of two -- confirm)
///   N_PER_THREAD   elements owned by each thread
///   CompareOp      functor; `op(x, y)` true means x must come before y
template <
    typename ValT,
    typename IdxT,
    bool ARG_SORT,
    short BLOCK_THREADS,
    short N_PER_THREAD,
    typename CompareOp>
struct BlockMergeSort {
  using thread_sort_t =
      ThreadSort<ValT, IdxT, ARG_SORT, N_PER_THREAD, CompareOp>;

  /// Binary-searches how many elements of `As` go into the first `sort_md`
  /// elements of merge(As, Bs).
  ///
  /// The search range is [max(0, sort_md - B_sz), min(sort_md, A_sz)]. On
  /// equal keys the element from `As` is kept first, because the range only
  /// shrinks from the right when `op(b, a)` is true.
  ///
  /// @param As, Bs     the two sorted runs
  /// @param A_sz, B_sz their lengths
  /// @param sort_md    position in the merged output to split at
  /// @return offset into `As` of the split point
  static METAL_FUNC int merge_partition(
      const threadgroup ValT* As,
      const threadgroup ValT* Bs,
      short A_sz,
      short B_sz,
      short sort_md) {
    CompareOp op;

    short A_st = max(0, sort_md - B_sz);
    short A_ed = min(sort_md, A_sz);

    while (A_st < A_ed) {
      short md = A_st + (A_ed - A_st) / 2;
      auto a = As[md];
      auto b = Bs[sort_md - 1 - md];

      if (op(b, a)) {
        A_ed = md;
      } else {
        A_st = md + 1;
      }
    }

    return A_ed;
  }

  /// Produces N_PER_THREAD merged outputs from the heads of `As` and `Bs`.
  ///
  /// @param As, Bs         run heads positioned at this thread's partition
  /// @param As_idx, Bs_idx matching index runs; only read when ARG_SORT is
  ///                       true (BlockMergeSort::sort passes nullptr otherwise)
  /// @param A_sz, B_sz     elements remaining in each run
  /// @param vals           receives the merged values
  /// @param idxs           receives the merged indices when ARG_SORT is true;
  ///                       left untouched otherwise
  static METAL_FUNC void merge_step(
      const threadgroup ValT* As,
      const threadgroup ValT* Bs,
      const threadgroup IdxT* As_idx,
      const threadgroup IdxT* Bs_idx,
      short A_sz,
      short B_sz,
      thread ValT (&vals)[N_PER_THREAD],
      thread IdxT (&idxs)[N_PER_THREAD]) {
    CompareOp op;
    short a_idx = 0;
    short b_idx = 0;

    for (int i = 0; i < N_PER_THREAD; ++i) {
      // NOTE(review): both heads are loaded before the bounds test below, so
      // a run that is already exhausted is still read one element past its
      // end. That value is only used when both runs are exhausted.
      auto a = As[a_idx];
      auto b = Bs[b_idx];

      // Take from B when B has elements left and either A is exhausted or
      // b must come before a.
      bool pred = (b_idx < B_sz) && (a_idx >= A_sz || op(b, a));

      vals[i] = pred ? b : a;

      // The index runs are null pointers when ARG_SORT is false; never
      // dereference them in that configuration.
      if (ARG_SORT) {
        idxs[i] = pred ? Bs_idx[b_idx] : As_idx[a_idx];
      }

      b_idx += short(pred);
      a_idx += short(!pred);
    }
  }

  /// Sorts the threadgroup buffers in place.
  ///
  /// Each thread loads the N_PER_THREAD elements starting at
  /// `lid.x * N_PER_THREAD`, sorts them in registers, and then takes part in
  /// merge rounds whose group width doubles from 2 up to BLOCK_THREADS.
  ///
  /// @param tgp_vals         values to sort
  /// @param tgp_idxs         indices paired with the values; only accessed
  ///                         when ARG_SORT is true
  /// @param size_sorted_axis threads whose first element index is at or past
  ///                         this value skip the per-thread sort
  /// @param lid              thread position in the threadgroup (x is used)
  static METAL_FUNC void sort(
      threadgroup ValT* tgp_vals [[threadgroup(0)]],
      threadgroup IdxT* tgp_idxs [[threadgroup(1)]],
      int size_sorted_axis,
      uint3 lid [[thread_position_in_threadgroup]]) {
    // Get thread location
    int idx = lid.x * N_PER_THREAD;

    // Load from shared memory
    thread ValT thread_vals[N_PER_THREAD];
    thread IdxT thread_idxs[N_PER_THREAD];
    for (int i = 0; i < N_PER_THREAD; ++i) {
      thread_vals[i] = tgp_vals[idx + i];
      if (ARG_SORT) {
        thread_idxs[i] = tgp_idxs[idx + i];
      }
    }

    // Per thread sort
    if (idx < size_sorted_axis) {
      thread_sort_t::sort(thread_vals, thread_idxs);
    }

    // Do merges using threadgroup memory
    for (int merge_threads = 2; merge_threads <= BLOCK_THREADS;
         merge_threads *= 2) {
      // Update threadgroup memory. The leading barrier keeps this write from
      // racing with reads of the previous round; the trailing one publishes
      // the new contents before anyone merges from them.
      threadgroup_barrier(mem_flags::mem_threadgroup);
      for (int i = 0; i < N_PER_THREAD; ++i) {
        tgp_vals[idx + i] = thread_vals[i];
        if (ARG_SORT) {
          tgp_idxs[idx + i] = thread_idxs[i];
        }
      }
      threadgroup_barrier(mem_flags::mem_threadgroup);

      // Find location in merge step
      int merge_group = lid.x / merge_threads;
      int merge_lane = lid.x % merge_threads;

      int sort_sz = N_PER_THREAD * merge_threads;
      int sort_st = N_PER_THREAD * merge_threads * merge_group;

      // As = tgp_vals[A_st:A_ed] is sorted
      // Bs = tgp_vals[B_st:B_ed] is sorted
      int A_st = sort_st;
      int A_ed = sort_st + sort_sz / 2;
      int B_st = sort_st + sort_sz / 2;
      int B_ed = sort_st + sort_sz;

      const threadgroup ValT* As = tgp_vals + A_st;
      const threadgroup ValT* Bs = tgp_vals + B_st;
      int A_sz = A_ed - A_st;
      int B_sz = B_ed - B_st;

      // Find a partition of merge elements
      //  Ci = merge(As[partition:], Bs[sort_md - partition:])
      //       of size N_PER_THREAD for each merge lane i
      //  C = [Ci] is sorted
      int sort_md = N_PER_THREAD * merge_lane;
      int partition = merge_partition(As, Bs, A_sz, B_sz, sort_md);

      As += partition;
      Bs += sort_md - partition;

      A_sz -= partition;
      B_sz -= sort_md - partition;

      const threadgroup IdxT* As_idx =
          ARG_SORT ? tgp_idxs + A_st + partition : nullptr;
      const threadgroup IdxT* Bs_idx =
          ARG_SORT ? tgp_idxs + B_st + sort_md - partition : nullptr;

      // Merge starting at the partition and store results in thread registers
      merge_step(As, Bs, As_idx, Bs_idx, A_sz, B_sz, thread_vals, thread_idxs);
    }

    // Write out to shared memory
    threadgroup_barrier(mem_flags::mem_threadgroup);
    for (int i = 0; i < N_PER_THREAD; ++i) {
      tgp_vals[idx + i] = thread_vals[i];
      if (ARG_SORT) {
        tgp_idxs[idx + i] = thread_idxs[i];
      }
    }
  }
};


// Kernel sort


/// Single-threadgroup sort of one segment: stage the segment in threadgroup
/// memory, sort it with BlockMergeSort, and write either the sorted values or
/// the sorted indices (ARG_SORT) to `out`.
template <
    typename T,
    typename U,
    bool ARG_SORT,
    short BLOCK_THREADS,
    short N_PER_THREAD,
    typename CompareOp = LessThan<T>>
struct KernelMergeSort {
  using ValT = T;
  using IdxT = uint;
  using block_merge_sort_t = BlockMergeSort<
      ValT,
      IdxT,
      ARG_SORT,
      BLOCK_THREADS,
      N_PER_THREAD,
      CompareOp>;

  /// Number of slots staged per threadgroup.
  MLX_MTL_CONST short N_PER_BLOCK = BLOCK_THREADS * N_PER_THREAD;

  /// @param inp, out                 input / output base pointers
  /// @param size_sorted_axis         number of elements to sort
  /// @param in_stride_sorted_axis    element stride along the sorted axis (in)
  /// @param out_stride_sorted_axis   element stride along the sorted axis (out)
  /// @param in_stride_segment_axis   offset between consecutive segments (in)
  /// @param out_stride_segment_axis  offset between consecutive segments (out)
  /// @param tgp_vals, tgp_idxs       threadgroup scratch of N_PER_BLOCK slots;
  ///                                 `tgp_idxs` is only accessed when ARG_SORT
  /// @param tid, lid                 threadgroup / thread positions
  static METAL_FUNC void block_sort(
      const device T* inp,
      device U* out,
      const constant int& size_sorted_axis,
      const constant int& in_stride_sorted_axis,
      const constant int& out_stride_sorted_axis,
      const constant int& in_stride_segment_axis,
      const constant int& out_stride_segment_axis,
      threadgroup ValT* tgp_vals,
      threadgroup IdxT* tgp_idxs,
      uint3 tid [[threadgroup_position_in_grid]],
      uint3 lid [[thread_position_in_threadgroup]]) {
    // Advance to the segment selected by tid.y
    inp += tid.y * in_stride_segment_axis;
    out += tid.y * out_stride_segment_axis;

    // Stage the segment; slots past its end get the comparator's init value
    for (short slot = lid.x; slot < N_PER_BLOCK; slot += BLOCK_THREADS) {
      if (slot < size_sorted_axis) {
        tgp_vals[slot] = inp[slot * in_stride_sorted_axis];
      } else {
        tgp_vals[slot] = ValT(CompareOp::init);
      }
      if (ARG_SORT) {
        tgp_idxs[slot] = slot;
      }
    }

    // Staging must be visible to every thread before the sort starts, and
    // the sort must be complete before results are read back.
    threadgroup_barrier(mem_flags::mem_threadgroup);
    block_merge_sort_t::sort(tgp_vals, tgp_idxs, size_sorted_axis, lid);
    threadgroup_barrier(mem_flags::mem_threadgroup);

    // Emit indices for argsort, values otherwise
    if (ARG_SORT) {
      for (int pos = lid.x; pos < size_sorted_axis; pos += BLOCK_THREADS) {
        out[pos * out_stride_sorted_axis] = tgp_idxs[pos];
      }
    } else {
      for (int pos = lid.x; pos < size_sorted_axis; pos += BLOCK_THREADS) {
        out[pos * out_stride_sorted_axis] = tgp_vals[pos];
      }
    }
  }
};


/// Kernel entry point: sorts one segment per threadgroup, with segments laid
/// out at a fixed stride. Declares the threadgroup scratch and forwards to
/// KernelMergeSort::block_sort.
template <
    typename T,
    typename U,
    bool ARG_SORT,
    short BLOCK_THREADS,
    short N_PER_THREAD>
[[kernel, max_total_threads_per_threadgroup(BLOCK_THREADS)]] void block_sort(
    const device T* inp [[buffer(0)]],
    device U* out [[buffer(1)]],
    const constant int& size_sorted_axis [[buffer(2)]],
    const constant int& in_stride_sorted_axis [[buffer(3)]],
    const constant int& out_stride_sorted_axis [[buffer(4)]],
    const constant int& in_stride_segment_axis [[buffer(5)]],
    const constant int& out_stride_segment_axis [[buffer(6)]],
    uint3 tid [[threadgroup_position_in_grid]],
    uint3 lid [[thread_position_in_threadgroup]]) {
  using sort_kernel =
      KernelMergeSort<T, U, ARG_SORT, BLOCK_THREADS, N_PER_THREAD>;
  using ValT = typename sort_kernel::ValT;
  using IdxT = typename sort_kernel::IdxT;

  // Value scratch is needed in both modes
  threadgroup ValT tgp_vals[sort_kernel::N_PER_BLOCK];

  if (ARG_SORT) {
    // Index scratch is only declared for argsort
    threadgroup IdxT tgp_idxs[sort_kernel::N_PER_BLOCK];
    sort_kernel::block_sort(
        inp,
        out,
        size_sorted_axis,
        in_stride_sorted_axis,
        out_stride_sorted_axis,
        in_stride_segment_axis,
        out_stride_segment_axis,
        tgp_vals,
        tgp_idxs,
        tid,
        lid);
  } else {
    sort_kernel::block_sort(
        inp,
        out,
        size_sorted_axis,
        in_stride_sorted_axis,
        out_stride_sorted_axis,
        in_stride_segment_axis,
        out_stride_segment_axis,
        tgp_vals,
        nullptr,
        tid,
        lid);
  }
}


// Zero living in the `constant` address space. block_sort_nc passes it for the
// `const constant int&` segment-stride parameters of
// KernelMergeSort::block_sort after applying the segment offset itself.
constant constexpr const int zero_helper = 0;


/// Kernel entry point: like block_sort, but the segment offset is computed
/// with elem_to_loc from `nc_shape` and the per-buffer stride arrays instead
/// of a single segment stride.
template <
    typename T,
    typename U,
    bool ARG_SORT,
    short BLOCK_THREADS,
    short N_PER_THREAD>
[[kernel, max_total_threads_per_threadgroup(BLOCK_THREADS)]] void block_sort_nc(
    const device T* inp [[buffer(0)]],
    device U* out [[buffer(1)]],
    const constant int& size_sorted_axis [[buffer(2)]],
    const constant int& in_stride_sorted_axis [[buffer(3)]],
    const constant int& out_stride_sorted_axis [[buffer(4)]],
    const constant int& nc_dim [[buffer(5)]],
    const constant int* nc_shape [[buffer(6)]],
    const constant int64_t* in_nc_strides [[buffer(7)]],
    const constant int64_t* out_nc_strides [[buffer(8)]],
    uint3 tid [[threadgroup_position_in_grid]],
    uint3 lid [[thread_position_in_threadgroup]]) {
  using sort_kernel =
      KernelMergeSort<T, U, ARG_SORT, BLOCK_THREADS, N_PER_THREAD>;
  using ValT = typename sort_kernel::ValT;
  using IdxT = typename sort_kernel::IdxT;

  // Apply the segment offset here; the segment strides handed to the shared
  // implementation are therefore zero.
  inp += elem_to_loc(tid.y, nc_shape, in_nc_strides, nc_dim);
  out += elem_to_loc(tid.y, nc_shape, out_nc_strides, nc_dim);

  // Value scratch is needed in both modes
  threadgroup ValT tgp_vals[sort_kernel::N_PER_BLOCK];

  if (ARG_SORT) {
    // Index scratch is only declared for argsort
    threadgroup IdxT tgp_idxs[sort_kernel::N_PER_BLOCK];
    sort_kernel::block_sort(
        inp,
        out,
        size_sorted_axis,
        in_stride_sorted_axis,
        out_stride_sorted_axis,
        zero_helper,
        zero_helper,
        tgp_vals,
        tgp_idxs,
        tid,
        lid);
  } else {
    sort_kernel::block_sort(
        inp,
        out,
        size_sorted_axis,
        in_stride_sorted_axis,
        out_stride_sorted_axis,
        zero_helper,
        zero_helper,
        tgp_vals,
        nullptr,
        tid,
        lid);
  }
}


/// Building blocks for sorting an axis that spans several threadgroups:
/// a per-block sort and a merge-partition search over device memory.
template <
    typename ValT,
    typename IdxT,
    bool ARG_SORT,
    short BLOCK_THREADS,
    short N_PER_THREAD,
    typename CompareOp = LessThan<ValT>>
struct KernelMultiBlockMergeSort {
  using block_merge_sort_t = BlockMergeSort<
      ValT,
      IdxT,
      ARG_SORT,
      BLOCK_THREADS,
      N_PER_THREAD,
      CompareOp>;

  /// Number of elements handled by one threadgroup.
  MLX_MTL_CONST short N_PER_BLOCK = BLOCK_THREADS * N_PER_THREAD;

  /// Sorts the block of N_PER_BLOCK elements selected by `tid.x` and writes
  /// the sorted values and their source indices contiguously.
  ///
  /// @param inp                 input, read with `stride_sorted_axis`
  /// @param out_vals, out_idxs  outputs, written at the block's positions
  /// @param size_sorted_axis    length of the sorted axis
  /// @param stride_sorted_axis  input element stride along the sorted axis
  /// @param tgp_vals, tgp_idxs  threadgroup scratch of N_PER_BLOCK slots
  /// @param tid, lid            threadgroup / thread positions
  static METAL_FUNC void block_sort(
      const device ValT* inp,
      device ValT* out_vals,
      device IdxT* out_idxs,
      const constant int& size_sorted_axis,
      const constant int& stride_sorted_axis,
      threadgroup ValT* tgp_vals,
      threadgroup IdxT* tgp_idxs,
      uint3 tid [[threadgroup_position_in_grid]],
      uint3 lid [[thread_position_in_threadgroup]]) {
    // First element of the block this threadgroup owns
    int block_start = tid.x * N_PER_BLOCK;

    // Stage the block; positions past the axis end get the init value
    for (short slot = lid.x; slot < N_PER_BLOCK; slot += BLOCK_THREADS) {
      int src = block_start + slot;
      if (src < size_sorted_axis) {
        tgp_vals[slot] = inp[src * stride_sorted_axis];
      } else {
        tgp_vals[slot] = ValT(CompareOp::init);
      }
      tgp_idxs[slot] = src;
    }

    // Staging must be visible before the sort, and the sort must be complete
    // before results are read back.
    threadgroup_barrier(mem_flags::mem_threadgroup);
    block_merge_sort_t::sort(tgp_vals, tgp_idxs, size_sorted_axis, lid);
    threadgroup_barrier(mem_flags::mem_threadgroup);

    // Store the in-range part of the block
    for (int slot = lid.x; slot < N_PER_BLOCK; slot += BLOCK_THREADS) {
      int dst = block_start + slot;
      if (dst >= size_sorted_axis) {
        continue;
      }
      out_vals[dst] = tgp_vals[slot];
      out_idxs[dst] = tgp_idxs[slot];
    }
  }

  /// Binary-searches the offset into `As` at which the first `sort_md`
  /// elements of merge(As, Bs) end. Device-memory counterpart of
  /// BlockMergeSort::merge_partition, using `int` sizes.
  ///
  /// @param As, Bs      the two sorted runs
  /// @param A_sz, B_sz  their lengths
  /// @param sort_md     position in the merged output to split at
  /// @return offset into `As` of the split point
  static METAL_FUNC int merge_partition(
      const device ValT* As,
      const device ValT* Bs,
      int A_sz,
      int B_sz,
      int sort_md) {
    CompareOp op;

    int lo = max(0, sort_md - B_sz);
    int hi = min(sort_md, A_sz);

    while (lo < hi) {
      int mid = lo + (hi - lo) / 2;
      auto from_a = As[mid];
      auto from_b = Bs[sort_md - 1 - mid];

      // Shrink from the right only when the B element must precede the A
      // element; otherwise the split lies to the right of `mid`.
      if (op(from_b, from_a)) {
        hi = mid;
      } else {
        lo = mid + 1;
      }
    }

    return hi;
  }
};


/// Kernel entry point for the first phase of the multi-block sort: each
/// threadgroup sorts one block of one row and stores values and indices to
/// contiguous per-row outputs.
template <
    typename ValT,
    typename IdxT,
    bool ARG_SORT,
    short BLOCK_THREADS,
    short N_PER_THREAD>
[[kernel, max_total_threads_per_threadgroup(BLOCK_THREADS)]] void mb_block_sort(
    const device ValT* inp [[buffer(0)]],
    device ValT* out_vals [[buffer(1)]],
    device IdxT* out_idxs [[buffer(2)]],
    const constant int& size_sorted_axis [[buffer(3)]],
    const constant int& stride_sorted_axis [[buffer(4)]],
    const constant int& nc_dim [[buffer(5)]],
    const constant int* nc_shape [[buffer(6)]],
    const constant int64_t* nc_strides [[buffer(7)]],
    uint3 tid [[threadgroup_position_in_grid]],
    uint3 lid [[thread_position_in_threadgroup]]) {
  using sort_kernel = KernelMultiBlockMergeSort<
      ValT,
      IdxT,
      ARG_SORT,
      BLOCK_THREADS,
      N_PER_THREAD>;

  // Input rows are located through shape/strides; output rows are packed
  // back to back, size_sorted_axis elements each.
  inp += elem_to_loc(tid.y, nc_shape, nc_strides, nc_dim);
  const auto out_row_offset = tid.y * size_sorted_axis;
  out_vals += out_row_offset;
  out_idxs += out_row_offset;

  threadgroup ValT tgp_vals[sort_kernel::N_PER_BLOCK];
  threadgroup IdxT tgp_idxs[sort_kernel::N_PER_BLOCK];

  sort_kernel::block_sort(
      inp,
      out_vals,
      out_idxs,
      size_sorted_axis,
      stride_sorted_axis,
      tgp_vals,
      tgp_idxs,
      tid,
      lid);
}


/// Kernel entry point for the partition phase of the multi-block sort.
///
/// For every block boundary i in [0, n_blocks] of the row selected by
/// `tid.y`, computes where the boundary falls inside the "A" run of its
/// merge group and stores that absolute position in `block_partitions[i]`.
///
/// @param block_partitions  output, n_blocks + 1 entries per row
/// @param dev_vals          block-sorted values, size_sorted_axis per row
/// @param dev_idxs          matching indices (only offset here, never read)
/// @param size_sorted_axis  length of the sorted axis
/// @param merge_tiles       number of blocks in one merge group
/// @param n_blocks          number of blocks per row
/// @param tid, lid          threadgroup / thread positions
/// @param tgp_dims          threads per threadgroup (x is the loop stride)
template <
    typename ValT,
    typename IdxT,
    bool ARG_SORT,
    short BLOCK_THREADS,
    short N_PER_THREAD>
[[kernel]] void mb_block_partition(
    device IdxT* block_partitions [[buffer(0)]],
    const device ValT* dev_vals [[buffer(1)]],
    const device IdxT* dev_idxs [[buffer(2)]],
    const constant int& size_sorted_axis [[buffer(3)]],
    const constant int& merge_tiles [[buffer(4)]],
    const constant int& n_blocks [[buffer(5)]],
    uint3 tid [[threadgroup_position_in_grid]],
    uint3 lid [[thread_position_in_threadgroup]],
    uint3 tgp_dims [[threads_per_threadgroup]]) {
  using sort_kernel = KernelMultiBlockMergeSort<
      ValT,
      IdxT,
      ARG_SORT,
      BLOCK_THREADS,
      N_PER_THREAD>;

  // Each row holds n_blocks + 1 partition entries (the loop below writes
  // indices 0..n_blocks), and mb_block_merge reads rows with stride
  // num_tiles + 1. Using the threadgroup width as the row stride would make
  // rows overlap whenever the threadgroup is narrower than n_blocks + 1.
  // NOTE(review): assumes the host passes the same block count here and as
  // `num_tiles` to mb_block_merge -- confirm against the dispatch code.
  block_partitions += tid.y * (n_blocks + 1);
  dev_vals += tid.y * size_sorted_axis;
  dev_idxs += tid.y * size_sorted_axis;

  for (int i = lid.x; i <= n_blocks; i += tgp_dims.x) {
    // Find location in merge step
    int merge_group = i / merge_tiles;
    int merge_lane = i % merge_tiles;

    int sort_sz = sort_kernel::N_PER_BLOCK * merge_tiles;
    int sort_st = sort_kernel::N_PER_BLOCK * merge_tiles * merge_group;

    // Clamp both runs of the merge group to the end of the axis
    int A_st = min(size_sorted_axis, sort_st);
    int A_ed = min(size_sorted_axis, sort_st + sort_sz / 2);
    int B_st = A_ed;
    int B_ed = min(size_sorted_axis, B_st + sort_sz / 2);

    // Position of this block boundary within the merged output of the group
    int partition_at = min(B_ed - A_st, sort_kernel::N_PER_BLOCK * merge_lane);
    int partition = sort_kernel::merge_partition(
        dev_vals + A_st,
        dev_vals + B_st,
        A_ed - A_st,
        B_ed - B_st,
        partition_at);

    block_partitions[i] = A_st + partition;
  }
}


/// Kernel entry point for the merge phase of the multi-block sort.
///
/// The threadgroup selected by `tid.x` produces one output block of
/// N_PER_BLOCK elements for the row selected by `tid.y`. It gathers the A and
/// B sub-ranges delimited by `block_partitions`, stages them back to back in
/// threadgroup memory, merges them, and writes the block to the outputs.
template <
    typename ValT,
    typename IdxT,
    bool ARG_SORT,
    short BLOCK_THREADS,
    short N_PER_THREAD,
    typename CompareOp = LessThan<ValT>>
[[kernel, max_total_threads_per_threadgroup(BLOCK_THREADS)]] void
mb_block_merge(
    const device IdxT* block_partitions [[buffer(0)]],
    const device ValT* dev_vals_in [[buffer(1)]],
    const device IdxT* dev_idxs_in [[buffer(2)]],
    device ValT* dev_vals_out [[buffer(3)]],
    device IdxT* dev_idxs_out [[buffer(4)]],
    const constant int& size_sorted_axis [[buffer(5)]],
    const constant int& merge_tiles [[buffer(6)]],
    const constant int& num_tiles [[buffer(7)]],
    uint3 tid [[threadgroup_position_in_grid]],
    uint3 lid [[thread_position_in_threadgroup]]) {
  using sort_kernel = KernelMultiBlockMergeSort<
      ValT,
      IdxT,
      ARG_SORT,
      BLOCK_THREADS,
      N_PER_THREAD,
      CompareOp>;
  using block_sort_t = typename sort_kernel::block_merge_sort_t;

  // Move every pointer to the row handled by this threadgroup
  block_partitions += tid.y * (num_tiles + 1);
  const auto row_offset = tid.y * size_sorted_axis;
  dev_vals_in += row_offset;
  dev_idxs_in += row_offset;
  dev_vals_out += row_offset;
  dev_idxs_out += row_offset;

  // Locate this block inside its merge group
  int block_idx = tid.x;
  int merge_group = block_idx / merge_tiles;
  int sort_sz = sort_kernel::N_PER_BLOCK * merge_tiles;
  int sort_st = sort_sz * merge_group;
  int sort_md = sort_kernel::N_PER_BLOCK * block_idx - sort_st;
  int half_sz = sort_sz / 2;

  // The A range comes straight from the partition table; the B range is
  // derived from it and clamped to the end of the axis.
  int A_st = block_partitions[block_idx + 0];
  int A_ed = block_partitions[block_idx + 1];
  int B_st = min(size_sorted_axis, 2 * sort_st + half_sz + sort_md - A_st);
  int B_ed = min(
      size_sorted_axis,
      2 * sort_st + half_sz + sort_md + sort_kernel::N_PER_BLOCK - A_ed);

  // The last block of a merge group takes whatever is left of both runs
  if ((block_idx % merge_tiles) == merge_tiles - 1) {
    A_ed = min(size_sorted_axis, sort_st + half_sz);
    B_ed = min(size_sorted_axis, sort_st + sort_sz);
  }

  int A_sz = A_ed - A_st;
  int B_sz = B_ed - B_st;
  int total_sz = A_sz + B_sz;

  // Gather A followed by B into registers; unused slots get the init value
  thread ValT thread_vals[N_PER_THREAD];
  thread IdxT thread_idxs[N_PER_THREAD];
  for (int i = 0; i < N_PER_THREAD; i++) {
    int slot = BLOCK_THREADS * i + lid.x;
    if (slot < total_sz) {
      int src = (slot < A_sz) ? A_st + slot : B_st + slot - A_sz;
      thread_vals[i] = dev_vals_in[src];
      thread_idxs[i] = dev_idxs_in[src];
    } else {
      thread_vals[i] = CompareOp::init;
      thread_idxs[i] = 0;
    }
  }

  // Stage the gathered elements in threadgroup memory
  threadgroup ValT tgp_vals[sort_kernel::N_PER_BLOCK];
  threadgroup IdxT tgp_idxs[sort_kernel::N_PER_BLOCK];
  threadgroup_barrier(mem_flags::mem_threadgroup);
  for (int i = 0; i < N_PER_THREAD; i++) {
    int slot = BLOCK_THREADS * i + lid.x;
    tgp_vals[slot] = thread_vals[i];
    tgp_idxs[slot] = thread_idxs[i];
  }
  threadgroup_barrier(mem_flags::mem_threadgroup);

  // Find where this thread's N_PER_THREAD outputs start in each run
  int sort_md_local = min(total_sz, N_PER_THREAD * int(lid.x));
  int a_begin = block_sort_t::merge_partition(
      tgp_vals, tgp_vals + A_sz, A_sz, B_sz, sort_md_local);
  int b_begin = sort_md_local - a_begin;

  // Merge into registers; the B run sits right after the A run
  block_sort_t::merge_step(
      tgp_vals + a_begin,
      tgp_vals + A_sz + b_begin,
      tgp_idxs + a_begin,
      tgp_idxs + A_sz + b_begin,
      A_sz - a_begin,
      B_sz - b_begin,
      thread_vals,
      thread_idxs);

  // All threads must finish reading the staged runs before they are
  // overwritten with the merged result.
  threadgroup_barrier(mem_flags::mem_threadgroup);
  int thread_base = lid.x * N_PER_THREAD;
  for (int i = 0; i < N_PER_THREAD; ++i) {
    tgp_vals[thread_base + i] = thread_vals[i];
    tgp_idxs[thread_base + i] = thread_idxs[i];
  }
  threadgroup_barrier(mem_flags::mem_threadgroup);

  // Store the in-range part of the merged block
  int base_idx = tid.x * sort_kernel::N_PER_BLOCK;
  for (int slot = lid.x; slot < sort_kernel::N_PER_BLOCK;
       slot += BLOCK_THREADS) {
    int dst = base_idx + slot;
    if (dst >= size_sorted_axis) {
      continue;
    }
    dev_vals_out[dst] = tgp_vals[slot];
    dev_idxs_out[dst] = tgp_idxs[slot];
  }
}


elem_to_loc
METAL_FUNC IdxT elem_to_loc(IdxT elem, constant const int *shape, constant const int64_t *strides, int ndim)
Definition utils.h:93

metal
Definition bf16_math.h:226

metal::min
METAL_FUNC bfloat16_t min(bfloat16_t x, bfloat16_t y)
Definition bf16_math.h:232

metal::max
METAL_FUNC bfloat16_t max(bfloat16_t x, bfloat16_t y)
Definition bf16_math.h:232

MLX_MTL_CONST
#define MLX_MTL_CONST
Definition sort.h:3

thread_swap
METAL_FUNC void thread_swap(thread T &a, thread T &b)
Definition sort.h:16

mb_block_partition
void mb_block_partition(device IdxT *block_partitions, const device ValT *dev_vals, const device IdxT *dev_idxs, const constant int &size_sorted_axis, const constant int &merge_tiles, const constant int &n_blocks, uint3 tid, uint3 lid, uint3 tgp_dims)
Definition sort.h:524

block_sort
void block_sort(const device T *inp, device U *out, const constant int &size_sorted_axis, const constant int &in_stride_sorted_axis, const constant int &out_stride_sorted_axis, const constant int &in_stride_segment_axis, const constant int &out_stride_segment_axis, uint3 tid, uint3 lid)
Definition sort.h:282

mb_block_merge
void mb_block_merge(const device IdxT *block_partitions, const device ValT *dev_vals_in, const device IdxT *dev_idxs_in, device ValT *dev_vals_out, device IdxT *dev_idxs_out, const constant int &size_sorted_axis, const constant int &merge_tiles, const constant int &num_tiles, uint3 tid, uint3 lid)
Definition sort.h:578

zero_helper
constant constexpr const int zero_helper
Definition sort.h:329

mb_block_sort
void mb_block_sort(const device ValT *inp, device ValT *out_vals, device IdxT *out_idxs, const constant int &size_sorted_axis, const constant int &stride_sorted_axis, const constant int &nc_dim, const constant int *nc_shape, const constant int64_t *nc_strides, uint3 tid, uint3 lid)
Definition sort.h:480

block_sort_nc
void block_sort_nc(const device T *inp, device U *out, const constant int &size_sorted_axis, const constant int &in_stride_sorted_axis, const constant int &out_stride_sorted_axis, const constant int &nc_dim, const constant int *nc_shape, const constant int64_t *in_nc_strides, const constant int64_t *out_nc_strides, uint3 tid, uint3 lid)
Definition sort.h:337

MLX_MTL_LOOP_UNROLL
#define MLX_MTL_LOOP_UNROLL
Definition sort.h:4

BlockMergeSort
Definition sort.h:66

BlockMergeSort::merge_step
static METAL_FUNC void merge_step(const threadgroup ValT *As, const threadgroup ValT *Bs, const threadgroup IdxT *As_idx, const threadgroup IdxT *Bs_idx, short A_sz, short B_sz, thread ValT(&vals)[N_PER_THREAD], thread IdxT(&idxs)[N_PER_THREAD])
Definition sort.h:95

BlockMergeSort::sort
static METAL_FUNC void sort(threadgroup ValT *tgp_vals, threadgroup IdxT *tgp_idxs, int size_sorted_axis, uint3 lid)
Definition sort.h:121

BlockMergeSort::merge_partition
static METAL_FUNC int merge_partition(const threadgroup ValT *As, const threadgroup ValT *Bs, short A_sz, short B_sz, short sort_md)
Definition sort.h:69

BlockMergeSort::thread_sort_t
ThreadSort< ValT, IdxT, ARG_SORT, N_PER_THREAD, CompareOp > thread_sort_t
Definition sort.h:67

KernelMergeSort
Definition sort.h:220

KernelMergeSort::block_merge_sort_t
BlockMergeSort< ValT, IdxT, ARG_SORT, BLOCK_THREADS, N_PER_THREAD, CompareOp > block_merge_sort_t
Definition sort.h:223

KernelMergeSort::IdxT
uint IdxT
Definition sort.h:222

KernelMergeSort::block_sort
static METAL_FUNC void block_sort(const device T *inp, device U *out, const constant int &size_sorted_axis, const constant int &in_stride_sorted_axis, const constant int &out_stride_sorted_axis, const constant int &in_stride_segment_axis, const constant int &out_stride_segment_axis, threadgroup ValT *tgp_vals, threadgroup IdxT *tgp_idxs, uint3 tid, uint3 lid)
Definition sort.h:233

KernelMergeSort::ValT
T ValT
Definition sort.h:221

KernelMergeSort::N_PER_BLOCK
static constant constexpr const short N_PER_BLOCK
Definition sort.h:231

KernelMultiBlockMergeSort
Definition sort.h:398

KernelMultiBlockMergeSort::N_PER_BLOCK
static constant constexpr const short N_PER_BLOCK
Definition sort.h:407

KernelMultiBlockMergeSort::block_sort
static METAL_FUNC void block_sort(const device ValT *inp, device ValT *out_vals, device IdxT *out_idxs, const constant int &size_sorted_axis, const constant int &stride_sorted_axis, threadgroup ValT *tgp_vals, threadgroup IdxT *tgp_idxs, uint3 tid, uint3 lid)
Definition sort.h:409

KernelMultiBlockMergeSort::merge_partition
static METAL_FUNC int merge_partition(const device ValT *As, const device ValT *Bs, int A_sz, int B_sz, int sort_md)
Definition sort.h:447

KernelMultiBlockMergeSort::block_merge_sort_t
BlockMergeSort< ValT, IdxT, ARG_SORT, BLOCK_THREADS, N_PER_THREAD, CompareOp > block_merge_sort_t
Definition sort.h:399

LessThan
Definition sort.h:23

LessThan::operator()
METAL_FUNC bool operator()(T a, T b)
Definition sort.h:26

LessThan::init
static constexpr constant T init
Definition sort.h:24

Limits::max
static const constant U max
Definition utils.h:24

ThreadSort
Definition sort.h:37

ThreadSort::sort
static METAL_FUNC void sort(thread ValT(&vals)[N_PER_THREAD], thread IdxT(&idxs)[N_PER_THREAD])
Definition sort.h:38