mlx/mlx/backend/metal/kernels/gather.h


// Copyright © 2024 Apple Inc.

#pragma once

#include "mlx/backend/metal/kernels/indexing.h"
// Gather elements from `src` into `out` according to the NIDX index arrays
// described by `indices`. Each thread handles one element of one gathered
// slice: index.x / index.y select the index element and index.z selects the
// position within the slice defined by `slice_sizes`.
template <typename T, typename IdxT, int NIDX, int IDX_NDIM, typename LocT>
METAL_FUNC void gather_impl(
    const device T* src [[buffer(0)]],
    device T* out [[buffer(1)]],
    const constant int* src_shape [[buffer(2)]],
    const constant size_t* src_strides [[buffer(3)]],
    const constant size_t& src_ndim [[buffer(4)]],
    const constant int* slice_sizes [[buffer(5)]],
    const constant int* axes [[buffer(6)]],
    const thread Indices<IdxT, NIDX>& indices,
    uint3 index [[thread_position_in_grid]],
    uint3 grid_dim [[threads_per_grid]]) {
  // Accumulate the source offset contributed by each of the NIDX index
  // arrays.
  LocT src_idx = 0;
  for (int i = 0; i < NIDX; ++i) {
    // Locate this thread's entry in the i-th index array: index.x walks the
    // first index dimension and index.y covers any remaining dimensions.
    LocT idx_loc;
    if (IDX_NDIM == 0) {
      idx_loc = 0;
    } else if (IDX_NDIM == 1) {
      idx_loc = index.x * static_cast<LocT>(indices.strides[indices.ndim * i]);
    } else {
      idx_loc = index.x * static_cast<LocT>(indices.strides[indices.ndim * i]);
      idx_loc += indices.row_contiguous[i]
          ? index.y
          : elem_to_loc<size_t, LocT>(
                index.y,
                &indices.shapes[indices.ndim * i + 1],
                &indices.strides[indices.ndim * i + 1],
                indices.ndim - 1);
    }
    auto ax = axes[i];
    // Wrap negative indices into [0, src_shape[ax]) and advance the source
    // offset along the gathered axis.
    auto idx_val = offset_neg_idx(indices.buffers[i][idx_loc], src_shape[ax]);
    src_idx += static_cast<LocT>(idx_val) * static_cast<LocT>(src_strides[ax]);
  }

  // index.z enumerates the elements of the gathered slice; map it to a
  // location in the source using the slice sizes and source strides.
  auto src_offset =
      elem_to_loc<size_t, LocT>(index.z, slice_sizes, src_strides, src_ndim);

  // The output is written contiguously: each index element owns a block of
  // grid_dim.z (the slice size) consecutive values.
  LocT out_idx = index.z;
  if (IDX_NDIM == 1) {
    // One index dimension: index.x is the linear index-element id.
    out_idx += static_cast<LocT>(grid_dim.z) * index.x;
  } else if (IDX_NDIM >= 2) {
    // Two or more index dimensions: linearize (index.x, index.y) first.
    out_idx += grid_dim.z * (index.x * static_cast<LocT>(grid_dim.y) + index.y);
  }
  out[out_idx] = src[src_offset + src_idx];
}
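
// ---------------------------------------------------------------------------
// Illustrative sketch only (not part of the original header): a hypothetical
// [[kernel]] entry point showing how gather_impl might be wired up for a
// single index array (NIDX = 1) with 1-D indices (IDX_NDIM = 1). The real
// MLX entry points are generated elsewhere in the Metal backend; the buffer
// slots used for the index metadata, the aggregate field order of Indices,
// and the name gather_1idx_1d are assumptions made for illustration.
//
// template <typename T, typename IdxT, typename LocT>
// [[kernel]] void gather_1idx_1d(
//     const device T* src [[buffer(0)]],
//     device T* out [[buffer(1)]],
//     const constant int* src_shape [[buffer(2)]],
//     const constant size_t* src_strides [[buffer(3)]],
//     const constant size_t& src_ndim [[buffer(4)]],
//     const constant int* slice_sizes [[buffer(5)]],
//     const constant int* axes [[buffer(6)]],
//     const constant int* idx_shapes [[buffer(7)]],      // hypothetical slot
//     const constant size_t* idx_strides [[buffer(8)]],  // hypothetical slot
//     const constant bool* idx_contiguous [[buffer(9)]], // hypothetical slot
//     const constant int& idx_ndim [[buffer(10)]],       // hypothetical slot
//     const device IdxT* idx [[buffer(11)]],             // hypothetical slot
//     uint3 index [[thread_position_in_grid]],
//     uint3 grid_dim [[threads_per_grid]]) {
//   // Assumed field order; see Indices in indexing.h for the real layout.
//   Indices<IdxT, 1> indices{
//       {{idx}}, idx_shapes, idx_strides, idx_contiguous, idx_ndim};
//   gather_impl<T, IdxT, 1, 1, LocT>(
//       src, out, src_shape, src_strides, src_ndim, slice_sizes, axes,
//       indices, index, grid_dim);
// }
//
// A dispatch for such a kernel would use a 3-D grid of
// (n_index_elements, 1, slice_size) threads, matching how index.x and
// index.z are consumed by gather_impl above.
// ---------------------------------------------------------------------------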