Faster gather and scatter. (#682)

Reduce unnecessary integer ops, especially since these kernels are integer bound. Increase number of iterations for benchmarks for better smoothing. GitHub Issue #506. Co-authored-by: Vijay Krishnamoorthy <vijay_krish@apple.com>
2025-09-18 18:28:12 +08:00 · 2024-02-13 17:47:41 -08:00
parent be6e9d6a9f
commit 2fdc2462c3
2 changed files with 3 additions and 3 deletions
--- a/mlx/backend/metal/kernels/utils.h
+++ b/mlx/backend/metal/kernels/utils.h
@@ -71,7 +71,7 @@ inline size_t elem_to_loc(
    device const size_t* strides,
    int ndim) {
  size_t loc = 0;
-  for (int i = ndim - 1; i >= 0; --i) {
+  for (int i = ndim - 1; i >= 0 && elem > 0; --i) {
    loc += (elem % shape[i]) * strides[i];
    elem /= shape[i];
  }
@@ -84,7 +84,7 @@ inline size_t elem_to_loc(
    constant const size_t* strides,
    int ndim) {
  size_t loc = 0;
-  for (int i = ndim - 1; i >= 0; --i) {
+  for (int i = ndim - 1; i >= 0 && elem > 0; --i) {
    loc += (elem % shape[i]) * strides[i];
    elem /= shape[i];
  }